768 files changed, 28193 insertions, 14141 deletions
diff --git a/Documentation/auxdisplay/cfag12864b-example.c b/Documentation/auxdisplay/cfag12864b-example.c
index 1d2c010bae1..e7823ffb1ca 100644
--- a/Documentation/auxdisplay/cfag12864b-example.c
+++ b/Documentation/auxdisplay/cfag12864b-example.c
@@ -194,7 +194,6 @@ static void cfag12864b_blit(void)
  */
 
 #include <stdio.h>
-#include <string.h>
 
 #define EXAMPLES	6
 
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 6eb1a97e88c..455d4e6d346 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -408,6 +408,26 @@ You can attach the current shell task by echoing 0:
 
 # echo 0 > tasks
 
+2.3 Mounting hierarchies by name
+--------------------------------
+
+Passing the name=<x> option when mounting a cgroups hierarchy
+associates the given name with the hierarchy.  This can be used when
+mounting a pre-existing hierarchy, in order to refer to it by name
+rather than by its set of active subsystems.  Each hierarchy is either
+nameless, or has a unique name.
+
+The name should match [\w.-]+
+
+When passing a name=<x> option for a new hierarchy, you need to
+specify subsystems manually; the legacy behaviour of mounting all
+subsystems when none are explicitly specified is not supported when
+you give a subsystem a name.
+
+The name of the subsystem appears as part of the hierarchy description
+in /proc/mounts and /proc/<pid>/cgroups.
+
+
 3. Kernel API
 =============
 
@@ -501,7 +521,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
 called multiple times against a cgroup.
 
 int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-	       struct task_struct *task)
+	       struct task_struct *task, bool threadgroup)
 (cgroup_mutex held by caller)
 
 Called prior to moving a task into a cgroup; if the subsystem
@@ -509,14 +529,20 @@ returns an error, this will abort the attach operation.  If a NULL
 task is passed, then a successful result indicates that *any*
 unspecified task can be moved into the cgroup. Note that this isn't
 called on a fork. If this method returns 0 (success) then this should
-remain valid while the caller holds cgroup_mutex.
+remain valid while the caller holds cgroup_mutex. If threadgroup is
+true, then a successful result indicates that all threads in the given
+thread's threadgroup can be moved together.
 
 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-	    struct cgroup *old_cgrp, struct task_struct *task)
+	    struct cgroup *old_cgrp, struct task_struct *task,
+	    bool threadgroup)
 (cgroup_mutex held by caller)
 
 Called after the task has been attached to the cgroup, to allow any
 post-attachment activity that requires memory allocations or blocking.
+If threadgroup is true, the subsystem should take care of all threads
+in the specified thread's threadgroup. Currently does not support any
+subsystem that might need the old_cgrp for every thread in the group.
 
 void fork(struct cgroup_subsy *ss, struct task_struct *task)
 
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 23d1262c077..b871f2552b4 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -179,6 +179,9 @@ The reclaim algorithm has not been modified for cgroups, except that
 pages that are selected for reclaiming come from the per cgroup LRU
 list.
 
+NOTE: Reclaim does not work for the root cgroup, since we cannot set any
+limits on the root cgroup.
+
 2. Locking
 
 The memory controller uses the following hierarchy
@@ -210,6 +213,7 @@ We can alter the memory limit:
 NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
 mega or gigabytes.
 NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
+NOTE: We cannot set limits on the root cgroup any more.
 
 # cat /cgroups/0/memory.limit_in_bytes
 4194304
@@ -375,7 +379,42 @@ cgroups created below it.
 
 NOTE2: This feature can be enabled/disabled per subtree.
 
-7. TODO
+7. Soft limits
+
+Soft limits allow for greater sharing of memory. The idea behind soft limits
+is to allow control groups to use as much of the memory as needed, provided
+
+a. There is no memory contention
+b. They do not exceed their hard limit
+
+When the system detects memory contention or low memory control groups
+are pushed back to their soft limits. If the soft limit of each control
+group is very high, they are pushed back as much as possible to make
+sure that one control group does not starve the others of memory.
+
+Please note that soft limits is a best effort feature, it comes with
+no guarantees, but it does its best to make sure that when memory is
+heavily contended for, memory is allocated based on the soft limit
+hints/setup. Currently soft limit based reclaim is setup such that
+it gets invoked from balance_pgdat (kswapd).
+
+7.1 Interface
+
+Soft limits can be setup by using the following commands (in this example we
+assume a soft limit of 256 megabytes)
+
+# echo 256M > memory.soft_limit_in_bytes
+
+If we want to change this to 1G, we can at any time use
+
+# echo 1G > memory.soft_limit_in_bytes
+
+NOTE1: Soft limits take effect over a long period of time, since they involve
+       reclaiming memory for balancing between memory cgroups
+NOTE2: It is recommended to set the soft limit always below the hard limit,
+       otherwise the hard limit will take precedence.
+
+8. TODO
 
 1. Add support for accounting huge pages (as a separate controller)
 2. Make per-cgroup scanner reclaim not-shared pages first
diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt
index 9f59fcbf5d8..ba046b8fa92 100644
--- a/Documentation/crypto/async-tx-api.txt
+++ b/Documentation/crypto/async-tx-api.txt
@@ -54,20 +54,23 @@ features surfaced as a result:
 
 3.1 General format of the API:
 struct dma_async_tx_descriptor *
-async_<operation>(<op specific parameters>,
-		  enum async_tx_flags flags,
-        	  struct dma_async_tx_descriptor *dependency,
-        	  dma_async_tx_callback callback_routine,
-		  void *callback_parameter);
+async_<operation>(<op specific parameters>, struct async_submit ctl *submit)
 
 3.2 Supported operations:
-memcpy       - memory copy between a source and a destination buffer
-memset       - fill a destination buffer with a byte value
-xor          - xor a series of source buffers and write the result to a
-	       destination buffer
-xor_zero_sum - xor a series of source buffers and set a flag if the
-	       result is zero.  The implementation attempts to prevent
-	       writes to memory
+memcpy  - memory copy between a source and a destination buffer
+memset  - fill a destination buffer with a byte value
+xor     - xor a series of source buffers and write the result to a
+	  destination buffer
+xor_val - xor a series of source buffers and set a flag if the
+	  result is zero.  The implementation attempts to prevent
+	  writes to memory
+pq	- generate the p+q (raid6 syndrome) from a series of source buffers
+pq_val  - validate that a p and or q buffer are in sync with a given series of
+	  sources
+datap	- (raid6_datap_recov) recover a raid6 data block and the p block
+	  from the given sources
+2data	- (raid6_2data_recov) recover 2 raid6 data blocks from the given
+	  sources
 
 3.3 Descriptor management:
 The return value is non-NULL and points to a 'descriptor' when the operation
@@ -80,8 +83,8 @@ acknowledged by the application before the offload engine driver is allowed to
 recycle (or free) the descriptor.  A descriptor can be acked by one of the
 following methods:
 1/ setting the ASYNC_TX_ACK flag if no child operations are to be submitted
-2/ setting the ASYNC_TX_DEP_ACK flag to acknowledge the parent
-   descriptor of a new operation.
+2/ submitting an unacknowledged descriptor as a dependency to another
+   async_tx call will implicitly set the acknowledged state.
 3/ calling async_tx_ack() on the descriptor.
 
 3.4 When does the operation execute?
@@ -119,30 +122,42 @@ of an operation.
 Perform a xor->copy->xor operation where each operation depends on the
 result from the previous operation:
 
-void complete_xor_copy_xor(void *param)
+void callback(void *param)
 {
-	printk("complete\n");
+	struct completion *cmp = param;
+
+	complete(cmp);
 }
 
-int run_xor_copy_xor(struct page **xor_srcs,
-		     int xor_src_cnt,
-		     struct page *xor_dest,
-		     size_t xor_len,
-		     struct page *copy_src,
-		     struct page *copy_dest,
-		     size_t copy_len)
+void run_xor_copy_xor(struct page **xor_srcs,
+		      int xor_src_cnt,
+		      struct page *xor_dest,
+		      size_t xor_len,
+		      struct page *copy_src,
+		      struct page *copy_dest,
+		      size_t copy_len)
 {
 	struct dma_async_tx_descriptor *tx;
+	addr_conv_t addr_conv[xor_src_cnt];
+	struct async_submit_ctl submit;
+	addr_conv_t addr_conv[NDISKS];
+	struct completion cmp;
+
+	init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL,
+			  addr_conv);
+	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit)
 
-	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
-		       ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL);
-	tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len,
-			  ASYNC_TX_DEP_ACK, tx, NULL, NULL);
-	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
-		       ASYNC_TX_XOR_DROP_DST | ASYNC_TX_DEP_ACK | ASYNC_TX_ACK,
-		       tx, complete_xor_copy_xor, NULL);
+	submit->depend_tx = tx;
+	tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, &submit);
+
+	init_completion(&cmp);
+	init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST | ASYNC_TX_ACK, tx,
+			  callback, &cmp, addr_conv);
+	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit);
 
 	async_tx_issue_pending_all();
+
+	wait_for_completion(&cmp);
 }
 
 See include/linux/async_tx.h for more information on the flags.  See the
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index fa75220f8d3..89a47b5aff0 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -354,14 +354,6 @@ Who:  Krzysztof Piotr Oledzki <ole@ans.pl>
 
 ---------------------------
 
-What:	fscher and fscpos drivers
-When:	June 2009
-Why:	Deprecated by the new fschmd driver.
-Who:	Hans de Goede <hdegoede@redhat.com>
-	Jean Delvare <khali@linux-fr.org>
-
----------------------------
-
 What:	sysfs ui for changing p4-clockmod parameters
 When:	September 2009
 Why:	See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index 6208f55c44c..57e0b80a527 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -18,11 +18,11 @@ the 9p client is available in the form of a USENIX paper:
 
 Other applications are described in the following papers:
 	* XCPU & Clustering
-		http://www.xcpu.org/xcpu-talk.pdf
+		http://xcpu.org/papers/xcpu-talk.pdf
 	* KVMFS: control file system for KVM
-		http://www.xcpu.org/kvmfs.pdf
-	* CellFS: A New ProgrammingModel for the Cell BE
-		http://www.xcpu.org/cellfs-talk.pdf
+		http://xcpu.org/papers/kvmfs.pdf
+	* CellFS: A New Programming Model for the Cell BE
+		http://xcpu.org/papers/cellfs-talk.pdf
 	* PROSE I/O: Using 9p to enable Application Partitions
 		http://plan9.escet.urjc.es/iwp9/cready/PROSE_iwp9_2006.pdf
 
@@ -48,6 +48,7 @@ OPTIONS
                                 (see rfdno and wfdno)
 			virtio	- connect to the next virtio channel available
 				(from lguest or KVM with trans_virtio module)
+			rdma	- connect to a specified RDMA channel
 
   uname=name	user name to attempt mount as on the remote server.  The
   		server may override or ignore this value.  Certain user
@@ -59,16 +60,22 @@ OPTIONS
   cache=mode	specifies a caching policy.  By default, no caches are used.
 			loose = no attempts are made at consistency,
                                 intended for exclusive, read-only mounts
+			fscache = use FS-Cache for a persistent, read-only
+				cache backend.
 
   debug=n	specifies debug level.  The debug level is a bitmask.
-  			0x01 = display verbose error messages
-			0x02 = developer debug (DEBUG_CURRENT)
-			0x04 = display 9p trace
-			0x08 = display VFS trace
-			0x10 = display Marshalling debug
-			0x20 = display RPC debug
-			0x40 = display transport debug
-			0x80 = display allocation debug
+			0x01  = display verbose error messages
+			0x02  = developer debug (DEBUG_CURRENT)
+			0x04  = display 9p trace
+			0x08  = display VFS trace
+			0x10  = display Marshalling debug
+			0x20  = display RPC debug
+			0x40  = display transport debug
+			0x80  = display allocation debug
+			0x100 = display protocol message debug
+			0x200 = display Fid debug
+			0x400 = display packet debug
+			0x800 = display fscache tracing debug
 
   rfdno=n	the file descriptor for reading with trans=fd
 
@@ -100,6 +107,10 @@ OPTIONS
 			any   = v9fs does single attach and performs all
 				operations as one user
 
+  cachetag	cache tag to use the specified persistent cache.
+		cache tags for existing cache sessions can be listed at
+		/sys/fs/9p/caches. (applies only to cache=fscache)
+
 RESOURCES
 =========
 
@@ -118,7 +129,7 @@ and export.
 A Linux version of the 9p server is now maintained under the npfs project
 on sourceforge (http://sourceforge.net/projects/npfs).  The currently
 maintained version is the single-threaded version of the server (named spfs)
-available from the same CVS repository.
+available from the same SVN repository.
 
 There are user and developer mailing lists available through the v9fs project
 on sourceforge (http://sourceforge.net/projects/v9fs).
@@ -126,7 +137,8 @@ on sourceforge (http://sourceforge.net/projects/v9fs).
 A stand-alone version of the module (which should build for any 2.6 kernel)
 is available via (http://github.com/ericvh/9p-sac/tree/master)
 
-News and other information is maintained on SWiK (http://swik.net/v9fs).
+News and other information is maintained on SWiK (http://swik.net/v9fs)
+and the Wiki (http://sf.net/apps/mediawiki/v9fs/index.php).
 
 Bug reports may be issued through the kernel.org bugzilla 
 (http://bugzilla.kernel.org)
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index 736540045dc..23a181074f9 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -4,7 +4,7 @@ Shared Subtrees
 Contents:
 	1) Overview
 	2) Features
-	3) smount command
+	3) Setting mount states
 	4) Use-case
 	5) Detailed semantics
 	6) Quiz
@@ -41,14 +41,14 @@ replicas continue to be exactly same.
 
 	Here is an example:
 
-	Lets say /mnt has a mount that is shared.
+	Let's say /mnt has a mount that is shared.
 	mount --make-shared /mnt
 
-	note: mount command does not yet support the --make-shared flag.
-	I have included a small C program which does the same by executing
-	'smount /mnt shared'
+	Note: mount(8) command now supports the --make-shared flag,
+	so the sample 'smount' program is no longer needed and has been
+	removed.
 
-	#mount --bind /mnt /tmp
+	# mount --bind /mnt /tmp
 	The above command replicates the mount at /mnt to the mountpoint /tmp
 	and the contents of both the mounts remain identical.
 
@@ -58,8 +58,8 @@ replicas continue to be exactly same.
 	#ls /tmp
 	a b c
 
-	Now lets say we mount a device at /tmp/a
-	#mount /dev/sd0  /tmp/a
+	Now let's say we mount a device at /tmp/a
+	# mount /dev/sd0  /tmp/a
 
 	#ls /tmp/a
 	t1 t2 t2
@@ -80,21 +80,20 @@ replicas continue to be exactly same.
 
 	Here is an example:
 
-	Lets say /mnt has a mount which is shared.
-	#mount --make-shared /mnt
+	Let's say /mnt has a mount which is shared.
+	# mount --make-shared /mnt
 
-	Lets bind mount /mnt to /tmp
-	#mount --bind /mnt /tmp
+	Let's bind mount /mnt to /tmp
+	# mount --bind /mnt /tmp
 
 	the new mount at /tmp becomes a shared mount and it is a replica of
 	the mount at /mnt.
 
-	Now lets make the mount at /tmp; a slave of /mnt
-	#mount --make-slave /tmp
-	[or smount /tmp slave]
+	Now let's make the mount at /tmp; a slave of /mnt
+	# mount --make-slave /tmp
 
-	lets mount /dev/sd0 on /mnt/a
-	#mount /dev/sd0 /mnt/a
+	let's mount /dev/sd0 on /mnt/a
+	# mount /dev/sd0 /mnt/a
 
 	#ls /mnt/a
 	t1 t2 t3
@@ -104,9 +103,9 @@ replicas continue to be exactly same.
 
 	Note the mount event has propagated to the mount at /tmp
 
-	However lets see what happens if we mount something on the mount at /tmp
+	However let's see what happens if we mount something on the mount at /tmp
 
-	#mount /dev/sd1 /tmp/b
+	# mount /dev/sd1 /tmp/b
 
 	#ls /tmp/b
 	s1 s2 s3
@@ -124,12 +123,11 @@ replicas continue to be exactly same.
 
 2d) A unbindable mount is a unbindable private mount
 
-	lets say we have a mount at /mnt and we make is unbindable
+	let's say we have a mount at /mnt and we make is unbindable
 
-	#mount --make-unbindable /mnt
-	 [ smount /mnt  unbindable ]
+	# mount --make-unbindable /mnt
 
-	 Lets try to bind mount this mount somewhere else.
+	 Let's try to bind mount this mount somewhere else.
 	 # mount --bind /mnt /tmp
 	 mount: wrong fs type, bad option, bad superblock on /mnt,
 	        or too many mounted file systems
@@ -137,149 +135,15 @@ replicas continue to be exactly same.
 	Binding a unbindable mount is a invalid operation.
 
 
-3) smount command
+3) Setting mount states
 
-	Currently the mount command is not aware of shared subtree features.
-	Work is in progress to add the support in mount ( util-linux package ).
-	Till then use the following program.
+	The mount command (util-linux package) can be used to set mount
+	states:
 
-	------------------------------------------------------------------------
-	//
-	//this code was developed my Miklos Szeredi <miklos@szeredi.hu>
-	//and modified by Ram Pai <linuxram@us.ibm.com>
-	// sample usage:
-	//              smount /tmp shared
-	//
-	#include <stdio.h>
-	#include <stdlib.h>
-	#include <unistd.h>
-	#include <string.h>
-	#include <sys/mount.h>
-	#include <sys/fsuid.h>
-
-	#ifndef MS_REC
-	#define MS_REC		0x4000	/* 16384: Recursive loopback */
-	#endif
-
-	#ifndef MS_SHARED
-	#define MS_SHARED		1<<20	/* Shared */
-	#endif
-
-	#ifndef MS_PRIVATE
-	#define MS_PRIVATE		1<<18	/* Private */
-	#endif
-
-	#ifndef MS_SLAVE
-	#define MS_SLAVE		1<<19	/* Slave */
-	#endif
-
-	#ifndef MS_UNBINDABLE
-	#define MS_UNBINDABLE		1<<17	/* Unbindable */
-	#endif
-
-	int main(int argc, char *argv[])
-	{
-		int type;
-		if(argc != 3) {
-			fprintf(stderr, "usage: %s dir "
-			"<rshared|rslave|rprivate|runbindable|shared|slave"
-			"|private|unbindable>\n" , argv[0]);
-			return 1;
-		}
-
-		fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]);
-
-		if (strcmp(argv[2],"rshared")==0)
-			type=(MS_SHARED|MS_REC);
-		else if (strcmp(argv[2],"rslave")==0)
-			type=(MS_SLAVE|MS_REC);
-		else if (strcmp(argv[2],"rprivate")==0)
-			type=(MS_PRIVATE|MS_REC);
-		else if (strcmp(argv[2],"runbindable")==0)
-			type=(MS_UNBINDABLE|MS_REC);
-		else if (strcmp(argv[2],"shared")==0)
-			type=MS_SHARED;
-		else if (strcmp(argv[2],"slave")==0)
-			type=MS_SLAVE;
-		else if (strcmp(argv[2],"private")==0)
-			type=MS_PRIVATE;
-		else if (strcmp(argv[2],"unbindable")==0)
-			type=MS_UNBINDABLE;
-		else {
-			fprintf(stderr, "invalid operation: %s\n", argv[2]);
-			return 1;
-		}
-		setfsuid(getuid());
-
-		if(mount("", argv[1], "dontcare", type, "") == -1) {
-			perror("mount");
-			return 1;
-		}
-		return 0;
-	}
-	-----------------------------------------------------------------------
-
-	Copy the above code snippet into smount.c
-	gcc -o smount smount.c
-
-
-	(i) To mark all the mounts under /mnt as shared execute the following
-	command:
-
-	 	smount /mnt rshared
-		the corresponding syntax planned for mount command is
-		mount --make-rshared /mnt
-
-	    just to mark a mount /mnt as shared, execute the following
-	    command:
-	 	smount /mnt shared
-		the corresponding syntax planned for mount command is
-		mount --make-shared /mnt
-
-	(ii) To mark all the shared mounts under /mnt as slave execute the
-	following
-
-	     command:
-		smount /mnt rslave
-		the corresponding syntax planned for mount command is
-		mount --make-rslave /mnt
-
-	    just to mark a mount /mnt as slave, execute the following
-	    command:
-	 	smount /mnt slave
-		the corresponding syntax planned for mount command is
-		mount --make-slave /mnt
-
-	(iii) To mark all the mounts under /mnt as private execute the
-	following command:
-
-		smount /mnt rprivate
-		the corresponding syntax planned for mount command is
-		mount --make-rprivate /mnt
-
-	    just to mark a mount /mnt as private, execute the following
-	    command:
-	 	smount /mnt private
-		the corresponding syntax planned for mount command is
-		mount --make-private /mnt
-
-	      NOTE: by default all the mounts are created as private. But if
-	      you want to change some shared/slave/unbindable  mount as
-	      private at a later point in time, this command can help.
-
-	(iv) To mark all the mounts under /mnt as unbindable execute the
-	following
-
-	     command:
-		smount /mnt runbindable
-		the corresponding syntax planned for mount command is
-		mount --make-runbindable /mnt
-
-	    just to mark a mount /mnt as unbindable, execute the following
-	    command:
-	 	smount /mnt unbindable
-		the corresponding syntax planned for mount command is
-		mount --make-unbindable /mnt
+	mount --make-shared mountpoint
+	mount --make-slave mountpoint
+	mount --make-private mountpoint
+	mount --make-unbindable mountpoint
 
 
 4) Use cases
@@ -350,7 +214,7 @@ replicas continue to be exactly same.
 		mount --rbind / /view/v3
 		mount --rbind / /view/v4
 
-		and if /usr has a versioning filesystem mounted, than that
+		and if /usr has a versioning filesystem mounted, then that
 		mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and
 		/view/v4/usr too
 
@@ -390,7 +254,7 @@ replicas continue to be exactly same.
 
 		For example:
 			mount --make-shared /mnt
-			mount --bin /mnt /tmp
+			mount --bind /mnt /tmp
 
 		The mount at /mnt and that at /tmp are both shared and belong
 		to the same peer group. Anything mounted or unmounted under
@@ -558,7 +422,7 @@ replicas continue to be exactly same.
 	then the subtree under the unbindable mount is pruned in the new
 	location.
 
-	eg: lets say we have the following mount tree.
+	eg: let's say we have the following mount tree.
 
 		A
 	      /   \
@@ -566,7 +430,7 @@ replicas continue to be exactly same.
 	     / \ / \
 	     D E F G
 
-	     Lets say all the mount except the mount C in the tree are
+	     Let's say all the mount except the mount C in the tree are
 	     of a type other than unbindable.
 
 	     If this tree is rbound to say Z
@@ -683,13 +547,13 @@ replicas continue to be exactly same.
 	'b' on mounts that receive propagation from mount 'B' and does not have
 	sub-mounts within them are unmounted.
 
-	Example: Lets say 'B1', 'B2', 'B3' are shared mounts that propagate to
+	Example: Let's say 'B1', 'B2', 'B3' are shared mounts that propagate to
 	each other.
 
-	lets say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
+	let's say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
 	'B1', 'B2' and 'B3' respectively.
 
-	lets say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
+	let's say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
 	mount 'B1', 'B2' and 'B3' respectively.
 
 	if 'C1' is unmounted, all the mounts that are most-recently-mounted on
@@ -710,7 +574,7 @@ replicas continue to be exactly same.
 	A cloned namespace contains all the mounts as that of the parent
 	namespace.
 
-	Lets say 'A' and 'B' are the corresponding mounts in the parent and the
+	Let's say 'A' and 'B' are the corresponding mounts in the parent and the
 	child namespace.
 
 	If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to
@@ -759,11 +623,11 @@ replicas continue to be exactly same.
 		mount --make-slave /mnt
 
 		At this point we have the first mount at /tmp and
-		its root dentry is 1. Lets call this mount 'A'
+		its root dentry is 1. Let's call this mount 'A'
 		And then we have a second mount at /tmp1 with root
-		dentry 2. Lets call this mount 'B'
+		dentry 2. Let's call this mount 'B'
 		Next we have a third mount at /mnt with root dentry
-		mnt. Lets call this mount 'C'
+		mnt. Let's call this mount 'C'
 
 		'B' is the slave of 'A' and 'C' is a slave of 'B'
 		A -> B -> C
@@ -794,7 +658,7 @@ replicas continue to be exactly same.
 
 	Q3 Why is unbindable mount needed?
 
-		Lets say we want to replicate the mount tree at multiple
+		Let's say we want to replicate the mount tree at multiple
 		locations within the same subtree.
 
 		if one rbind mounts a tree within the same subtree 'n' times
@@ -803,7 +667,7 @@ replicas continue to be exactly same.
 		mounts. Here is a example.
 
 		step 1:
-		   lets say the root tree has just two directories with
+		   let's say the root tree has just two directories with
 		   one vfsmount.
 				    root
 				   /    \
@@ -875,7 +739,7 @@ replicas continue to be exactly same.
 		Unclonable mounts come in handy here.
 
 		step 1:
-		   lets say the root tree has just two directories with
+		   let's say the root tree has just two directories with
 		   one vfsmount.
 				    root
 				   /    \
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f49eecf2e57..623f094c9d8 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -536,6 +536,7 @@ struct address_space_operations {
 	/* migrate the contents of a page to the specified target */
 	int (*migratepage) (struct page *, struct page *);
 	int (*launder_page) (struct page *);
+	int (*error_remove_page) (struct mapping *mapping, struct page *page);
 };
 
   writepage: called by the VM to write a dirty page to backing store.
@@ -694,6 +695,12 @@ struct address_space_operations {
   	prevent redirtying the page, it is kept locked during the whole
 	operation.
 
+  error_remove_page: normally set to generic_error_remove_page if truncation
+	is ok for this address space. Used for memory failure handling.
+	Setting this implies you deal with pages going away under you,
+	unless you have them locked or reference counts increased.
+
+
 The File Object
 ===============
 
diff --git a/Documentation/hwmon/coretemp b/Documentation/hwmon/coretemp
index dbbe6c7025b..92267b62db5 100644
--- a/Documentation/hwmon/coretemp
+++ b/Documentation/hwmon/coretemp
@@ -4,7 +4,9 @@ Kernel driver coretemp
 Supported chips:
   * All Intel Core family
     Prefix: 'coretemp'
-    CPUID: family 0x6, models 0xe, 0xf, 0x16, 0x17
+    CPUID: family 0x6, models 0xe (Pentium M DC), 0xf (Core 2 DC 65nm),
+                              0x16 (Core 2 SC 65nm), 0x17 (Penryn 45nm),
+                              0x1a (Nehalem), 0x1c (Atom), 0x1e (Lynnfield)
     Datasheet: Intel 64 and IA-32 Architectures Software Developer's Manual
                Volume 3A: System Programming Guide
                http://softwarecommunity.intel.com/Wiki/Mobility/720.htm
diff --git a/Documentation/hwmon/fscher b/Documentation/hwmon/fscher
deleted file mode 100644
index 64031659aff..00000000000
--- a/Documentation/hwmon/fscher
+++ /dev/null
@@ -1,169 +0,0 @@
-Kernel driver fscher
-====================
-
-Supported chips:
-  * Fujitsu-Siemens Hermes chip
-    Prefix: 'fscher'
-    Addresses scanned: I2C 0x73
-
-Authors:
-        Reinhard Nissl <rnissl@gmx.de> based on work
-        from Hermann Jung <hej@odn.de>,
-        Frodo Looijaard <frodol@dds.nl>,
-        Philip Edelbrock <phil@netroedge.com>
-
-Description
------------
-
-This driver implements support for the Fujitsu-Siemens Hermes chip. It is
-described in the 'Register Set Specification BMC Hermes based Systemboard'
-from Fujitsu-Siemens.
-
-The Hermes chip implements a hardware-based system management, e.g. for
-controlling fan speed and core voltage. There is also a watchdog counter on
-the chip which can trigger an alarm and even shut the system down.
-
-The chip provides three temperature values (CPU, motherboard and
-auxiliary), three voltage values (+12V, +5V and battery) and three fans
-(power supply, CPU and auxiliary).
-
-Temperatures are measured in degrees Celsius. The resolution is 1 degree.
-
-Fan rotation speeds are reported in RPM (rotations per minute). The value
-can be divided by a programmable divider (1, 2 or 4) which is stored on
-the chip.
-
-Voltage sensors (also known as "in" sensors) report their values in volts.
-
-All values are reported as final values from the driver. There is no need
-for further calculations.
-
-
-Detailed description
---------------------
-
-Below you'll find a single line description of all the bit values. With
-this information, you're able to decode e. g. alarms, wdog, etc. To make
-use of the watchdog, you'll need to set the watchdog time and enable the
-watchdog. After that it is necessary to restart the watchdog time within
-the specified period of time, or a system reset will occur.
-
-* revision
-  READING & 0xff = 0x??: HERMES revision identification
-
-* alarms
-  READING & 0x80 = 0x80: CPU throttling active
-  READING & 0x80 = 0x00: CPU running at full speed
-
-  READING & 0x10 = 0x10: software event (see control:1)
-  READING & 0x10 = 0x00: no software event
-
-  READING & 0x08 = 0x08: watchdog event (see wdog:2)
-  READING & 0x08 = 0x00: no watchdog event
-
-  READING & 0x02 = 0x02: thermal event (see temp*:1)
-  READING & 0x02 = 0x00: no thermal event
-
-  READING & 0x01 = 0x01: fan event (see fan*:1)
-  READING & 0x01 = 0x00: no fan event
-
-  READING & 0x13 ! 0x00: ALERT LED is flashing
-
-* control
-  READING & 0x01 = 0x01: software event
-  READING & 0x01 = 0x00: no software event
-
-  WRITING & 0x01 = 0x01: set software event
-  WRITING & 0x01 = 0x00: clear software event
-
-* watchdog_control
-  READING & 0x80 = 0x80: power off on watchdog event while thermal event
-  READING & 0x80 = 0x00: watchdog power off disabled (just system reset enabled)
-
-  READING & 0x40 = 0x40: watchdog timebase 60 seconds (see also wdog:1)
-  READING & 0x40 = 0x00: watchdog timebase  2 seconds
-
-  READING & 0x10 = 0x10: watchdog enabled
-  READING & 0x10 = 0x00: watchdog disabled
-
-  WRITING & 0x80 = 0x80: enable "power off on watchdog event while thermal event"
-  WRITING & 0x80 = 0x00: disable "power off on watchdog event while thermal event"
-
-  WRITING & 0x40 = 0x40: set watchdog timebase to 60 seconds
-  WRITING & 0x40 = 0x00: set watchdog timebase to  2 seconds
-
-  WRITING & 0x20 = 0x20: disable watchdog
-
-  WRITING & 0x10 = 0x10: enable watchdog / restart watchdog time
-
-* watchdog_state
-  READING & 0x02 = 0x02: watchdog system reset occurred
-  READING & 0x02 = 0x00: no watchdog system reset occurred
-
-  WRITING & 0x02 = 0x02: clear watchdog event
-
-* watchdog_preset
-  READING & 0xff = 0x??: configured watch dog time in units (see wdog:3 0x40)
-
-  WRITING & 0xff = 0x??: configure watch dog time in units
-
-* in*     (0: +5V, 1: +12V, 2: onboard 3V battery)
-  READING: actual voltage value
-
-* temp*_status   (1: CPU sensor, 2: onboard sensor, 3: auxiliary sensor)
-  READING & 0x02 = 0x02: thermal event (overtemperature)
-  READING & 0x02 = 0x00: no thermal event
-
-  READING & 0x01 = 0x01: sensor is working
-  READING & 0x01 = 0x00: sensor is faulty
-
-  WRITING & 0x02 = 0x02: clear thermal event
-
-* temp*_input   (1: CPU sensor, 2: onboard sensor, 3: auxiliary sensor)
-  READING: actual temperature value
-
-* fan*_status   (1: power supply fan, 2: CPU fan, 3: auxiliary fan)
-  READING & 0x04 = 0x04: fan event (fan fault)
-  READING & 0x04 = 0x00: no fan event
-
-  WRITING & 0x04 = 0x04: clear fan event
-
-* fan*_div (1: power supply fan, 2: CPU fan, 3: auxiliary fan)
-  	Divisors 2,4 and 8 are supported, both for reading and writing
-
-* fan*_pwm   (1: power supply fan, 2: CPU fan, 3: auxiliary fan)
-  READING & 0xff = 0x00: fan may be switched off
-  READING & 0xff = 0x01: fan must run at least at minimum speed (supply: 6V)
-  READING & 0xff = 0xff: fan must run at maximum speed (supply: 12V)
-  READING & 0xff = 0x??: fan must run at least at given speed (supply: 6V..12V)
-
-  WRITING & 0xff = 0x00: fan may be switched off
-  WRITING & 0xff = 0x01: fan must run at least at minimum speed (supply: 6V)
-  WRITING & 0xff = 0xff: fan must run at maximum speed (supply: 12V)
-  WRITING & 0xff = 0x??: fan must run at least at given speed (supply: 6V..12V)
-
-* fan*_input   (1: power supply fan, 2: CPU fan, 3: auxiliary fan)
-  READING: actual RPM value
-
-
-Limitations
------------
-
-* Measuring fan speed
-It seems that the chip counts "ripples" (typical fans produce 2 ripples per
-rotation while VERAX fans produce 18) in a 9-bit register. This register is
-read out every second, then the ripple prescaler (2, 4 or 8) is applied and
-the result is stored in the 8 bit output register. Due to the limitation of
-the counting register to 9 bits, it is impossible to measure a VERAX fan
-properly (even with a prescaler of 8). At its maximum speed of 3500 RPM the
-fan produces 1080 ripples per second which causes the counting register to
-overflow twice, leading to only 186 RPM.
-
-* Measuring input voltages
-in2 ("battery") reports the voltage of the onboard lithium battery and not
-+3.3V from the power supply.
-
-* Undocumented features
-Fujitsu-Siemens Computers has not documented all features of the chip so
-far. Their software, System Guard, shows that there are a still some
-features which cannot be controlled by this implementation.
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index aafca0a8f66..947374977ca 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -135,6 +135,7 @@ Code	Seq#	Include File		Comments
 					<http://mikonos.dia.unisa.it/tcfs>
 'l'	40-7F	linux/udf_fs_i.h	in development:
 					<http://sourceforge.net/projects/linux-udf/>
+'m'	00-09	linux/mmtimer.h
 'm'	all	linux/mtio.h		conflict!
 'm'	all	linux/soundcard.h	conflict!
 'm'	all	linux/synclink.h	conflict!
diff --git a/Documentation/kbuild/kbuild.txt b/Documentation/kbuild/kbuild.txt
index f3355b6812d..bb3bf38f03d 100644
--- a/Documentation/kbuild/kbuild.txt
+++ b/Documentation/kbuild/kbuild.txt
@@ -65,6 +65,22 @@ INSTALL_PATH
 INSTALL_PATH specifies where to place the updated kernel and system map
 images. Default is /boot, but you can set it to other values.
 
+INSTALLKERNEL
+--------------------------------------------------
+Install script called when using "make install".
+The default name is "installkernel".
+
+The script will be called with the following arguments:
+    $1 - kernel version
+    $2 - kernel image file
+    $3 - kernel map file
+    $4 - default install path (use root directory if blank)
+
+The implmentation of "make install" is architecture specific
+and it may differ from the above.
+
+INSTALLKERNEL is provided to enable the possibility to
+specify a custom installer when cross compiling a kernel.
 
 MODLIB
 --------------------------------------------------
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index d76cfd8712e..71c602d6168 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -18,6 +18,7 @@ This document describes the Linux kernel Makefiles.
 	   --- 3.9 Dependency tracking
 	   --- 3.10 Special Rules
 	   --- 3.11 $(CC) support functions
+	   --- 3.12 $(LD) support functions
 
 	=== 4 Host Program support
 	   --- 4.1 Simple Host Program
@@ -435,14 +436,14 @@ more details, with real examples.
 	The second argument is optional, and if supplied will be used
 	if first argument is not supported.
 
-    ld-option
-	ld-option is used to check if $(CC) when used to link object files
+    cc-ldoption
+	cc-ldoption is used to check if $(CC) when used to link object files
 	supports the given option.  An optional second option may be
 	specified if first option are not supported.
 
 	Example:
 		#arch/i386/kernel/Makefile
-		vsyscall-flags += $(call ld-option, -Wl$(comma)--hash-style=sysv)
+		vsyscall-flags += $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
 
 	In the above example, vsyscall-flags will be assigned the option
 	-Wl$(comma)--hash-style=sysv if it is supported by $(CC).
@@ -570,6 +571,19 @@ more details, with real examples.
 			endif
 		endif
 
+--- 3.12 $(LD) support functions
+
+    ld-option
+	ld-option is used to check if $(LD) supports the supplied option.
+	ld-option takes two options as arguments.
+	The second argument is an optional option that can be used if the
+	first option is not supported by $(LD).
+
+	Example:
+		#Makefile
+		LDFLAGS_vmlinux += $(call really-ld-option, -X)
+
+
 === 4 Host Program support
 
 Kbuild supports building executables on the host for use during the
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 1458448436c..62682500878 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -96,13 +96,16 @@ handles that the Linux kernel will allocate. When you get lots
 of error messages about running out of file handles, you might
 want to increase this limit.
 
-The three values in file-nr denote the number of allocated
-file handles, the number of unused file handles and the maximum
-number of file handles. When the allocated file handles come
-close to the maximum, but the number of unused file handles is
-significantly greater than 0, you've encountered a peak in your 
-usage of file handles and you don't need to increase the maximum.
-
+Historically, the three values in file-nr denoted the number of
+allocated file handles, the number of allocated but unused file
+handles, and the maximum number of file handles. Linux 2.6 always
+reports 0 as the number of free file handles -- this is not an
+error, it just means that the number of allocated file handles
+exactly matches the number of used file handles.
+
+Attempts to allocate more file descriptors than file-max are
+reported with printk, look for "VFS: file-max limit <number>
+reached".
 ==============================================================
 
 nr_open:
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index b3d8b492274..a028b92001e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -22,6 +22,7 @@ show up in /proc/sys/kernel:
 - callhome		     [ S390 only ]
 - auto_msgmni
 - core_pattern
+- core_pipe_limit
 - core_uses_pid
 - ctrl-alt-del
 - dentry-state
@@ -135,6 +136,27 @@ core_pattern is used to specify a core dumpfile pattern name.
 
 ==============================================================
 
+core_pipe_limit:
+
+This sysctl is only applicable when core_pattern is configured to pipe core
+files to user space helper a (when the first character of core_pattern is a '|',
+see above).  When collecting cores via a pipe to an application, it is
+occasionally usefull for the collecting application to gather data about the
+crashing process from its /proc/pid directory.  In order to do this safely, the
+kernel must wait for the collecting process to exit, so as not to remove the
+crashing processes proc files prematurely.  This in turn creates the possibility
+that a misbehaving userspace collecting process can block the reaping of a
+crashed process simply by never exiting.  This sysctl defends against that.  It
+defines how many concurrent crashing processes may be piped to user space
+applications in parallel.  If this value is exceeded, then those crashing
+processes above that value are noted via the kernel log and their cores are
+skipped.  0 is a special value, indicating that unlimited processes may be
+captured in parallel, but that no waiting will take place (i.e. the collecting
+process is not guaranteed access to /proc/<crahing pid>/).  This value defaults
+to 0.
+
+==============================================================
+
 core_uses_pid:
 
 The default coredump filename is "core".  By setting
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index e6fb1ec2744..a6e360d2055 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/vm:
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
+- memory_failure_early_kill
+- memory_failure_recovery
 - min_free_kbytes
 - min_slab_ratio
 - min_unmapped_ratio
@@ -53,7 +55,6 @@ Currently, these files are in /proc/sys/vm:
 - vfs_cache_pressure
 - zone_reclaim_mode
 
-
 ==============================================================
 
 block_dump
@@ -275,6 +276,44 @@ e.g., up to one or two maps per allocation.
 
 The default value is 65536.
 
+=============================================================
+
+memory_failure_early_kill:
+
+Control how to kill processes when uncorrected memory error (typically
+a 2bit error in a memory module) is detected in the background by hardware
+that cannot be handled by the kernel. In some cases (like the page
+still having a valid copy on disk) the kernel will handle the failure
+transparently without affecting any applications. But if there is
+no other uptodate copy of the data it will kill to prevent any data
+corruptions from propagating.
+
+1: Kill all processes that have the corrupted and not reloadable page mapped
+as soon as the corruption is detected.  Note this is not supported
+for a few types of pages, like kernel internally allocated data or
+the swap cache, but works for the majority of user pages.
+
+0: Only unmap the corrupted page from all processes and only kill a process
+who tries to access it.
+
+The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
+handle this if they want to.
+
+This is only active on architectures/platforms with advanced machine
+check handling and depends on the hardware capabilities.
+
+Applications can override this setting individually with the PR_MCE_KILL prctl
+
+==============================================================
+
+memory_failure_recovery
+
+Enable memory failure recovery (when supported by the platform)
+
+1: Attempt recovery.
+
+0: Always panic on a memory failure.
+
 ==============================================================
 
 min_free_kbytes:
diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore
index 33e8a023df0..09b164a5700 100644
--- a/Documentation/vm/.gitignore
+++ b/Documentation/vm/.gitignore
@@ -1 +1,2 @@
+page-types
 slabinfo
diff --git a/Documentation/vm/locking b/Documentation/vm/locking
index f366fa95617..25fadb44876 100644
--- a/Documentation/vm/locking
+++ b/Documentation/vm/locking
@@ -80,7 +80,7 @@ Note: PTL can also be used to guarantee that no new clones using the
 mm start up ... this is a loose form of stability on mm_users. For
 example, it is used in copy_mm to protect against a racing tlb_gather_mmu
 single address space optimization, so that the zap_page_range (from
-vmtruncate) does not lose sending ipi's to cloned threads that might 
+truncate) does not lose sending ipi's to cloned threads that might
 be spawned underneath it and go to user mode to drag in pte's into tlbs.
 
 swap_lock
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
index 3eda8ea0085..fa1a30d9e9d 100644
--- a/Documentation/vm/page-types.c
+++ b/Documentation/vm/page-types.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com>
  */
 
+#define _LARGEFILE64_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -13,12 +14,33 @@
 #include <string.h>
 #include <getopt.h>
 #include <limits.h>
+#include <assert.h>
 #include <sys/types.h>
 #include <sys/errno.h>
 #include <sys/fcntl.h>
 
 
 /*
+ * pagemap kernel ABI bits
+ */
+
+#define PM_ENTRY_BYTES      sizeof(uint64_t)
+#define PM_STATUS_BITS      3
+#define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
+#define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
+#define PM_STATUS(nr)       (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
+#define PM_PSHIFT_BITS      6
+#define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
+#define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
+#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
+#define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
+
+#define PM_PRESENT          PM_STATUS(4LL)
+#define PM_SWAP             PM_STATUS(2LL)
+
+
+/*
  * kernel page flags
  */
 
@@ -126,6 +148,14 @@ static int		nr_addr_ranges;
 static unsigned long	opt_offset[MAX_ADDR_RANGES];
 static unsigned long	opt_size[MAX_ADDR_RANGES];
 
+#define MAX_VMAS	10240
+static int		nr_vmas;
+static unsigned long	pg_start[MAX_VMAS];
+static unsigned long	pg_end[MAX_VMAS];
+static unsigned long	voffset;
+
+static int		pagemap_fd;
+
 #define MAX_BIT_FILTERS	64
 static int		nr_bit_filters;
 static uint64_t		opt_mask[MAX_BIT_FILTERS];
@@ -135,7 +165,6 @@ static int		page_size;
 
 #define PAGES_BATCH	(64 << 10)	/* 64k pages */
 static int		kpageflags_fd;
-static uint64_t		kpageflags_buf[KPF_BYTES * PAGES_BATCH];
 
 #define HASH_SHIFT	13
 #define HASH_SIZE	(1 << HASH_SHIFT)
@@ -158,6 +187,11 @@ static uint64_t 	page_flags[HASH_SIZE];
 	type __min2 = (y);			\
 	__min1 < __min2 ? __min1 : __min2; })
 
+#define max_t(type, x, y) ({			\
+	type __max1 = (x);			\
+	type __max2 = (y);			\
+	__max1 > __max2 ? __max1 : __max2; })
+
 static unsigned long pages2mb(unsigned long pages)
 {
 	return (pages * page_size) >> 20;
@@ -224,26 +258,34 @@ static char *page_flag_longname(uint64_t flags)
 static void show_page_range(unsigned long offset, uint64_t flags)
 {
 	static uint64_t      flags0;
+	static unsigned long voff;
 	static unsigned long index;
 	static unsigned long count;
 
-	if (flags == flags0 && offset == index + count) {
+	if (flags == flags0 && offset == index + count &&
+	    (!opt_pid || voffset == voff + count)) {
 		count++;
 		return;
 	}
 
-	if (count)
-		printf("%lu\t%lu\t%s\n",
+	if (count) {
+		if (opt_pid)
+			printf("%lx\t", voff);
+		printf("%lx\t%lx\t%s\n",
 				index, count, page_flag_name(flags0));
+	}
 
 	flags0 = flags;
 	index  = offset;
+	voff   = voffset;
 	count  = 1;
 }
 
 static void show_page(unsigned long offset, uint64_t flags)
 {
-	printf("%lu\t%s\n", offset, page_flag_name(flags));
+	if (opt_pid)
+		printf("%lx\t", voffset);
+	printf("%lx\t%s\n", offset, page_flag_name(flags));
 }
 
 static void show_summary(void)
@@ -383,6 +425,8 @@ static void walk_pfn(unsigned long index, unsigned long count)
 	lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET);
 
 	while (count) {
+		uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH];
+
 		batch = min_t(unsigned long, count, PAGES_BATCH);
 		n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES);
 		if (n == 0)
@@ -404,6 +448,81 @@ static void walk_pfn(unsigned long index, unsigned long count)
 	}
 }
 
+
+#define PAGEMAP_BATCH	4096
+static unsigned long task_pfn(unsigned long pgoff)
+{
+	static uint64_t buf[PAGEMAP_BATCH];
+	static unsigned long start;
+	static long count;
+	uint64_t pfn;
+
+	if (pgoff < start || pgoff >= start + count) {
+		if (lseek64(pagemap_fd,
+			    (uint64_t)pgoff * PM_ENTRY_BYTES,
+			    SEEK_SET) < 0) {
+			perror("pagemap seek");
+			exit(EXIT_FAILURE);
+		}
+		count = read(pagemap_fd, buf, sizeof(buf));
+		if (count == 0)
+			return 0;
+		if (count < 0) {
+			perror("pagemap read");
+			exit(EXIT_FAILURE);
+		}
+		if (count % PM_ENTRY_BYTES) {
+			fatal("pagemap read not aligned.\n");
+			exit(EXIT_FAILURE);
+		}
+		count /= PM_ENTRY_BYTES;
+		start = pgoff;
+	}
+
+	pfn = buf[pgoff - start];
+	if (pfn & PM_PRESENT)
+		pfn = PM_PFRAME(pfn);
+	else
+		pfn = 0;
+
+	return pfn;
+}
+
+static void walk_task(unsigned long index, unsigned long count)
+{
+	int i = 0;
+	const unsigned long end = index + count;
+
+	while (index < end) {
+
+		while (pg_end[i] <= index)
+			if (++i >= nr_vmas)
+				return;
+		if (pg_start[i] >= end)
+			return;
+
+		voffset = max_t(unsigned long, pg_start[i], index);
+		index   = min_t(unsigned long, pg_end[i], end);
+
+		assert(voffset < index);
+		for (; voffset < index; voffset++) {
+			unsigned long pfn = task_pfn(voffset);
+			if (pfn)
+				walk_pfn(pfn, 1);
+		}
+	}
+}
+
+static void add_addr_range(unsigned long offset, unsigned long size)
+{
+	if (nr_addr_ranges >= MAX_ADDR_RANGES)
+		fatal("too many addr ranges\n");
+
+	opt_offset[nr_addr_ranges] = offset;
+	opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);
+	nr_addr_ranges++;
+}
+
 static void walk_addr_ranges(void)
 {
 	int i;
@@ -415,10 +534,13 @@ static void walk_addr_ranges(void)
 	}
 
 	if (!nr_addr_ranges)
-		walk_pfn(0, ULONG_MAX);
+		add_addr_range(0, ULONG_MAX);
 
 	for (i = 0; i < nr_addr_ranges; i++)
-		walk_pfn(opt_offset[i], opt_size[i]);
+		if (!opt_pid)
+			walk_pfn(opt_offset[i], opt_size[i]);
+		else
+			walk_task(opt_offset[i], opt_size[i]);
 
 	close(kpageflags_fd);
 }
@@ -446,8 +568,8 @@ static void usage(void)
 "            -r|--raw                  Raw mode, for kernel developers\n"
 "            -a|--addr    addr-spec    Walk a range of pages\n"
 "            -b|--bits    bits-spec    Walk pages with specified bits\n"
-#if 0 /* planned features */
 "            -p|--pid     pid          Walk process address space\n"
+#if 0 /* planned features */
 "            -f|--file    filename     Walk file address space\n"
 #endif
 "            -l|--list                 Show page details in ranges\n"
@@ -459,7 +581,7 @@ static void usage(void)
 "            N+M                       pages range from N to N+M-1\n"
 "            N,M                       pages range from N to M-1\n"
 "            N,                        pages range from N to end\n"
-"            ,M                        pages range from 0 to M\n"
+"            ,M                        pages range from 0 to M-1\n"
 "bits-spec:\n"
 "            bit1,bit2                 (flags & (bit1|bit2)) != 0\n"
 "            bit1,bit2=bit1            (flags & (bit1|bit2)) == bit1\n"
@@ -496,21 +618,57 @@ static unsigned long long parse_number(const char *str)
 
 static void parse_pid(const char *str)
 {
+	FILE *file;
+	char buf[5000];
+
 	opt_pid = parse_number(str);
-}
 
-static void parse_file(const char *name)
-{
+	sprintf(buf, "/proc/%d/pagemap", opt_pid);
+	pagemap_fd = open(buf, O_RDONLY);
+	if (pagemap_fd < 0) {
+		perror(buf);
+		exit(EXIT_FAILURE);
+	}
+
+	sprintf(buf, "/proc/%d/maps", opt_pid);
+	file = fopen(buf, "r");
+	if (!file) {
+		perror(buf);
+		exit(EXIT_FAILURE);
+	}
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		unsigned long vm_start;
+		unsigned long vm_end;
+		unsigned long long pgoff;
+		int major, minor;
+		char r, w, x, s;
+		unsigned long ino;
+		int n;
+
+		n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
+			   &vm_start,
+			   &vm_end,
+			   &r, &w, &x, &s,
+			   &pgoff,
+			   &major, &minor,
+			   &ino);
+		if (n < 10) {
+			fprintf(stderr, "unexpected line: %s\n", buf);
+			continue;
+		}
+		pg_start[nr_vmas] = vm_start / page_size;
+		pg_end[nr_vmas] = vm_end / page_size;
+		if (++nr_vmas >= MAX_VMAS) {
+			fprintf(stderr, "too many VMAs\n");
+			break;
+		}
+	}
+	fclose(file);
 }
 
-static void add_addr_range(unsigned long offset, unsigned long size)
+static void parse_file(const char *name)
 {
-	if (nr_addr_ranges >= MAX_ADDR_RANGES)
-		fatal("too much addr ranges\n");
-
-	opt_offset[nr_addr_ranges] = offset;
-	opt_size[nr_addr_ranges] = size;
-	nr_addr_ranges++;
 }
 
 static void parse_addr_range(const char *optarg)
@@ -676,8 +834,10 @@ int main(int argc, char *argv[])
 		}
 	}
 
+	if (opt_list && opt_pid)
+		printf("voffset\t");
 	if (opt_list == 1)
-		printf("offset\tcount\tflags\n");
+		printf("offset\tlen\tflags\n");
 	if (opt_list == 2)
 		printf("offset\tflags\n");
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 7c1c0b05b29..e797c4d48cf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -257,12 +257,6 @@ W:	http://www.lesswatts.org/projects/acpi/
 S:	Supported
 F:	drivers/acpi/fan.c
 
-ACPI PCI HOTPLUG DRIVER
-M:	Kristen Carlson Accardi <kristen.c.accardi@intel.com>
-L:	linux-pci@vger.kernel.org
-S:	Supported
-F:	drivers/pci/hotplug/acpi*
-
 ACPI THERMAL DRIVER
 M:	Zhang Rui <rui.zhang@intel.com>
 L:	linux-acpi@vger.kernel.org
@@ -2331,7 +2325,9 @@ S:	Orphan
 F:	drivers/hwmon/
 
 HARDWARE RANDOM NUMBER GENERATOR CORE
-S:	Orphan
+M:	Matt Mackall <mpm@selenic.com>
+M:	Herbert Xu <herbert@gondor.apana.org.au>
+S:	Odd fixes
 F:	Documentation/hw_random.txt
 F:	drivers/char/hw_random/
 F:	include/linux/hw_random.h
@@ -4003,11 +3999,11 @@ F:	Documentation/PCI/
 F:	drivers/pci/
 F:	include/linux/pci*
 
-PCIE HOTPLUG DRIVER
-M:	Kristen Carlson Accardi <kristen.c.accardi@intel.com>
+PCI HOTPLUG
+M:	Jesse Barnes <jbarnes@virtuousgeek.org>
 L:	linux-pci@vger.kernel.org
 S:	Supported
-F:	drivers/pci/pcie/
+F:	drivers/pci/hotplug
 
 PCMCIA SUBSYSTEM
 P:	Linux PCMCIA Team
@@ -4670,12 +4666,6 @@ F:	drivers/serial/serial_lh7a40x.c
 F:	drivers/usb/gadget/lh7a40*
 F:	drivers/usb/host/ohci-lh7a40*
 
-SHPC HOTPLUG DRIVER
-M:	Kristen Carlson Accardi <kristen.c.accardi@intel.com>
-L:	linux-pci@vger.kernel.org
-S:	Supported
-F:	drivers/pci/hotplug/shpchp*
-
 SIMPLE FIRMWARE INTERFACE (SFI)
 P:	Len Brown
 M:	lenb@kernel.org
@@ -4687,7 +4677,6 @@ F:	arch/x86/kernel/*sfi*
 F:	drivers/sfi/
 F:	include/linux/sfi*.h
 
-
 SIMTEC EB110ATX (Chalice CATS)
 P:	Ben Dooks
 M:	Vincent Sanders <support@simtec.co.uk>
diff --git a/Makefile b/Makefile
index 433493a2b77..f908accd332 100644
--- a/Makefile
+++ b/Makefile
@@ -179,9 +179,46 @@ SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
 # Alternatively CROSS_COMPILE can be set in the environment.
 # Default value for CROSS_COMPILE is not to prefix executables
 # Note: Some architectures assign CROSS_COMPILE in their arch/*/Makefile
+#
+# To force ARCH and CROSS_COMPILE settings include kernel.* files
+# in the kernel tree - do not patch this file.
 export KBUILD_BUILDHOST := $(SUBARCH)
-ARCH		?= $(SUBARCH)
-CROSS_COMPILE	?=
+
+# Kbuild save the ARCH and CROSS_COMPILE setting in kernel.* files.
+# Restore these settings and check that user did not specify
+# conflicting values.
+
+saved_arch  := $(shell cat include/generated/kernel.arch  2> /dev/null)
+saved_cross := $(shell cat include/generated/kernel.cross 2> /dev/null)
+
+ifneq ($(CROSS_COMPILE),)
+        ifneq ($(saved_cross),)
+                ifneq ($(CROSS_COMPILE),$(saved_cross))
+                        $(error CROSS_COMPILE changed from \
+                                "$(saved_cross)" to \
+                                 to "$(CROSS_COMPILE)". \
+                                 Use "make mrproper" to fix it up)
+                endif
+        endif
+else
+    CROSS_COMPILE := $(saved_cross)
+endif
+
+ifneq ($(ARCH),)
+        ifneq ($(saved_arch),)
+                ifneq ($(saved_arch),$(ARCH))
+                        $(error ARCH changed from \
+                                "$(saved_arch)" to "$(ARCH)". \
+                                 Use "make mrproper" to fix it up)
+                endif
+        endif
+else
+        ifneq ($(saved_arch),)
+                ARCH := $(saved_arch)
+        else
+                ARCH := $(SUBARCH)
+        endif
+endif
 
 # Architecture as present in compile.h
 UTS_MACHINE 	:= $(ARCH)
@@ -315,6 +352,7 @@ OBJCOPY		= $(CROSS_COMPILE)objcopy
 OBJDUMP		= $(CROSS_COMPILE)objdump
 AWK		= awk
 GENKSYMS	= scripts/genksyms/genksyms
+INSTALLKERNEL  := installkernel
 DEPMOD		= /sbin/depmod
 KALLSYMS	= scripts/kallsyms
 PERL		= perl
@@ -353,7 +391,8 @@ KERNELVERSION = $(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
 export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
 export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC
-export CPP AR NM STRIP OBJCOPY OBJDUMP MAKE AWK GENKSYMS PERL UTS_MACHINE
+export CPP AR NM STRIP OBJCOPY OBJDUMP
+export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE
 export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
 
 export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
@@ -444,6 +483,11 @@ ifeq ($(config-targets),1)
 include $(srctree)/arch/$(SRCARCH)/Makefile
 export KBUILD_DEFCONFIG KBUILD_KCONFIG
 
+# save ARCH & CROSS_COMPILE settings
+$(shell mkdir -p include/generated &&                            \
+        echo $(ARCH)          > include/generated/kernel.arch && \
+        echo $(CROSS_COMPILE) > include/generated/kernel.cross)
+
 config: scripts_basic outputmakefile FORCE
 	$(Q)mkdir -p include/linux include/config
 	$(Q)$(MAKE) $(build)=scripts/kconfig $@
@@ -571,6 +615,9 @@ KBUILD_CFLAGS	+= $(call cc-option,-fno-strict-overflow)
 # revert to pre-gcc-4.4 behaviour of .eh_frame
 KBUILD_CFLAGS	+= $(call cc-option,-fno-dwarf2-cfi-asm)
 
+# conserve stack if available
+KBUILD_CFLAGS   += $(call cc-option,-fconserve-stack)
+
 # Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
 # But warn user when we do so
 warn-assign = \
@@ -591,12 +638,12 @@ endif
 
 # Use --build-id when available.
 LDFLAGS_BUILD_ID = $(patsubst -Wl$(comma)%,%,\
-			      $(call ld-option, -Wl$(comma)--build-id,))
+			      $(call cc-ldoption, -Wl$(comma)--build-id,))
 LDFLAGS_MODULE += $(LDFLAGS_BUILD_ID)
 LDFLAGS_vmlinux += $(LDFLAGS_BUILD_ID)
 
 ifeq ($(CONFIG_STRIP_ASM_SYMS),y)
-LDFLAGS_vmlinux	+= -X
+LDFLAGS_vmlinux	+= $(call ld-option, -X,)
 endif
 
 # Default kernel image to build when no specific target is given.
@@ -980,11 +1027,6 @@ prepare0: archprepare FORCE
 # All the preparing..
 prepare: prepare0
 
-# Leave this as default for preprocessing vmlinux.lds.S, which is now
-# done in arch/$(ARCH)/kernel/Makefile
-
-export CPPFLAGS_vmlinux.lds += -P -C -U$(ARCH)
-
 # The asm symlink changes when $(ARCH) changes.
 # Detect this and ask user to run make mrproper
 # If asm is a stale symlink (point to dir that does not exist) remove it
diff --git a/arch/alpha/include/asm/fcntl.h b/arch/alpha/include/asm/fcntl.h
index 25da0017ec8..e42823e954a 100644
--- a/arch/alpha/include/asm/fcntl.h
+++ b/arch/alpha/include/asm/fcntl.h
@@ -26,6 +26,8 @@
 #define F_GETOWN	6	/*  for sockets. */
 #define F_SETSIG	10	/*  for sockets. */
 #define F_GETSIG	11	/*  for sockets. */
+#define F_SETOWN_EX	12
+#define F_GETOWN_EX	13
 
 /* for posix fcntl() and lockf() */
 #define F_RDLCK		1
diff --git a/arch/alpha/include/asm/smp.h b/arch/alpha/include/asm/smp.h
index 547e90951ce..3f390e8cc0b 100644
--- a/arch/alpha/include/asm/smp.h
+++ b/arch/alpha/include/asm/smp.h
@@ -47,7 +47,7 @@ extern struct cpuinfo_alpha cpu_data[NR_CPUS];
 extern int smp_num_cpus;
 
 extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi(cpumask_t mask);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 
 #else /* CONFIG_SMP */
 
diff --git a/arch/alpha/include/asm/topology.h b/arch/alpha/include/asm/topology.h
index b4f284c72ff..36b3a30ba0e 100644
--- a/arch/alpha/include/asm/topology.h
+++ b/arch/alpha/include/asm/topology.h
@@ -22,23 +22,6 @@ static inline int cpu_to_node(int cpu)
 	return node;
 }
 
-static inline cpumask_t node_to_cpumask(int node)
-{
-	cpumask_t node_cpu_mask = CPU_MASK_NONE;
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		if (cpu_to_node(cpu) == node)
-			cpu_set(cpu, node_cpu_mask);
-	}
-
-#ifdef DEBUG_NUMA
-	printk("node %d: cpu_mask: %016lx\n", node, node_cpu_mask);
-#endif
-
-	return node_cpu_mask;
-}
-
 extern struct cpumask node_to_cpumask_map[];
 /* FIXME: This is dumb, recalculating every time.  But simple. */
 static const struct cpumask *cpumask_of_node(int node)
@@ -55,7 +38,6 @@ static const struct cpumask *cpumask_of_node(int node)
 	return &node_to_cpumask_map[node];
 }
 
-#define pcibus_to_cpumask(bus)	(cpu_online_map)
 #define cpumask_of_pcibus(bus)	(cpu_online_mask)
 
 #endif /* !CONFIG_NUMA */
diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c
index e302daecbe5..8e059e58b0a 100644
--- a/arch/alpha/kernel/core_marvel.c
+++ b/arch/alpha/kernel/core_marvel.c
@@ -1016,7 +1016,7 @@ marvel_agp_bind_memory(alpha_agp_info *agp, off_t pg_start, struct agp_memory *m
 {
 	struct marvel_agp_aperture *aper = agp->aperture.sysdata;
 	return iommu_bind(aper->arena, aper->pg_start + pg_start, 
-			  mem->page_count, mem->memory);
+			  mem->page_count, mem->pages);
 }
 
 static int 
diff --git a/arch/alpha/kernel/core_titan.c b/arch/alpha/kernel/core_titan.c
index 319fcb74611..76686497b1e 100644
--- a/arch/alpha/kernel/core_titan.c
+++ b/arch/alpha/kernel/core_titan.c
@@ -680,7 +680,7 @@ titan_agp_bind_memory(alpha_agp_info *agp, off_t pg_start, struct agp_memory *me
 {
 	struct titan_agp_aperture *aper = agp->aperture.sysdata;
 	return iommu_bind(aper->arena, aper->pg_start + pg_start, 
-			  mem->page_count, mem->memory);
+			  mem->page_count, mem->pages);
 }
 
 static int 
diff --git a/arch/alpha/kernel/pci_impl.h b/arch/alpha/kernel/pci_impl.h
index 00edd04b585..85457b2d451 100644
--- a/arch/alpha/kernel/pci_impl.h
+++ b/arch/alpha/kernel/pci_impl.h
@@ -198,7 +198,7 @@ extern unsigned long size_for_memory(unsigned long max);
 
 extern int iommu_reserve(struct pci_iommu_arena *, long, long);
 extern int iommu_release(struct pci_iommu_arena *, long, long);
-extern int iommu_bind(struct pci_iommu_arena *, long, long, unsigned long *);
+extern int iommu_bind(struct pci_iommu_arena *, long, long, struct page **);
 extern int iommu_unbind(struct pci_iommu_arena *, long, long);
 
 
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index d15aedfe606..8449504f5e0 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -876,7 +876,7 @@ iommu_release(struct pci_iommu_arena *arena, long pg_start, long pg_count)
 
 int
 iommu_bind(struct pci_iommu_arena *arena, long pg_start, long pg_count, 
-	   unsigned long *physaddrs)
+	   struct page **pages)
 {
 	unsigned long flags;
 	unsigned long *ptes;
@@ -896,7 +896,7 @@ iommu_bind(struct pci_iommu_arena *arena, long pg_start, long pg_count,
 	}
 		
 	for(i = 0, j = pg_start; i < pg_count; i++, j++)
-		ptes[j] = mk_iommu_pte(physaddrs[i]);
+		ptes[j] = mk_iommu_pte(page_to_phys(pages[i]));
 
 	spin_unlock_irqrestore(&arena->lock, flags);
 
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 3a2fb7a02db..289039bb6bb 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -19,7 +19,6 @@
 #include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/user.h>
-#include <linux/utsname.h>
 #include <linux/time.h>
 #include <linux/major.h>
 #include <linux/stat.h>
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index b1fe5674c3a..42aa078a5e4 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -548,16 +548,16 @@ setup_profiling_timer(unsigned int multiplier)
 
 
 static void
-send_ipi_message(cpumask_t to_whom, enum ipi_message_type operation)
+send_ipi_message(const struct cpumask *to_whom, enum ipi_message_type operation)
 {
 	int i;
 
 	mb();
-	for_each_cpu_mask(i, to_whom)
+	for_each_cpu(i, to_whom)
 		set_bit(operation, &ipi_data[i].bits);
 
 	mb();
-	for_each_cpu_mask(i, to_whom)
+	for_each_cpu(i, to_whom)
 		wripir(i);
 }
 
@@ -624,7 +624,7 @@ smp_send_reschedule(int cpu)
 		printk(KERN_WARNING
 		       "smp_send_reschedule: Sending IPI to self.\n");
 #endif
-	send_ipi_message(cpumask_of_cpu(cpu), IPI_RESCHEDULE);
+	send_ipi_message(cpumask_of(cpu), IPI_RESCHEDULE);
 }
 
 void
@@ -636,17 +636,17 @@ smp_send_stop(void)
 	if (hard_smp_processor_id() != boot_cpu_id)
 		printk(KERN_WARNING "smp_send_stop: Not on boot cpu.\n");
 #endif
-	send_ipi_message(to_whom, IPI_CPU_STOP);
+	send_ipi_message(&to_whom, IPI_CPU_STOP);
 }
 
-void arch_send_call_function_ipi(cpumask_t mask)
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 {
 	send_ipi_message(mask, IPI_CALL_FUNC);
 }
 
 void arch_send_call_function_single_ipi(int cpu)
 {
-	send_ipi_message(cpumask_of_cpu(cpu), IPI_CALL_FUNC_SINGLE);
+	send_ipi_message(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE);
 }
 
 static void
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 54661125a8b..a73caaf6676 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -14,7 +14,7 @@ LDFLAGS_vmlinux	:=-p --no-undefined -X
 ifeq ($(CONFIG_CPU_ENDIAN_BE8),y)
 LDFLAGS_vmlinux	+= --be8
 endif
-CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
+
 OBJCOPYFLAGS	:=-O binary -R .note -R .note.gnu.build-id -R .comment -S
 GZFLAGS		:=-9
 #KBUILD_CFLAGS	+=-pipe
@@ -279,7 +279,7 @@ define archhelp
   echo  '                  (supply initrd image via make variable INITRD=<path>)'
   echo  '  install       - Install uncompressed kernel'
   echo  '  zinstall      - Install compressed kernel'
-  echo  '                  Install using (your) ~/bin/installkernel or'
-  echo  '                  (distribution) /sbin/installkernel or'
+  echo  '                  Install using (your) ~/bin/$(INSTALLKERNEL) or'
+  echo  '                  (distribution) /sbin/$(INSTALLKERNEL) or'
   echo  '                  install to $$(INSTALL_PATH) and run lilo'
 endef
diff --git a/arch/arm/boot/install.sh b/arch/arm/boot/install.sh
index 9f9bed20734..06ea7d42ce8 100644
--- a/arch/arm/boot/install.sh
+++ b/arch/arm/boot/install.sh
@@ -21,8 +21,8 @@
 #
 
 # User may have a custom install script
-if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi
-if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi
+if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
+if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
 
 if [ "$(basename $2)" = "zImage" ]; then
 # Compressed install
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 1a711ea8418..fd03fb63a33 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -334,14 +334,14 @@ static inline void outer_flush_range(unsigned long start, unsigned long end)
 #ifndef CONFIG_CPU_CACHE_VIPT
 static inline void flush_cache_mm(struct mm_struct *mm)
 {
-	if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
+	if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm)))
 		__cpuc_flush_user_all();
 }
 
 static inline void
 flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 {
-	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
+	if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm)))
 		__cpuc_flush_user_range(start & PAGE_MASK, PAGE_ALIGN(end),
 					vma->vm_flags);
 }
@@ -349,7 +349,7 @@ flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long
 static inline void
 flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn)
 {
-	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask)) {
+	if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
 		unsigned long addr = user_addr & PAGE_MASK;
 		__cpuc_flush_user_range(addr, addr + PAGE_SIZE, vma->vm_flags);
 	}
@@ -360,7 +360,7 @@ flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
 			 unsigned long uaddr, void *kaddr,
 			 unsigned long len, int write)
 {
-	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask)) {
+	if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
 		unsigned long addr = (unsigned long)kaddr;
 		__cpuc_coherent_kern_range(addr, addr + len);
 	}
diff --git a/arch/arm/include/asm/hardware/iop3xx-adma.h b/arch/arm/include/asm/hardware/iop3xx-adma.h
index 83e6ba338e2..1a8c7279a28 100644
--- a/arch/arm/include/asm/hardware/iop3xx-adma.h
+++ b/arch/arm/include/asm/hardware/iop3xx-adma.h
@@ -187,11 +187,74 @@ union iop3xx_desc {
 	void *ptr;
 };
 
+/* No support for p+q operations */
+static inline int
+iop_chan_pq_slot_count(size_t len, int src_cnt, int *slots_per_op)
+{
+	BUG();
+	return 0;
+}
+
+static inline void
+iop_desc_init_pq(struct iop_adma_desc_slot *desc, int src_cnt,
+		  unsigned long flags)
+{
+	BUG();
+}
+
+static inline void
+iop_desc_set_pq_addr(struct iop_adma_desc_slot *desc, dma_addr_t *addr)
+{
+	BUG();
+}
+
+static inline void
+iop_desc_set_pq_src_addr(struct iop_adma_desc_slot *desc, int src_idx,
+			 dma_addr_t addr, unsigned char coef)
+{
+	BUG();
+}
+
+static inline int
+iop_chan_pq_zero_sum_slot_count(size_t len, int src_cnt, int *slots_per_op)
+{
+	BUG();
+	return 0;
+}
+
+static inline void
+iop_desc_init_pq_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
+			  unsigned long flags)
+{
+	BUG();
+}
+
+static inline void
+iop_desc_set_pq_zero_sum_byte_count(struct iop_adma_desc_slot *desc, u32 len)
+{
+	BUG();
+}
+
+#define iop_desc_set_pq_zero_sum_src_addr iop_desc_set_pq_src_addr
+
+static inline void
+iop_desc_set_pq_zero_sum_addr(struct iop_adma_desc_slot *desc, int pq_idx,
+			      dma_addr_t *src)
+{
+	BUG();
+}
+
 static inline int iop_adma_get_max_xor(void)
 {
 	return 32;
 }
 
+static inline int iop_adma_get_max_pq(void)
+{
+	BUG();
+	return 0;
+}
+
 static inline u32 iop_chan_get_current_descriptor(struct iop_adma_chan *chan)
 {
 	int id = chan->device->id;
@@ -332,6 +395,11 @@ static inline int iop_chan_zero_sum_slot_count(size_t len, int src_cnt,
 	return slot_cnt;
 }
 
+static inline int iop_desc_is_pq(struct iop_adma_desc_slot *desc)
+{
+	return 0;
+}
+
 static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
 					struct iop_adma_chan *chan)
 {
@@ -349,6 +417,14 @@ static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
 	return 0;
 }
 
+
+static inline u32 iop_desc_get_qdest_addr(struct iop_adma_desc_slot *desc,
+					  struct iop_adma_chan *chan)
+{
+	BUG();
+	return 0;
+}
+
 static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc,
 					struct iop_adma_chan *chan)
 {
@@ -756,13 +832,14 @@ static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc,
 	hw_desc->src[0] = val;
 }
 
-static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
+static inline enum sum_check_flags
+iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
 {
 	struct iop3xx_desc_aau *hw_desc = desc->hw_desc;
 	struct iop3xx_aau_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field;
 
 	iop_paranoia(!(desc_ctrl.tx_complete && desc_ctrl.zero_result_en));
-	return desc_ctrl.zero_result_err;
+	return desc_ctrl.zero_result_err << SUM_CHECK_P;
 }
 
 static inline void iop_chan_append(struct iop_adma_chan *chan)
diff --git a/arch/arm/include/asm/hardware/iop_adma.h b/arch/arm/include/asm/hardware/iop_adma.h
index 385c6e8cbbd..59b8c3892f7 100644
--- a/arch/arm/include/asm/hardware/iop_adma.h
+++ b/arch/arm/include/asm/hardware/iop_adma.h
@@ -86,6 +86,7 @@ struct iop_adma_chan {
  * @idx: pool index
  * @unmap_src_cnt: number of xor sources
  * @unmap_len: transaction bytecount
+ * @tx_list: list of descriptors that are associated with one operation
  * @async_tx: support for the async_tx api
  * @group_list: list of slots that make up a multi-descriptor transaction
  *	for example transfer lengths larger than the supported hw max
@@ -102,10 +103,12 @@ struct iop_adma_desc_slot {
 	u16 idx;
 	u16 unmap_src_cnt;
 	size_t unmap_len;
+	struct list_head tx_list;
 	struct dma_async_tx_descriptor async_tx;
 	union {
 		u32 *xor_check_result;
 		u32 *crc32_result;
+		u32 *pq_check_result;
 	};
 };
 
diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h
index bcdb9291ef0..de6cefb329d 100644
--- a/arch/arm/include/asm/mmu_context.h
+++ b/arch/arm/include/asm/mmu_context.h
@@ -103,14 +103,15 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 
 #ifdef CONFIG_SMP
 	/* check for possible thread migration */
-	if (!cpus_empty(next->cpu_vm_mask) && !cpu_isset(cpu, next->cpu_vm_mask))
+	if (!cpumask_empty(mm_cpumask(next)) &&
+	    !cpumask_test_cpu(cpu, mm_cpumask(next)))
 		__flush_icache_all();
 #endif
-	if (!cpu_test_and_set(cpu, next->cpu_vm_mask) || prev != next) {
+	if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next) {
 		check_context(next);
 		cpu_switch_mm(next->pgd, next);
 		if (cache_is_vivt())
-			cpu_clear(cpu, prev->cpu_vm_mask);
+			cpumask_clear_cpu(cpu, mm_cpumask(prev));
 	}
 #endif
 }
diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h
index a06e735b262..e0d763be184 100644
--- a/arch/arm/include/asm/smp.h
+++ b/arch/arm/include/asm/smp.h
@@ -93,7 +93,6 @@ extern void platform_cpu_enable(unsigned int cpu);
 
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
-#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
 
 /*
  * show local interrupt info
diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index c964f3fc3bc..a45ab5dd825 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -350,7 +350,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask)) {
+	if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) {
 		if (tlb_flag(TLB_V3_FULL))
 			asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (zero) : "cc");
 		if (tlb_flag(TLB_V4_U_FULL))
@@ -388,7 +388,7 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask)) {
+	if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
 		if (tlb_flag(TLB_V3_PAGE))
 			asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (uaddr) : "cc");
 		if (tlb_flag(TLB_V4_U_PAGE))
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 3213c9382b1..c446aeff7b8 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -2,7 +2,8 @@
 # Makefile for the linux kernel.
 #
 
-AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET)
+CPPFLAGS_vmlinux.lds := -DTEXT_OFFSET=$(TEXT_OFFSET)
+AFLAGS_head.o        := -DTEXT_OFFSET=$(TEXT_OFFSET)
 
 ifdef CONFIG_DYNAMIC_FTRACE
 CFLAGS_REMOVE_ftrace.o = -pg
diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c
index 3f470866bb8..e7cbb50dc35 100644
--- a/arch/arm/kernel/init_task.c
+++ b/arch/arm/kernel/init_task.c
@@ -24,9 +24,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  *
  * The things we do for performance..
  */
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 /*
  * Initial task structure.
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index de885fd256c..e0d32770bb3 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -189,7 +189,7 @@ int __cpuexit __cpu_disable(void)
 	read_lock(&tasklist_lock);
 	for_each_process(p) {
 		if (p->mm)
-			cpu_clear(cpu, p->mm->cpu_vm_mask);
+			cpumask_clear_cpu(cpu, mm_cpumask(p->mm));
 	}
 	read_unlock(&tasklist_lock);
 
@@ -257,7 +257,7 @@ asmlinkage void __cpuinit secondary_start_kernel(void)
 	atomic_inc(&mm->mm_users);
 	atomic_inc(&mm->mm_count);
 	current->active_mm = mm;
-	cpu_set(cpu, mm->cpu_vm_mask);
+	cpumask_set_cpu(cpu, mm_cpumask(mm));
 	cpu_switch_mm(mm->pgd, mm);
 	enter_lazy_tlb(mm, current);
 	local_flush_tlb_all();
@@ -643,7 +643,7 @@ void flush_tlb_all(void)
 void flush_tlb_mm(struct mm_struct *mm)
 {
 	if (tlb_ops_need_broadcast())
-		on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, &mm->cpu_vm_mask);
+		on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, mm_cpumask(mm));
 	else
 		local_flush_tlb_mm(mm);
 }
@@ -654,7 +654,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
 		struct tlb_args ta;
 		ta.ta_vma = vma;
 		ta.ta_start = uaddr;
-		on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, &vma->vm_mm->cpu_vm_mask);
+		on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, mm_cpumask(vma->vm_mm));
 	} else
 		local_flush_tlb_page(vma, uaddr);
 }
@@ -677,7 +677,7 @@ void flush_tlb_range(struct vm_area_struct *vma,
 		ta.ta_vma = vma;
 		ta.ta_start = start;
 		ta.ta_end = end;
-		on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, &vma->vm_mm->cpu_vm_mask);
+		on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, mm_cpumask(vma->vm_mm));
 	} else
 		local_flush_tlb_range(vma, start, end);
 }
diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c
index b3ec641b5cf..78ecaac6520 100644
--- a/arch/arm/kernel/sys_arm.c
+++ b/arch/arm/kernel/sys_arm.c
@@ -25,7 +25,6 @@
 #include <linux/mman.h>
 #include <linux/fs.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/ipc.h>
 #include <linux/uaccess.h>
 
diff --git a/arch/arm/mach-at91/at91sam9263_devices.c b/arch/arm/mach-at91/at91sam9263_devices.c
index 55719a97427..fb5c23af101 100644
--- a/arch/arm/mach-at91/at91sam9263_devices.c
+++ b/arch/arm/mach-at91/at91sam9263_devices.c
@@ -757,6 +757,42 @@ void __init at91_add_device_ac97(struct ac97c_platform_data *data)
 void __init at91_add_device_ac97(struct ac97c_platform_data *data) {}
 #endif
 
+/* --------------------------------------------------------------------
+ *  CAN Controller
+ * -------------------------------------------------------------------- */
+
+#if defined(CONFIG_CAN_AT91) || defined(CONFIG_CAN_AT91_MODULE)
+static struct resource can_resources[] = {
+	[0] = {
+		.start	= AT91SAM9263_BASE_CAN,
+		.end	= AT91SAM9263_BASE_CAN + SZ_16K - 1,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= AT91SAM9263_ID_CAN,
+		.end	= AT91SAM9263_ID_CAN,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device at91sam9263_can_device = {
+	.name		= "at91_can",
+	.id		= -1,
+	.resource	= can_resources,
+	.num_resources	= ARRAY_SIZE(can_resources),
+};
+
+void __init at91_add_device_can(struct at91_can_data *data)
+{
+	at91_set_A_periph(AT91_PIN_PA13, 0);	/* CANTX */
+	at91_set_A_periph(AT91_PIN_PA14, 0);	/* CANRX */
+	at91sam9263_can_device.dev.platform_data = data;
+
+	platform_device_register(&at91sam9263_can_device);
+}
+#else
+void __init at91_add_device_can(struct at91_can_data *data) {}
+#endif
 
 /* --------------------------------------------------------------------
  *  LCD Controller
diff --git a/arch/arm/mach-at91/board-sam9263ek.c b/arch/arm/mach-at91/board-sam9263ek.c
index 26f1aa6049a..2d867fb0630 100644
--- a/arch/arm/mach-at91/board-sam9263ek.c
+++ b/arch/arm/mach-at91/board-sam9263ek.c
@@ -400,6 +400,23 @@ static struct gpio_led ek_pwm_led[] = {
 	}
 };
 
+/*
+ * CAN
+ */
+static void sam9263ek_transceiver_switch(int on)
+{
+	if (on) {
+		at91_set_gpio_output(AT91_PIN_PA18, 1); /* CANRXEN */
+		at91_set_gpio_output(AT91_PIN_PA19, 0); /* CANRS */
+	} else {
+		at91_set_gpio_output(AT91_PIN_PA18, 0); /* CANRXEN */
+		at91_set_gpio_output(AT91_PIN_PA19, 1); /* CANRS */
+	}
+}
+
+static struct at91_can_data ek_can_data = {
+	.transceiver_switch = sam9263ek_transceiver_switch,
+};
 
 static void __init ek_board_init(void)
 {
@@ -431,6 +448,8 @@ static void __init ek_board_init(void)
 	/* LEDs */
 	at91_gpio_leds(ek_leds, ARRAY_SIZE(ek_leds));
 	at91_pwm_leds(ek_pwm_led, ARRAY_SIZE(ek_pwm_led));
+	/* CAN */
+	at91_add_device_can(&ek_can_data);
 }
 
 MACHINE_START(AT91SAM9263EK, "Atmel AT91SAM9263-EK")
diff --git a/arch/arm/mach-at91/include/mach/board.h b/arch/arm/mach-at91/include/mach/board.h
index 583f38a38df..2f4fcedc02b 100644
--- a/arch/arm/mach-at91/include/mach/board.h
+++ b/arch/arm/mach-at91/include/mach/board.h
@@ -188,6 +188,12 @@ extern void __init at91_add_device_isi(void);
  /* Touchscreen Controller */
 extern void __init at91_add_device_tsadcc(void);
 
+/* CAN */
+struct at91_can_data {
+	void (*transceiver_switch)(int on);
+};
+extern void __init at91_add_device_can(struct at91_can_data *data);
+
  /* LEDs */
 extern void __init at91_init_leds(u8 cpu_led, u8 timer_led);
 extern void __init at91_gpio_leds(struct gpio_led *leds, int nr);
diff --git a/arch/arm/mach-iop13xx/include/mach/adma.h b/arch/arm/mach-iop13xx/include/mach/adma.h
index 5722e86f217..6d3782d85a9 100644
--- a/arch/arm/mach-iop13xx/include/mach/adma.h
+++ b/arch/arm/mach-iop13xx/include/mach/adma.h
@@ -150,6 +150,8 @@ static inline int iop_adma_get_max_xor(void)
 	return 16;
 }
 
+#define iop_adma_get_max_pq iop_adma_get_max_xor
+
 static inline u32 iop_chan_get_current_descriptor(struct iop_adma_chan *chan)
 {
 	return __raw_readl(ADMA_ADAR(chan));
@@ -211,7 +213,10 @@ iop_chan_xor_slot_count(size_t len, int src_cnt, int *slots_per_op)
 #define IOP_ADMA_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
 #define IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
 #define IOP_ADMA_XOR_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
+#define IOP_ADMA_PQ_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
 #define iop_chan_zero_sum_slot_count(l, s, o) iop_chan_xor_slot_count(l, s, o)
+#define iop_chan_pq_slot_count iop_chan_xor_slot_count
+#define iop_chan_pq_zero_sum_slot_count iop_chan_xor_slot_count
 
 static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
 					struct iop_adma_chan *chan)
@@ -220,6 +225,13 @@ static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
 	return hw_desc->dest_addr;
 }
 
+static inline u32 iop_desc_get_qdest_addr(struct iop_adma_desc_slot *desc,
+					  struct iop_adma_chan *chan)
+{
+	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
+	return hw_desc->q_dest_addr;
+}
+
 static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc,
 					struct iop_adma_chan *chan)
 {
@@ -319,6 +331,58 @@ iop_desc_init_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
 	return 1;
 }
 
+static inline void
+iop_desc_init_pq(struct iop_adma_desc_slot *desc, int src_cnt,
+		  unsigned long flags)
+{
+	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
+	union {
+		u32 value;
+		struct iop13xx_adma_desc_ctrl field;
+	} u_desc_ctrl;
+
+	u_desc_ctrl.value = 0;
+	u_desc_ctrl.field.src_select = src_cnt - 1;
+	u_desc_ctrl.field.xfer_dir = 3; /* local to internal bus */
+	u_desc_ctrl.field.pq_xfer_en = 1;
+	u_desc_ctrl.field.p_xfer_dis = !!(flags & DMA_PREP_PQ_DISABLE_P);
+	u_desc_ctrl.field.int_en = flags & DMA_PREP_INTERRUPT;
+	hw_desc->desc_ctrl = u_desc_ctrl.value;
+}
+
+static inline int iop_desc_is_pq(struct iop_adma_desc_slot *desc)
+{
+	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
+	union {
+		u32 value;
+		struct iop13xx_adma_desc_ctrl field;
+	} u_desc_ctrl;
+
+	u_desc_ctrl.value = hw_desc->desc_ctrl;
+	return u_desc_ctrl.field.pq_xfer_en;
+}
+
+static inline void
+iop_desc_init_pq_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
+			  unsigned long flags)
+{
+	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
+	union {
+		u32 value;
+		struct iop13xx_adma_desc_ctrl field;
+	} u_desc_ctrl;
+
+	u_desc_ctrl.value = 0;
+	u_desc_ctrl.field.src_select = src_cnt - 1;
+	u_desc_ctrl.field.xfer_dir = 3; /* local to internal bus */
+	u_desc_ctrl.field.zero_result = 1;
+	u_desc_ctrl.field.status_write_back_en = 1;
+	u_desc_ctrl.field.pq_xfer_en = 1;
+	u_desc_ctrl.field.p_xfer_dis = !!(flags & DMA_PREP_PQ_DISABLE_P);
+	u_desc_ctrl.field.int_en = flags & DMA_PREP_INTERRUPT;
+	hw_desc->desc_ctrl = u_desc_ctrl.value;
+}
+
 static inline void iop_desc_set_byte_count(struct iop_adma_desc_slot *desc,
 					struct iop_adma_chan *chan,
 					u32 byte_count)
@@ -351,6 +415,7 @@ iop_desc_set_zero_sum_byte_count(struct iop_adma_desc_slot *desc, u32 len)
 	}
 }
 
+#define iop_desc_set_pq_zero_sum_byte_count iop_desc_set_zero_sum_byte_count
 
 static inline void iop_desc_set_dest_addr(struct iop_adma_desc_slot *desc,
 					struct iop_adma_chan *chan,
@@ -361,6 +426,16 @@ static inline void iop_desc_set_dest_addr(struct iop_adma_desc_slot *desc,
 	hw_desc->upper_dest_addr = 0;
 }
 
+static inline void
+iop_desc_set_pq_addr(struct iop_adma_desc_slot *desc, dma_addr_t *addr)
+{
+	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
+
+	hw_desc->dest_addr = addr[0];
+	hw_desc->q_dest_addr = addr[1];
+	hw_desc->upper_dest_addr = 0;
+}
+
 static inline void iop_desc_set_memcpy_src_addr(struct iop_adma_desc_slot *desc,
 					dma_addr_t addr)
 {
@@ -389,6 +464,29 @@ static inline void iop_desc_set_xor_src_addr(struct iop_adma_desc_slot *desc,
 }
 
 static inline void
+iop_desc_set_pq_src_addr(struct iop_adma_desc_slot *desc, int src_idx,
+			 dma_addr_t addr, unsigned char coef)
+{
+	int slot_cnt = desc->slot_cnt, slots_per_op = desc->slots_per_op;
+	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc, *iter;
+	struct iop13xx_adma_src *src;
+	int i = 0;
+
+	do {
+		iter = iop_hw_desc_slot_idx(hw_desc, i);
+		src = &iter->src[src_idx];
+		src->src_addr = addr;
+		src->pq_upper_src_addr = 0;
+		src->pq_dmlt = coef;
+		slot_cnt -= slots_per_op;
+		if (slot_cnt) {
+			i += slots_per_op;
+			addr += IOP_ADMA_PQ_MAX_BYTE_COUNT;
+		}
+	} while (slot_cnt);
+}
+
+static inline void
 iop_desc_init_interrupt(struct iop_adma_desc_slot *desc,
 	struct iop_adma_chan *chan)
 {
@@ -399,6 +497,15 @@ iop_desc_init_interrupt(struct iop_adma_desc_slot *desc,
 }
 
 #define iop_desc_set_zero_sum_src_addr iop_desc_set_xor_src_addr
+#define iop_desc_set_pq_zero_sum_src_addr iop_desc_set_pq_src_addr
+
+static inline void
+iop_desc_set_pq_zero_sum_addr(struct iop_adma_desc_slot *desc, int pq_idx,
+			      dma_addr_t *src)
+{
+	iop_desc_set_xor_src_addr(desc, pq_idx, src[pq_idx]);
+	iop_desc_set_xor_src_addr(desc, pq_idx+1, src[pq_idx+1]);
+}
 
 static inline void iop_desc_set_next_desc(struct iop_adma_desc_slot *desc,
 					u32 next_desc_addr)
@@ -428,18 +535,20 @@ static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc,
 	hw_desc->block_fill_data = val;
 }
 
-static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
+static inline enum sum_check_flags
+iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
 {
 	struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
 	struct iop13xx_adma_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field;
 	struct iop13xx_adma_byte_count byte_count = hw_desc->byte_count_field;
+	enum sum_check_flags flags;
 
 	BUG_ON(!(byte_count.tx_complete && desc_ctrl.zero_result));
 
-	if (desc_ctrl.pq_xfer_en)
-		return byte_count.zero_result_err_q;
-	else
-		return byte_count.zero_result_err;
+	flags = byte_count.zero_result_err_q << SUM_CHECK_Q;
+	flags |= byte_count.zero_result_err << SUM_CHECK_P;
+
+	return flags;
 }
 
 static inline void iop_chan_append(struct iop_adma_chan *chan)
diff --git a/arch/arm/mach-iop13xx/setup.c b/arch/arm/mach-iop13xx/setup.c
index bee42c609df..5c147fb66a0 100644
--- a/arch/arm/mach-iop13xx/setup.c
+++ b/arch/arm/mach-iop13xx/setup.c
@@ -477,10 +477,8 @@ void __init iop13xx_platform_init(void)
 			plat_data = &iop13xx_adma_0_data;
 			dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask);
+			dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
 			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
-			dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
 			break;
 		case IOP13XX_INIT_ADMA_1:
@@ -489,10 +487,8 @@ void __init iop13xx_platform_init(void)
 			plat_data = &iop13xx_adma_1_data;
 			dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask);
+			dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
 			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
-			dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
 			break;
 		case IOP13XX_INIT_ADMA_2:
@@ -501,14 +497,11 @@ void __init iop13xx_platform_init(void)
 			plat_data = &iop13xx_adma_2_data;
 			dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask);
+			dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
 			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
-			dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
-			dma_cap_set(DMA_PQ_XOR, plat_data->cap_mask);
-			dma_cap_set(DMA_PQ_UPDATE, plat_data->cap_mask);
-			dma_cap_set(DMA_PQ_ZERO_SUM, plat_data->cap_mask);
+			dma_cap_set(DMA_PQ, plat_data->cap_mask);
+			dma_cap_set(DMA_PQ_VAL, plat_data->cap_mask);
 			break;
 		}
 	}
diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c
index fc84fcc7438..6bda76a4319 100644
--- a/arch/arm/mm/context.c
+++ b/arch/arm/mm/context.c
@@ -59,6 +59,6 @@ void __new_context(struct mm_struct *mm)
 	}
 	spin_unlock(&cpu_asid_lock);
 
-	mm->cpu_vm_mask = cpumask_of_cpu(smp_processor_id());
+	cpumask_copy(mm_cpumask(mm), cpumask_of(smp_processor_id()));
 	mm->context.id = asid;
 }
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 575f3ad722e..b27942909b2 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -50,7 +50,7 @@ static void flush_pfn_alias(unsigned long pfn, unsigned long vaddr)
 void flush_cache_mm(struct mm_struct *mm)
 {
 	if (cache_is_vivt()) {
-		if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
+		if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm)))
 			__cpuc_flush_user_all();
 		return;
 	}
@@ -73,7 +73,7 @@ void flush_cache_mm(struct mm_struct *mm)
 void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 {
 	if (cache_is_vivt()) {
-		if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
+		if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm)))
 			__cpuc_flush_user_range(start & PAGE_MASK, PAGE_ALIGN(end),
 						vma->vm_flags);
 		return;
@@ -97,7 +97,7 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned
 void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn)
 {
 	if (cache_is_vivt()) {
-		if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask)) {
+		if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
 			unsigned long addr = user_addr & PAGE_MASK;
 			__cpuc_flush_user_range(addr, addr + PAGE_SIZE, vma->vm_flags);
 		}
@@ -113,7 +113,7 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
 			 unsigned long len, int write)
 {
 	if (cache_is_vivt()) {
-		if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask)) {
+		if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
 			unsigned long addr = (unsigned long)kaddr;
 			__cpuc_coherent_kern_range(addr, addr + len);
 		}
@@ -126,7 +126,7 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
 	}
 
 	/* VIPT non-aliasing cache */
-	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask) &&
+	if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm)) &&
 	    vma->vm_flags & VM_EXEC) {
 		unsigned long addr = (unsigned long)kaddr;
 		/* only flushing the kernel mapping on non-aliasing VIPT */
diff --git a/arch/arm/plat-iop/adma.c b/arch/arm/plat-iop/adma.c
index 3c127aabe21..1ff6a37e893 100644
--- a/arch/arm/plat-iop/adma.c
+++ b/arch/arm/plat-iop/adma.c
@@ -179,7 +179,6 @@ static int __init iop3xx_adma_cap_init(void)
 	dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask);
 	#else
 	dma_cap_set(DMA_MEMCPY, iop3xx_dma_0_data.cap_mask);
-	dma_cap_set(DMA_MEMCPY_CRC32C, iop3xx_dma_0_data.cap_mask);
 	dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask);
 	#endif
 
@@ -188,7 +187,6 @@ static int __init iop3xx_adma_cap_init(void)
 	dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask);
 	#else
 	dma_cap_set(DMA_MEMCPY, iop3xx_dma_1_data.cap_mask);
-	dma_cap_set(DMA_MEMCPY_CRC32C, iop3xx_dma_1_data.cap_mask);
 	dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask);
 	#endif
 
@@ -198,7 +196,7 @@ static int __init iop3xx_adma_cap_init(void)
 	dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask);
 	#else
 	dma_cap_set(DMA_XOR, iop3xx_aau_data.cap_mask);
-	dma_cap_set(DMA_ZERO_SUM, iop3xx_aau_data.cap_mask);
+	dma_cap_set(DMA_XOR_VAL, iop3xx_aau_data.cap_mask);
 	dma_cap_set(DMA_MEMSET, iop3xx_aau_data.cap_mask);
 	dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask);
 	#endif
diff --git a/arch/avr32/kernel/init_task.c b/arch/avr32/kernel/init_task.c
index 57ec9f2dcd9..6b2343e6fe3 100644
--- a/arch/avr32/kernel/init_task.c
+++ b/arch/avr32/kernel/init_task.c
@@ -18,9 +18,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 /*
  * Initial thread structure. Must be aligned on an 8192-byte boundary.
  */
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 /*
  * Initial task structure.
diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c
index 376f18c4a6c..94925641e53 100644
--- a/arch/avr32/mm/init.c
+++ b/arch/avr32/mm/init.c
@@ -24,11 +24,9 @@
 #include <asm/setup.h>
 #include <asm/sections.h>
 
-#define __page_aligned	__attribute__((section(".data.page_aligned")))
-
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
-pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned;
+pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_data;
 
 struct page *empty_zero_page;
 EXPORT_SYMBOL(empty_zero_page);
diff --git a/arch/blackfin/Makefile b/arch/blackfin/Makefile
index 6f9533c3d75..f063b772934 100644
--- a/arch/blackfin/Makefile
+++ b/arch/blackfin/Makefile
@@ -155,7 +155,7 @@ define archhelp
   echo  '* vmImage.gz      - Kernel-only image for U-Boot (arch/$(ARCH)/boot/vmImage.gz)'
   echo  '  vmImage.lzma    - Kernel-only image for U-Boot (arch/$(ARCH)/boot/vmImage.lzma)'
   echo  '  install         - Install kernel using'
-  echo  '                     (your) ~/bin/$(CROSS_COMPILE)installkernel or'
-  echo  '                     (distribution) PATH: $(CROSS_COMPILE)installkernel or'
+  echo  '                     (your) ~/bin/$(INSTALLKERNEL) or'
+  echo  '                     (distribution) PATH: $(INSTALLKERNEL) or'
   echo  '                     install to $$(INSTALL_PATH)'
 endef
diff --git a/arch/blackfin/boot/install.sh b/arch/blackfin/boot/install.sh
index 9560a6b2910..e2c6e40902b 100644
--- a/arch/blackfin/boot/install.sh
+++ b/arch/blackfin/boot/install.sh
@@ -36,9 +36,9 @@ verify "$3"
 
 # User may have a custom install script
 
-if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi
-if which ${CROSS_COMPILE}installkernel >/dev/null 2>&1; then
-	exec ${CROSS_COMPILE}installkernel "$@"
+if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
+if which ${INSTALLKERNEL} >/dev/null 2>&1; then
+	exec ${INSTALLKERNEL} "$@"
 fi
 
 # Default install - same as make zlilo
diff --git a/arch/cris/Makefile b/arch/cris/Makefile
index 71e17d3eedd..29c2ceb38a7 100644
--- a/arch/cris/Makefile
+++ b/arch/cris/Makefile
@@ -42,8 +42,6 @@ LD = $(CROSS_COMPILE)ld -mcrislinux
 
 OBJCOPYFLAGS := -O binary -R .note -R .comment -S
 
-CPPFLAGS_vmlinux.lds = -DDRAM_VIRTUAL_BASE=0x$(CONFIG_ETRAX_DRAM_VIRTUAL_BASE)
-
 KBUILD_AFLAGS += -mlinux -march=$(arch-y) $(inc)
 KBUILD_CFLAGS += -mlinux -march=$(arch-y) -pipe $(inc)
 KBUILD_CPPFLAGS += $(inc)
diff --git a/arch/cris/kernel/Makefile b/arch/cris/kernel/Makefile
index ee7bcd4d20b..b45640b3e60 100644
--- a/arch/cris/kernel/Makefile
+++ b/arch/cris/kernel/Makefile
@@ -3,6 +3,7 @@
 # Makefile for the linux kernel.
 #
 
+CPPFLAGS_vmlinux.lds := -DDRAM_VIRTUAL_BASE=0x$(CONFIG_ETRAX_DRAM_VIRTUAL_BASE)
 extra-y	:= vmlinux.lds
 
 obj-y   := process.o traps.o irq.o ptrace.o setup.o time.o sys_cris.o
diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c
index 51dcd04d277..c99aeab7cef 100644
--- a/arch/cris/kernel/process.c
+++ b/arch/cris/kernel/process.c
@@ -45,9 +45,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union 
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 /*
  * Initial task structure.
diff --git a/arch/frv/kernel/init_task.c b/arch/frv/kernel/init_task.c
index 1d3df1d9495..3c3e0b336a9 100644
--- a/arch/frv/kernel/init_task.c
+++ b/arch/frv/kernel/init_task.c
@@ -19,9 +19,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 /*
  * Initial task structure.
diff --git a/arch/frv/kernel/pm.c b/arch/frv/kernel/pm.c
index be722fc1acf..0d4d3e3a4cf 100644
--- a/arch/frv/kernel/pm.c
+++ b/arch/frv/kernel/pm.c
@@ -150,7 +150,7 @@ static int user_atoi(char __user *ubuf, size_t len)
 /*
  * Send us to sleep.
  */
-static int sysctl_pm_do_suspend(ctl_table *ctl, int write, struct file *filp,
+static int sysctl_pm_do_suspend(ctl_table *ctl, int write,
 				void __user *buffer, size_t *lenp, loff_t *fpos)
 {
 	int retval, mode;
@@ -198,13 +198,13 @@ static int try_set_cmode(int new_cmode)
 }
 
 
-static int cmode_procctl(ctl_table *ctl, int write, struct file *filp,
+static int cmode_procctl(ctl_table *ctl, int write,
 			 void __user *buffer, size_t *lenp, loff_t *fpos)
 {
 	int new_cmode;
 
 	if (!write)
-		return proc_dointvec(ctl, write, filp, buffer, lenp, fpos);
+		return proc_dointvec(ctl, write, buffer, lenp, fpos);
 
 	new_cmode = user_atoi(buffer, *lenp);
 
@@ -301,13 +301,13 @@ static int try_set_cm(int new_cm)
 	return 0;
 }
 
-static int p0_procctl(ctl_table *ctl, int write, struct file *filp,
+static int p0_procctl(ctl_table *ctl, int write,
 		      void __user *buffer, size_t *lenp, loff_t *fpos)
 {
 	int new_p0;
 
 	if (!write)
-		return proc_dointvec(ctl, write, filp, buffer, lenp, fpos);
+		return proc_dointvec(ctl, write, buffer, lenp, fpos);
 
 	new_p0 = user_atoi(buffer, *lenp);
 
@@ -345,13 +345,13 @@ static int p0_sysctl(ctl_table *table,
 	return 1;
 }
 
-static int cm_procctl(ctl_table *ctl, int write, struct file *filp,
+static int cm_procctl(ctl_table *ctl, int write,
 		      void __user *buffer, size_t *lenp, loff_t *fpos)
 {
 	int new_cm;
 
 	if (!write)
-		return proc_dointvec(ctl, write, filp, buffer, lenp, fpos);
+		return proc_dointvec(ctl, write, buffer, lenp, fpos);
 
 	new_cm = user_atoi(buffer, *lenp);
 
diff --git a/arch/frv/kernel/sys_frv.c b/arch/frv/kernel/sys_frv.c
index baadc97f862..2b6b5289cdc 100644
--- a/arch/frv/kernel/sys_frv.c
+++ b/arch/frv/kernel/sys_frv.c
@@ -21,7 +21,6 @@
 #include <linux/stat.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/syscalls.h>
 #include <linux/ipc.h>
 
diff --git a/arch/h8300/kernel/init_task.c b/arch/h8300/kernel/init_task.c
index 089c65ed6eb..54c1062ee80 100644
--- a/arch/h8300/kernel/init_task.c
+++ b/arch/h8300/kernel/init_task.c
@@ -31,7 +31,6 @@ EXPORT_SYMBOL(init_task);
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
diff --git a/arch/h8300/kernel/sys_h8300.c b/arch/h8300/kernel/sys_h8300.c
index 2745656dcc5..8cb5d73a0e3 100644
--- a/arch/h8300/kernel/sys_h8300.c
+++ b/arch/h8300/kernel/sys_h8300.c
@@ -17,7 +17,6 @@
 #include <linux/syscalls.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/fs.h>
 #include <linux/ipc.h>
 
diff --git a/arch/ia64/include/asm/smp.h b/arch/ia64/include/asm/smp.h
index d217d1d4e05..0b3b3997dec 100644
--- a/arch/ia64/include/asm/smp.h
+++ b/arch/ia64/include/asm/smp.h
@@ -127,7 +127,6 @@ extern int is_multithreading_enabled(void);
 
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
-#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
 
 #else /* CONFIG_SMP */
 
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index d0141fbf51d..3ddb4e709db 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -33,7 +33,6 @@
 /*
  * Returns a bitmask of CPUs on Node 'node'.
  */
-#define node_to_cpumask(node) (node_to_cpu_mask[node])
 #define cpumask_of_node(node) (&node_to_cpu_mask[node])
 
 /*
@@ -104,8 +103,6 @@ void build_cpu_to_node_map(void);
 #ifdef CONFIG_SMP
 #define topology_physical_package_id(cpu)	(cpu_data(cpu)->socket_id)
 #define topology_core_id(cpu)			(cpu_data(cpu)->core_id)
-#define topology_core_siblings(cpu)		(cpu_core_map[cpu])
-#define topology_thread_siblings(cpu)		(per_cpu(cpu_sibling_map, cpu))
 #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
 #define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
 #define smt_capable() 				(smp_num_siblings > 1)
diff --git a/arch/ia64/install.sh b/arch/ia64/install.sh
index 929e780026d..0e932f5dcd1 100644
--- a/arch/ia64/install.sh
+++ b/arch/ia64/install.sh
@@ -21,8 +21,8 @@
 
 # User may have a custom install script
 
-if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi
-if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi
+if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
+if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
 
 # Default install - same as make zlilo
 
diff --git a/arch/ia64/kernel/Makefile.gate b/arch/ia64/kernel/Makefile.gate
index 1d87f84069b..ab9b03a9adc 100644
--- a/arch/ia64/kernel/Makefile.gate
+++ b/arch/ia64/kernel/Makefile.gate
@@ -10,7 +10,7 @@ quiet_cmd_gate = GATE $@
       cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
 
 GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \
-		     $(call ld-option, -Wl$(comma)--hash-style=sysv)
+		     $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
 $(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
 	$(call if_changed,gate)
 
diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c
index c475fc281be..e253ab8fcbc 100644
--- a/arch/ia64/kernel/init_task.c
+++ b/arch/ia64/kernel/init_task.c
@@ -33,7 +33,8 @@ union {
 		struct thread_info thread_info;
 	} s;
 	unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)];
-} init_task_mem asm ("init_task") __attribute__((section(".data.init_task"))) = {{
+} init_task_mem asm ("init_task") __init_task_data =
+	{{
 	.task =		INIT_TASK(init_task_mem.s.task),
 	.thread_info =	INIT_THREAD_INFO(init_task_mem.s.task)
 }};
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index 93ebfea43c6..dabeefe2113 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -302,7 +302,7 @@ smp_flush_tlb_mm (struct mm_struct *mm)
 		return;
 	}
 
-	smp_call_function_mask(mm->cpu_vm_mask,
+	smp_call_function_many(mm_cpumask(mm),
 		(void (*)(void *))local_finish_flush_tlb_mm, mm, 1);
 	local_irq_disable();
 	local_finish_flush_tlb_mm(mm);
diff --git a/arch/m32r/boot/compressed/install.sh b/arch/m32r/boot/compressed/install.sh
index 6d72e9e7269..16e5a0a1343 100644
--- a/arch/m32r/boot/compressed/install.sh
+++ b/arch/m32r/boot/compressed/install.sh
@@ -24,8 +24,8 @@
 
 # User may have a custom install script
 
-if [ -x /sbin/installkernel ]; then
-  exec /sbin/installkernel "$@"
+if [ -x /sbin/${INSTALLKERNEL} ]; then
+  exec /sbin/${INSTALLKERNEL} "$@"
 fi
 
 if [ "$2" = "zImage" ]; then
diff --git a/arch/m32r/include/asm/mmu_context.h b/arch/m32r/include/asm/mmu_context.h
index 91909e5dd9d..a70a3df3363 100644
--- a/arch/m32r/include/asm/mmu_context.h
+++ b/arch/m32r/include/asm/mmu_context.h
@@ -127,7 +127,7 @@ static inline void switch_mm(struct mm_struct *prev,
 
 	if (prev != next) {
 #ifdef CONFIG_SMP
-		cpu_set(cpu, next->cpu_vm_mask);
+		cpumask_set_cpu(cpu, mm_cpumask(next));
 #endif /* CONFIG_SMP */
 		/* Set MPTB = next->pgd */
 		*(volatile unsigned long *)MPTB = (unsigned long)next->pgd;
@@ -135,7 +135,7 @@ static inline void switch_mm(struct mm_struct *prev,
 	}
 #ifdef CONFIG_SMP
 	else
-		if (!cpu_test_and_set(cpu, next->cpu_vm_mask))
+		if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)))
 			activate_context(next);
 #endif /* CONFIG_SMP */
 }
diff --git a/arch/m32r/include/asm/smp.h b/arch/m32r/include/asm/smp.h
index b96a6d2ffbc..e67ded1aab9 100644
--- a/arch/m32r/include/asm/smp.h
+++ b/arch/m32r/include/asm/smp.h
@@ -88,7 +88,7 @@ extern void smp_send_timer(void);
 extern unsigned long send_IPI_mask_phys(cpumask_t, int, int);
 
 extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi(cpumask_t mask);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 
 #endif	/* not __ASSEMBLY__ */
 
diff --git a/arch/m32r/kernel/init_task.c b/arch/m32r/kernel/init_task.c
index fce57e5d3f9..6c42d5f8df5 100644
--- a/arch/m32r/kernel/init_task.c
+++ b/arch/m32r/kernel/init_task.c
@@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 /*
  * Initial task structure.
diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c
index 929e5c9d3ad..1b7598e6f6e 100644
--- a/arch/m32r/kernel/smp.c
+++ b/arch/m32r/kernel/smp.c
@@ -85,7 +85,7 @@ void smp_ipi_timer_interrupt(struct pt_regs *);
 void smp_local_timer_interrupt(void);
 
 static void send_IPI_allbutself(int, int);
-static void send_IPI_mask(cpumask_t, int, int);
+static void send_IPI_mask(const struct cpumask *, int, int);
 unsigned long send_IPI_mask_phys(cpumask_t, int, int);
 
 /*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*/
@@ -113,7 +113,7 @@ unsigned long send_IPI_mask_phys(cpumask_t, int, int);
 void smp_send_reschedule(int cpu_id)
 {
 	WARN_ON(cpu_is_offline(cpu_id));
-	send_IPI_mask(cpumask_of_cpu(cpu_id), RESCHEDULE_IPI, 1);
+	send_IPI_mask(cpumask_of(cpu_id), RESCHEDULE_IPI, 1);
 }
 
 /*==========================================================================*
@@ -168,7 +168,7 @@ void smp_flush_cache_all(void)
 	spin_lock(&flushcache_lock);
 	mask=cpus_addr(cpumask);
 	atomic_set_mask(*mask, (atomic_t *)&flushcache_cpumask);
-	send_IPI_mask(cpumask, INVALIDATE_CACHE_IPI, 0);
+	send_IPI_mask(&cpumask, INVALIDATE_CACHE_IPI, 0);
 	_flush_cache_copyback_all();
 	while (flushcache_cpumask)
 		mb();
@@ -264,7 +264,7 @@ void smp_flush_tlb_mm(struct mm_struct *mm)
 	preempt_disable();
 	cpu_id = smp_processor_id();
 	mmc = &mm->context[cpu_id];
-	cpu_mask = mm->cpu_vm_mask;
+	cpu_mask = *mm_cpumask(mm);
 	cpu_clear(cpu_id, cpu_mask);
 
 	if (*mmc != NO_CONTEXT) {
@@ -273,7 +273,7 @@ void smp_flush_tlb_mm(struct mm_struct *mm)
 		if (mm == current->mm)
 			activate_context(mm);
 		else
-			cpu_clear(cpu_id, mm->cpu_vm_mask);
+			cpumask_clear_cpu(cpu_id, mm_cpumask(mm));
 		local_irq_restore(flags);
 	}
 	if (!cpus_empty(cpu_mask))
@@ -334,7 +334,7 @@ void smp_flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 	preempt_disable();
 	cpu_id = smp_processor_id();
 	mmc = &mm->context[cpu_id];
-	cpu_mask = mm->cpu_vm_mask;
+	cpu_mask = *mm_cpumask(mm);
 	cpu_clear(cpu_id, cpu_mask);
 
 #ifdef DEBUG_SMP
@@ -424,7 +424,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 	 * We have to send the IPI only to
 	 * CPUs affected.
 	 */
-	send_IPI_mask(cpumask, INVALIDATE_TLB_IPI, 0);
+	send_IPI_mask(&cpumask, INVALIDATE_TLB_IPI, 0);
 
 	while (!cpus_empty(flush_cpumask)) {
 		/* nothing. lockup detection does not belong here */
@@ -469,7 +469,7 @@ void smp_invalidate_interrupt(void)
 		if (flush_mm == current->active_mm)
 			activate_context(flush_mm);
 		else
-			cpu_clear(cpu_id, flush_mm->cpu_vm_mask);
+			cpumask_clear_cpu(cpu_id, mm_cpumask(flush_mm));
 	} else {
 		unsigned long va = flush_va;
 
@@ -546,14 +546,14 @@ static void stop_this_cpu(void *dummy)
 	for ( ; ; );
 }
 
-void arch_send_call_function_ipi(cpumask_t mask)
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 {
 	send_IPI_mask(mask, CALL_FUNCTION_IPI, 0);
 }
 
 void arch_send_call_function_single_ipi(int cpu)
 {
-	send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_IPI, 0);
+	send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_IPI, 0);
 }
 
 /*==========================================================================*
@@ -729,7 +729,7 @@ static void send_IPI_allbutself(int ipi_num, int try)
 	cpumask = cpu_online_map;
 	cpu_clear(smp_processor_id(), cpumask);
 
-	send_IPI_mask(cpumask, ipi_num, try);
+	send_IPI_mask(&cpumask, ipi_num, try);
 }
 
 /*==========================================================================*
@@ -752,7 +752,7 @@ static void send_IPI_allbutself(int ipi_num, int try)
  * ---------- --- --------------------------------------------------------
  *
  *==========================================================================*/
-static void send_IPI_mask(cpumask_t cpumask, int ipi_num, int try)
+static void send_IPI_mask(const struct cpumask *cpumask, int ipi_num, int try)
 {
 	cpumask_t physid_mask, tmp;
 	int cpu_id, phys_id;
@@ -761,11 +761,11 @@ static void send_IPI_mask(cpumask_t cpumask, int ipi_num, int try)
 	if (num_cpus <= 1)	/* NO MP */
 		return;
 
-	cpus_and(tmp, cpumask, cpu_online_map);
-	BUG_ON(!cpus_equal(cpumask, tmp));
+	cpumask_and(&tmp, cpumask, cpu_online_mask);
+	BUG_ON(!cpumask_equal(cpumask, &tmp));
 
 	physid_mask = CPU_MASK_NONE;
-	for_each_cpu_mask(cpu_id, cpumask){
+	for_each_cpu(cpu_id, cpumask) {
 		if ((phys_id = cpu_to_physid(cpu_id)) != -1)
 			cpu_set(phys_id, physid_mask);
 	}
diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c
index 655ea1c47a0..e034844cfc0 100644
--- a/arch/m32r/kernel/smpboot.c
+++ b/arch/m32r/kernel/smpboot.c
@@ -178,7 +178,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	for (phys_id = 0 ; phys_id < nr_cpu ; phys_id++)
 		physid_set(phys_id, phys_cpu_present_map);
 #ifndef CONFIG_HOTPLUG_CPU
-	cpu_present_map = cpu_possible_map;
+	init_cpu_present(&cpu_possible_map);
 #endif
 
 	show_mp_info(nr_cpu);
diff --git a/arch/m68k/install.sh b/arch/m68k/install.sh
index 9c6bae6112e..57d640d4382 100644
--- a/arch/m68k/install.sh
+++ b/arch/m68k/install.sh
@@ -33,8 +33,8 @@ verify "$3"
 
 # User may have a custom install script
 
-if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi
-if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi
+if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
+if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
 
 # Default install - same as make zlilo
 
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 72bad65dba3..41230c595a8 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -42,9 +42,9 @@
  */
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-union thread_union init_thread_union
-__attribute__((section(".data.init_task"), aligned(THREAD_SIZE)))
-       = { INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data
+	__attribute__((aligned(THREAD_SIZE))) =
+		{ INIT_THREAD_INFO(init_task) };
 
 /* initial task structure */
 struct task_struct init_task = INIT_TASK(init_task);
diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c
index 7f54efaf60b..7deb402bfc7 100644
--- a/arch/m68k/kernel/sys_m68k.c
+++ b/arch/m68k/kernel/sys_m68k.c
@@ -20,7 +20,6 @@
 #include <linux/syscalls.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/ipc.h>
 
 #include <asm/setup.h>
diff --git a/arch/m68knommu/kernel/init_task.c b/arch/m68knommu/kernel/init_task.c
index 45e97a207fe..cbf9dc3cc51 100644
--- a/arch/m68knommu/kernel/init_task.c
+++ b/arch/m68knommu/kernel/init_task.c
@@ -31,7 +31,6 @@ EXPORT_SYMBOL(init_task);
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
diff --git a/arch/m68knommu/kernel/sys_m68k.c b/arch/m68knommu/kernel/sys_m68k.c
index 70028163862..efdd090778a 100644
--- a/arch/m68knommu/kernel/sys_m68k.c
+++ b/arch/m68knommu/kernel/sys_m68k.c
@@ -17,7 +17,6 @@
 #include <linux/syscalls.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/ipc.h>
 #include <linux/fs.h>
 
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 2db722d80d4..bbd8327f189 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -6,6 +6,7 @@ mainmenu "Linux/Microblaze Kernel Configuration"
 config MICROBLAZE
 	def_bool y
 	select HAVE_LMB
+	select USB_ARCH_HAS_EHCI
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 
 config SWAP
diff --git a/arch/microblaze/Makefile b/arch/microblaze/Makefile
index 8439598d465..34187354304 100644
--- a/arch/microblaze/Makefile
+++ b/arch/microblaze/Makefile
@@ -37,12 +37,12 @@ CPUFLAGS-$(CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR) += -mxl-pattern-compare
 CPUFLAGS-1 += $(call cc-option,-mcpu=v$(CPU_VER))
 
 # r31 holds current when in kernel mode
-KBUILD_KERNEL += -ffixed-r31 $(CPUFLAGS-1) $(CPUFLAGS-2)
+KBUILD_CFLAGS += -ffixed-r31 $(CPUFLAGS-1) $(CPUFLAGS-2)
 
 LDFLAGS		:=
 LDFLAGS_vmlinux	:=
 
-LIBGCC := $(shell $(CC) $(KBUILD_KERNEL) -print-libgcc-file-name)
+LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
 
 head-y := arch/microblaze/kernel/head.o
 libs-y += arch/microblaze/lib/
@@ -53,22 +53,41 @@ core-y += arch/microblaze/platform/
 
 boot := arch/microblaze/boot
 
+# Are we making a simpleImage.<boardname> target? If so, crack out the boardname
+DTB:=$(subst simpleImage.,,$(filter simpleImage.%, $(MAKECMDGOALS)))
+
+ifneq ($(DTB),)
+	core-y	+= $(boot)/
+endif
+
 # defines filename extension depending memory management type
 ifeq ($(CONFIG_MMU),)
 MMU := -nommu
 endif
 
-export MMU
+export MMU DTB
 
 all: linux.bin
 
+BOOT_TARGETS = linux.bin linux.bin.gz simpleImage.%
+
 archclean:
 	$(Q)$(MAKE) $(clean)=$(boot)
 
-linux.bin linux.bin.gz: vmlinux
+$(BOOT_TARGETS): vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
 
 define archhelp
-  echo  '* linux.bin    - Create raw binary'
-  echo  '  linux.bin.gz - Create compressed raw binary'
+  echo '* linux.bin    - Create raw binary'
+  echo '  linux.bin.gz - Create compressed raw binary'
+  echo '  simpleImage.<dt> - ELF image with $(arch)/boot/dts/<dt>.dts linked in'
+  echo '                   - stripped elf with fdt blob
+  echo '  simpleImage.<dt>.unstrip - full ELF image with fdt blob'
+  echo '  *_defconfig      - Select default config from arch/microblaze/configs'
+  echo ''
+  echo '  Targets with <dt> embed a device tree blob inside the image'
+  echo '  These targets support board with firmware that does not'
+  echo '  support passing a device tree directly. Replace <dt> with the'
+  echo '  name of a dts file from the arch/microblaze/boot/dts/ directory'
+  echo '  (minus the .dts extension).'
 endef
diff --git a/arch/microblaze/boot/Makefile b/arch/microblaze/boot/Makefile
index c2bb043a029..21f13322a4c 100644
--- a/arch/microblaze/boot/Makefile
+++ b/arch/microblaze/boot/Makefile
@@ -2,10 +2,24 @@
 # arch/microblaze/boot/Makefile
 #
 
-targets := linux.bin linux.bin.gz
+obj-y += linked_dtb.o
+
+targets := linux.bin linux.bin.gz simpleImage.%
 
 OBJCOPYFLAGS_linux.bin  := -O binary
 
+# Where the DTS files live
+dtstree         := $(srctree)/$(src)/dts
+
+# Ensure system.dtb exists
+$(obj)/linked_dtb.o: $(obj)/system.dtb
+
+# Generate system.dtb from $(DTB).dtb
+ifneq ($(DTB),system)
+$(obj)/system.dtb: $(obj)/$(DTB).dtb
+	$(call if_changed,cp)
+endif
+
 $(obj)/linux.bin: vmlinux FORCE
 	[ -n $(CONFIG_INITRAMFS_SOURCE) ] && [ ! -e $(CONFIG_INITRAMFS_SOURCE) ] && \
 	touch $(CONFIG_INITRAMFS_SOURCE) || echo "No CPIO image"
@@ -16,4 +30,27 @@ $(obj)/linux.bin.gz: $(obj)/linux.bin FORCE
 	$(call if_changed,gzip)
 	@echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
 
-clean-kernel += linux.bin linux.bin.gz
+quiet_cmd_cp = CP      $< $@$2
+	cmd_cp = cat $< >$@$2 || (rm -f $@ && echo false)
+
+quiet_cmd_strip = STRIP   $@
+      cmd_strip = $(STRIP) -K _start -K _end -K __log_buf -K _fdt_start vmlinux -o $@
+
+$(obj)/simpleImage.%: vmlinux FORCE
+	$(call if_changed,cp,.unstrip)
+	$(call if_changed,strip)
+	@echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
+
+# Rule to build device tree blobs
+DTC = $(objtree)/scripts/dtc/dtc
+
+# Rule to build device tree blobs
+quiet_cmd_dtc = DTC     $@
+	cmd_dtc = $(DTC) -O dtb -o $(obj)/$*.dtb -b 0 -p 1024 $(dtstree)/$*.dts
+
+$(obj)/%.dtb: $(dtstree)/%.dts FORCE
+	$(call if_changed,dtc)
+
+clean-kernel += linux.bin linux.bin.gz simpleImage.*
+
+clean-files += *.dtb
diff --git a/arch/microblaze/boot/dts/system.dts b/arch/microblaze/boot/dts/system.dts
new file mode 120000
index 00000000000..7cb657892f2
--- /dev/null
+++ b/arch/microblaze/boot/dts/system.dts
@@ -0,0 +1 @@
+../../platform/generic/system.dts
+\ No newline at end of file
diff --git a/arch/microblaze/boot/linked_dtb.S b/arch/microblaze/boot/linked_dtb.S
new file mode 100644
index 00000000000..cb2b537aebe
--- /dev/null
+++ b/arch/microblaze/boot/linked_dtb.S
@@ -0,0 +1,3 @@
+.section __fdt_blob,"a"
+.incbin "arch/microblaze/boot/system.dtb"
+
diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig
index 09c32962b66..bb7c374713a 100644
--- a/arch/microblaze/configs/mmu_defconfig
+++ b/arch/microblaze/configs/mmu_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.31-rc6
-# Tue Aug 18 11:00:02 2009
+# Linux kernel version: 2.6.31
+# Thu Sep 24 10:28:50 2009
 #
 CONFIG_MICROBLAZE=y
 # CONFIG_SWAP is not set
@@ -42,11 +42,12 @@ CONFIG_SYSVIPC_SYSCTL=y
 #
 # RCU Subsystem
 #
-CONFIG_CLASSIC_RCU=y
-# CONFIG_TREE_RCU is not set
-# CONFIG_PREEMPT_RCU is not set
+CONFIG_TREE_RCU=y
+# CONFIG_TREE_PREEMPT_RCU is not set
+# CONFIG_RCU_TRACE is not set
+CONFIG_RCU_FANOUT=32
+# CONFIG_RCU_FANOUT_EXACT is not set
 # CONFIG_TREE_RCU_TRACE is not set
-# CONFIG_PREEMPT_RCU_TRACE is not set
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=17
@@ -260,6 +261,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic"
 # CONFIG_NETFILTER is not set
 # CONFIG_IP_DCCP is not set
 # CONFIG_IP_SCTP is not set
+# CONFIG_RDS is not set
 # CONFIG_TIPC is not set
 # CONFIG_ATM is not set
 # CONFIG_BRIDGE is not set
@@ -357,12 +359,10 @@ CONFIG_NET_ETHERNET=y
 # CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
 # CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
 # CONFIG_KS8842 is not set
+CONFIG_XILINX_EMACLITE=y
 CONFIG_NETDEV_1000=y
 CONFIG_NETDEV_10000=y
-
-#
-# Wireless LAN
-#
+CONFIG_WLAN=y
 # CONFIG_WLAN_PRE80211 is not set
 # CONFIG_WLAN_80211 is not set
 
@@ -460,6 +460,7 @@ CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
 # CONFIG_DISPLAY_SUPPORT is not set
 # CONFIG_SOUND is not set
 # CONFIG_USB_SUPPORT is not set
+CONFIG_USB_ARCH_HAS_EHCI=y
 # CONFIG_MMC is not set
 # CONFIG_MEMSTICK is not set
 # CONFIG_NEW_LEDS is not set
@@ -488,6 +489,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_GFS2_FS is not set
 # CONFIG_OCFS2_FS is not set
 # CONFIG_BTRFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_FILE_LOCKING=y
 CONFIG_FSNOTIFY=y
 # CONFIG_DNOTIFY is not set
@@ -546,7 +548,6 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
-# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -671,18 +672,20 @@ CONFIG_DEBUG_INFO=y
 # CONFIG_DEBUG_LIST is not set
 # CONFIG_DEBUG_SG is not set
 # CONFIG_DEBUG_NOTIFIERS is not set
+# CONFIG_DEBUG_CREDENTIALS is not set
 # CONFIG_BOOT_PRINTK_DELAY is not set
 # CONFIG_RCU_TORTURE_TEST is not set
 # CONFIG_RCU_CPU_STALL_DETECTOR is not set
 # CONFIG_BACKTRACE_SELF_TEST is not set
 # CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
+# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set
 # CONFIG_FAULT_INJECTION is not set
 # CONFIG_SYSCTL_SYSCALL_CHECK is not set
 # CONFIG_PAGE_POISONING is not set
 # CONFIG_SAMPLES is not set
 # CONFIG_KMEMCHECK is not set
 CONFIG_EARLY_PRINTK=y
-CONFIG_HEART_BEAT=y
+# CONFIG_HEART_BEAT is not set
 CONFIG_DEBUG_BOOTMEM=y
 
 #
@@ -697,7 +700,6 @@ CONFIG_CRYPTO=y
 #
 # Crypto core or helper
 #
-# CONFIG_CRYPTO_FIPS is not set
 # CONFIG_CRYPTO_MANAGER is not set
 # CONFIG_CRYPTO_MANAGER2 is not set
 # CONFIG_CRYPTO_GF128MUL is not set
@@ -729,11 +731,13 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_HMAC is not set
 # CONFIG_CRYPTO_XCBC is not set
+# CONFIG_CRYPTO_VMAC is not set
 
 #
 # Digest
 #
 # CONFIG_CRYPTO_CRC32C is not set
+# CONFIG_CRYPTO_GHASH is not set
 # CONFIG_CRYPTO_MD4 is not set
 # CONFIG_CRYPTO_MD5 is not set
 # CONFIG_CRYPTO_MICHAEL_MIC is not set
diff --git a/arch/microblaze/configs/nommu_defconfig b/arch/microblaze/configs/nommu_defconfig
index 8b638615a97..adb839bab70 100644
--- a/arch/microblaze/configs/nommu_defconfig
+++ b/arch/microblaze/configs/nommu_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.31-rc6
-# Tue Aug 18 10:35:30 2009
+# Linux kernel version: 2.6.31
+# Thu Sep 24 10:29:43 2009
 #
 CONFIG_MICROBLAZE=y
 # CONFIG_SWAP is not set
@@ -44,11 +44,12 @@ CONFIG_BSD_PROCESS_ACCT_V3=y
 #
 # RCU Subsystem
 #
-CONFIG_CLASSIC_RCU=y
-# CONFIG_TREE_RCU is not set
-# CONFIG_PREEMPT_RCU is not set
+CONFIG_TREE_RCU=y
+# CONFIG_TREE_PREEMPT_RCU is not set
+# CONFIG_RCU_TRACE is not set
+CONFIG_RCU_FANOUT=32
+# CONFIG_RCU_FANOUT_EXACT is not set
 # CONFIG_TREE_RCU_TRACE is not set
-# CONFIG_PREEMPT_RCU_TRACE is not set
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=17
@@ -243,6 +244,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic"
 # CONFIG_NETFILTER is not set
 # CONFIG_IP_DCCP is not set
 # CONFIG_IP_SCTP is not set
+# CONFIG_RDS is not set
 # CONFIG_TIPC is not set
 # CONFIG_ATM is not set
 # CONFIG_BRIDGE is not set
@@ -272,6 +274,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic"
 # CONFIG_AF_RXRPC is not set
 CONFIG_WIRELESS=y
 # CONFIG_CFG80211 is not set
+CONFIG_CFG80211_DEFAULT_PS_VALUE=0
 CONFIG_WIRELESS_OLD_REGULATORY=y
 # CONFIG_WIRELESS_EXT is not set
 # CONFIG_LIB80211 is not set
@@ -279,7 +282,6 @@ CONFIG_WIRELESS_OLD_REGULATORY=y
 #
 # CFG80211 needs to be enabled for MAC80211
 #
-CONFIG_MAC80211_DEFAULT_PS_VALUE=0
 # CONFIG_WIMAX is not set
 # CONFIG_RFKILL is not set
 # CONFIG_NET_9P is not set
@@ -304,6 +306,7 @@ CONFIG_MTD_PARTITIONS=y
 # CONFIG_MTD_TESTS is not set
 # CONFIG_MTD_REDBOOT_PARTS is not set
 CONFIG_MTD_CMDLINE_PARTS=y
+# CONFIG_MTD_OF_PARTS is not set
 # CONFIG_MTD_AR7_PARTS is not set
 
 #
@@ -349,6 +352,7 @@ CONFIG_MTD_RAM=y
 #
 # CONFIG_MTD_COMPLEX_MAPPINGS is not set
 # CONFIG_MTD_PHYSMAP is not set
+# CONFIG_MTD_PHYSMAP_OF is not set
 CONFIG_MTD_UCLINUX=y
 # CONFIG_MTD_PLATRAM is not set
 
@@ -429,12 +433,10 @@ CONFIG_NET_ETHERNET=y
 # CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
 # CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
 # CONFIG_KS8842 is not set
+# CONFIG_XILINX_EMACLITE is not set
 CONFIG_NETDEV_1000=y
 CONFIG_NETDEV_10000=y
-
-#
-# Wireless LAN
-#
+CONFIG_WLAN=y
 # CONFIG_WLAN_PRE80211 is not set
 # CONFIG_WLAN_80211 is not set
 
@@ -535,7 +537,7 @@ CONFIG_VIDEO_OUTPUT_CONTROL=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 # CONFIG_USB_ARCH_HAS_OHCI is not set
-# CONFIG_USB_ARCH_HAS_EHCI is not set
+CONFIG_USB_ARCH_HAS_EHCI=y
 # CONFIG_USB is not set
 # CONFIG_USB_OTG_WHITELIST is not set
 # CONFIG_USB_OTG_BLACKLIST_HUB is not set
@@ -579,6 +581,7 @@ CONFIG_FS_POSIX_ACL=y
 # CONFIG_GFS2_FS is not set
 # CONFIG_OCFS2_FS is not set
 # CONFIG_BTRFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_FILE_LOCKING=y
 CONFIG_FSNOTIFY=y
 # CONFIG_DNOTIFY is not set
@@ -639,7 +642,6 @@ CONFIG_ROMFS_BACKED_BY_BLOCK=y
 CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
-# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -710,18 +712,20 @@ CONFIG_DEBUG_INFO=y
 CONFIG_DEBUG_LIST=y
 CONFIG_DEBUG_SG=y
 # CONFIG_DEBUG_NOTIFIERS is not set
+# CONFIG_DEBUG_CREDENTIALS is not set
 # CONFIG_BOOT_PRINTK_DELAY is not set
 # CONFIG_RCU_TORTURE_TEST is not set
 # CONFIG_RCU_CPU_STALL_DETECTOR is not set
 # CONFIG_BACKTRACE_SELF_TEST is not set
 # CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
+# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set
 # CONFIG_FAULT_INJECTION is not set
 CONFIG_SYSCTL_SYSCALL_CHECK=y
 # CONFIG_PAGE_POISONING is not set
 # CONFIG_DYNAMIC_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_EARLY_PRINTK=y
-CONFIG_HEART_BEAT=y
+# CONFIG_HEART_BEAT is not set
 # CONFIG_DEBUG_BOOTMEM is not set
 
 #
@@ -736,7 +740,6 @@ CONFIG_CRYPTO=y
 #
 # Crypto core or helper
 #
-# CONFIG_CRYPTO_FIPS is not set
 # CONFIG_CRYPTO_MANAGER is not set
 # CONFIG_CRYPTO_MANAGER2 is not set
 # CONFIG_CRYPTO_GF128MUL is not set
@@ -768,11 +771,13 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_HMAC is not set
 # CONFIG_CRYPTO_XCBC is not set
+# CONFIG_CRYPTO_VMAC is not set
 
 #
 # Digest
 #
 # CONFIG_CRYPTO_CRC32C is not set
+# CONFIG_CRYPTO_GHASH is not set
 # CONFIG_CRYPTO_MD4 is not set
 # CONFIG_CRYPTO_MD5 is not set
 # CONFIG_CRYPTO_MICHAEL_MIC is not set
diff --git a/arch/microblaze/include/asm/asm-compat.h b/arch/microblaze/include/asm/asm-compat.h
new file mode 100644
index 00000000000..e7bc9dc11b5
--- /dev/null
+++ b/arch/microblaze/include/asm/asm-compat.h
@@ -0,0 +1,17 @@
+#ifndef _ASM_MICROBLAZE_ASM_COMPAT_H
+#define _ASM_MICROBLAZE_ASM_COMPAT_H
+
+#include <asm/types.h>
+
+#ifdef __ASSEMBLY__
+#  define stringify_in_c(...)	__VA_ARGS__
+#  define ASM_CONST(x)		x
+#else
+/* This version of stringify will deal with commas... */
+#  define __stringify_in_c(...)	#__VA_ARGS__
+#  define stringify_in_c(...)	__stringify_in_c(__VA_ARGS__) " "
+#  define __ASM_CONST(x)	x##UL
+#  define ASM_CONST(x)		__ASM_CONST(x)
+#endif
+
+#endif /* _ASM_MICROBLAZE_ASM_COMPAT_H */
diff --git a/arch/microblaze/include/asm/io.h b/arch/microblaze/include/asm/io.h
index 7c3ec13b44d..fc9997b73c0 100644
--- a/arch/microblaze/include/asm/io.h
+++ b/arch/microblaze/include/asm/io.h
@@ -210,6 +210,9 @@ static inline void __iomem *__ioremap(phys_addr_t address, unsigned long size,
 #define in_be32(a) __raw_readl((const void __iomem __force *)(a))
 #define in_be16(a) __raw_readw(a)
 
+#define writel_be(v, a)	out_be32((__force unsigned *)a, v)
+#define readl_be(a)	in_be32((__force unsigned *)a)
+
 /*
  * Little endian
  */
diff --git a/arch/microblaze/include/asm/ipc.h b/arch/microblaze/include/asm/ipc.h
deleted file mode 100644
index a46e3d9c2a3..00000000000
--- a/arch/microblaze/include/asm/ipc.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ipc.h>
diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h
index 72aceae8868..880c988c223 100644
--- a/arch/microblaze/include/asm/page.h
+++ b/arch/microblaze/include/asm/page.h
@@ -17,6 +17,7 @@
 
 #include <linux/pfn.h>
 #include <asm/setup.h>
+#include <asm/asm-compat.h>
 #include <linux/const.h>
 
 #ifdef __KERNEL__
@@ -26,6 +27,8 @@
 #define PAGE_SIZE	(_AC(1, UL) << PAGE_SHIFT)
 #define PAGE_MASK	(~(PAGE_SIZE-1))
 
+#define LOAD_OFFSET	ASM_CONST((CONFIG_KERNEL_START-CONFIG_KERNEL_BASE_ADDR))
+
 #ifndef __ASSEMBLY__
 
 #define PAGE_UP(addr)	(((addr)+((PAGE_SIZE)-1))&(~((PAGE_SIZE)-1)))
diff --git a/arch/microblaze/include/asm/setup.h b/arch/microblaze/include/asm/setup.h
index 27f8dafd8c3..ed67c9ed15b 100644
--- a/arch/microblaze/include/asm/setup.h
+++ b/arch/microblaze/include/asm/setup.h
@@ -38,7 +38,7 @@ extern void early_console_reg_tlb_alloc(unsigned int addr);
 void time_init(void);
 void init_IRQ(void);
 void machine_early_init(const char *cmdline, unsigned int ram,
-						unsigned int fdt);
+			unsigned int fdt, unsigned int msr);
 
 void machine_restart(char *cmd);
 void machine_shutdown(void);
diff --git a/arch/microblaze/include/asm/syscall.h b/arch/microblaze/include/asm/syscall.h
new file mode 100644
index 00000000000..048dfcd8d89
--- /dev/null
+++ b/arch/microblaze/include/asm/syscall.h
@@ -0,0 +1,99 @@
+#ifndef __ASM_MICROBLAZE_SYSCALL_H
+#define __ASM_MICROBLAZE_SYSCALL_H
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <asm/ptrace.h>
+
+/* The system call number is given by the user in R12 */
+static inline long syscall_get_nr(struct task_struct *task,
+				  struct pt_regs *regs)
+{
+	return regs->r12;
+}
+
+static inline void syscall_rollback(struct task_struct *task,
+				    struct pt_regs *regs)
+{
+	/* TODO.  */
+}
+
+static inline long syscall_get_error(struct task_struct *task,
+				     struct pt_regs *regs)
+{
+	return IS_ERR_VALUE(regs->r3) ? regs->r3 : 0;
+}
+
+static inline long syscall_get_return_value(struct task_struct *task,
+					    struct pt_regs *regs)
+{
+	return regs->r3;
+}
+
+static inline void syscall_set_return_value(struct task_struct *task,
+					    struct pt_regs *regs,
+					    int error, long val)
+{
+	if (error)
+		regs->r3 = -error;
+	else
+		regs->r3 = val;
+}
+
+static inline microblaze_reg_t microblaze_get_syscall_arg(struct pt_regs *regs,
+							  unsigned int n)
+{
+	switch (n) {
+	case 5: return regs->r10;
+	case 4: return regs->r9;
+	case 3: return regs->r8;
+	case 2: return regs->r7;
+	case 1: return regs->r6;
+	case 0: return regs->r5;
+	default:
+		BUG();
+	}
+	return ~0;
+}
+
+static inline void microblaze_set_syscall_arg(struct pt_regs *regs,
+					      unsigned int n,
+					      unsigned long val)
+{
+	switch (n) {
+	case 5:
+		regs->r10 = val;
+	case 4:
+		regs->r9 = val;
+	case 3:
+		regs->r8 = val;
+	case 2:
+		regs->r7 = val;
+	case 1:
+		regs->r6 = val;
+	case 0:
+		regs->r5 = val;
+	default:
+		BUG();
+	}
+}
+
+static inline void syscall_get_arguments(struct task_struct *task,
+					 struct pt_regs *regs,
+					 unsigned int i, unsigned int n,
+					 unsigned long *args)
+{
+	while (n--)
+		*args++ = microblaze_get_syscall_arg(regs, i++);
+}
+
+static inline void syscall_set_arguments(struct task_struct *task,
+					 struct pt_regs *regs,
+					 unsigned int i, unsigned int n,
+					 const unsigned long *args)
+{
+	while (n--)
+		microblaze_set_syscall_arg(regs, i++, *args++);
+}
+
+#endif /* __ASM_MICROBLAZE_SYSCALL_H */
diff --git a/arch/microblaze/kernel/cpu/cpuinfo.c b/arch/microblaze/kernel/cpu/cpuinfo.c
index c411c6757de..3539babc1c1 100644
--- a/arch/microblaze/kernel/cpu/cpuinfo.c
+++ b/arch/microblaze/kernel/cpu/cpuinfo.c
@@ -28,6 +28,7 @@ const struct cpu_ver_key cpu_ver_lookup[] = {
 	{"7.10.d", 0x0b},
 	{"7.20.a", 0x0c},
 	{"7.20.b", 0x0d},
+	{"7.20.c", 0x0e},
 	/* FIXME There is no keycode defined in MBV for these versions */
 	{"2.10.a", 0x10},
 	{"3.00.a", 0x20},
@@ -49,6 +50,8 @@ const struct family_string_key family_string_lookup[] = {
 	{"spartan3a", 0xa},
 	{"spartan3an", 0xb},
 	{"spartan3adsp", 0xc},
+	{"spartan6", 0xd},
+	{"virtex6", 0xe},
 	/* FIXME There is no key code defined for spartan2 */
 	{"spartan2", 0xf0},
 	{NULL, 0},
diff --git a/arch/microblaze/kernel/entry.S b/arch/microblaze/kernel/entry.S
index c7353e79f4a..acc1f05d1e2 100644
--- a/arch/microblaze/kernel/entry.S
+++ b/arch/microblaze/kernel/entry.S
@@ -308,38 +308,69 @@ C_ENTRY(_user_exception):
 	swi	r12, r1, PTO+PT_R0;
 	tovirt(r1,r1)
 
-	la	r15, r0, ret_from_trap-8
 /* where the trap should return need -8 to adjust for rtsd r15, 8*/
 /* Jump to the appropriate function for the system call number in r12
  * (r12 is not preserved), or return an error if r12 is not valid. The LP
  * register should point to the location where
  * the called function should return.  [note that MAKE_SYS_CALL uses label 1] */
-	/* See if the system call number is valid.  */
+
+	# Step into virtual mode.
+	set_vms;
+	addik	r11, r0, 3f
+	rtid	r11, 0
+	nop
+3:
+	add	r11, r0, CURRENT_TASK	 /* Get current task ptr into r11 */
+	lwi	r11, r11, TS_THREAD_INFO /* get thread info */
+	lwi	r11, r11, TI_FLAGS	 /* get flags in thread info */
+	andi	r11, r11, _TIF_WORK_SYSCALL_MASK
+	beqi	r11, 4f
+
+	addik	r3, r0, -ENOSYS
+	swi	r3, r1, PTO + PT_R3
+	brlid	r15, do_syscall_trace_enter
+	addik	r5, r1, PTO + PT_R0
+
+	# do_syscall_trace_enter returns the new syscall nr.
+	addk	r12, r0, r3
+	lwi	r5, r1, PTO+PT_R5;
+	lwi	r6, r1, PTO+PT_R6;
+	lwi	r7, r1, PTO+PT_R7;
+	lwi	r8, r1, PTO+PT_R8;
+	lwi	r9, r1, PTO+PT_R9;
+	lwi	r10, r1, PTO+PT_R10;
+4:
+/* Jump to the appropriate function for the system call number in r12
+ * (r12 is not preserved), or return an error if r12 is not valid.
+ * The LP register should point to the location where the called function
+ * should return.  [note that MAKE_SYS_CALL uses label 1] */
+	/* See if the system call number is valid */
 	addi	r11, r12, -__NR_syscalls;
-	bgei	r11,1f;
+	bgei	r11,5f;
 	/* Figure out which function to use for this system call.  */
 	/* Note Microblaze barrel shift is optional, so don't rely on it */
 	add	r12, r12, r12;			/* convert num -> ptr */
 	add	r12, r12, r12;
 
 	/* Trac syscalls and stored them to r0_ram */
-	lwi	r3, r12, 0x400 + TOPHYS(r0_ram)
+	lwi	r3, r12, 0x400 + r0_ram
 	addi	r3, r3, 1
-	swi	r3, r12, 0x400 + TOPHYS(r0_ram)
+	swi	r3, r12, 0x400 + r0_ram
+
+	# Find and jump into the syscall handler.
+	lwi	r12, r12, sys_call_table
+	/* where the trap should return need -8 to adjust for rtsd r15, 8 */
+	la	r15, r0, ret_from_trap-8
+	bra	r12
 
-	lwi	r12, r12, TOPHYS(sys_call_table); /* Function ptr */
-	/* Make the system call.  to r12*/
-	set_vms;
-	rtid	r12, 0;
-	nop;
 	/* The syscall number is invalid, return an error.  */
-1:	VM_ON;	/* RETURN() expects virtual mode*/
+5:
 	addi	r3, r0, -ENOSYS;
 	rtsd	r15,8;		/* looks like a normal subroutine return */
 	or 	r0, r0, r0
 
 
-/* Entry point used to return from a syscall/trap.  */
+/* Entry point used to return from a syscall/trap */
 /* We re-enable BIP bit before state restore */
 C_ENTRY(ret_from_trap):
 	set_bip;			/*  Ints masked for state restore*/
@@ -349,6 +380,23 @@ C_ENTRY(ret_from_trap):
 
 	/* We're returning to user mode, so check for various conditions that
 	 * trigger rescheduling. */
+	# FIXME: Restructure all these flag checks.
+	add	r11, r0, CURRENT_TASK;	/* Get current task ptr into r11 */
+	lwi	r11, r11, TS_THREAD_INFO;	/* get thread info */
+	lwi	r11, r11, TI_FLAGS;		/* get flags in thread info */
+	andi	r11, r11, _TIF_WORK_SYSCALL_MASK
+	beqi	r11, 1f
+
+	swi	r3, r1, PTO + PT_R3
+	swi	r4, r1, PTO + PT_R4
+	brlid	r15, do_syscall_trace_leave
+	addik	r5, r1, PTO + PT_R0
+	lwi	r3, r1, PTO + PT_R3
+	lwi	r4, r1, PTO + PT_R4
+1:
+
+	/* We're returning to user mode, so check for various conditions that
+	 * trigger rescheduling. */
 	/* Get current task ptr into r11 */
 	add	r11, r0, CURRENT_TASK;	/* Get current task ptr into r11 */
 	lwi	r11, r11, TS_THREAD_INFO;	/* get thread info */
diff --git a/arch/microblaze/kernel/exceptions.c b/arch/microblaze/kernel/exceptions.c
index 0cb64a31e89..d9f70f83097 100644
--- a/arch/microblaze/kernel/exceptions.c
+++ b/arch/microblaze/kernel/exceptions.c
@@ -72,7 +72,8 @@ asmlinkage void full_exception(struct pt_regs *regs, unsigned int type,
 #endif
 
 #if 0
-	printk(KERN_WARNING "Exception %02x in %s mode, FSR=%08x PC=%08x ESR=%08x\n",
+	printk(KERN_WARNING "Exception %02x in %s mode, FSR=%08x PC=%08x " \
+							"ESR=%08x\n",
 			type, user_mode(regs) ? "user" : "kernel", fsr,
 			(unsigned int) regs->pc, (unsigned int) regs->esr);
 #endif
@@ -80,42 +81,50 @@ asmlinkage void full_exception(struct pt_regs *regs, unsigned int type,
 	switch (type & 0x1F) {
 	case MICROBLAZE_ILL_OPCODE_EXCEPTION:
 		if (user_mode(regs)) {
-			printk(KERN_WARNING "Illegal opcode exception in user mode.\n");
+			pr_debug(KERN_WARNING "Illegal opcode exception " \
+							"in user mode.\n");
 			_exception(SIGILL, regs, ILL_ILLOPC, addr);
 			return;
 		}
-		printk(KERN_WARNING "Illegal opcode exception in kernel mode.\n");
+		printk(KERN_WARNING "Illegal opcode exception " \
+							"in kernel mode.\n");
 		die("opcode exception", regs, SIGBUS);
 		break;
 	case MICROBLAZE_IBUS_EXCEPTION:
 		if (user_mode(regs)) {
-			printk(KERN_WARNING "Instruction bus error exception in user mode.\n");
+			pr_debug(KERN_WARNING "Instruction bus error " \
+						"exception in user mode.\n");
 			_exception(SIGBUS, regs, BUS_ADRERR, addr);
 			return;
 		}
-		printk(KERN_WARNING "Instruction bus error exception in kernel mode.\n");
+		printk(KERN_WARNING "Instruction bus error exception " \
+							"in kernel mode.\n");
 		die("bus exception", regs, SIGBUS);
 		break;
 	case MICROBLAZE_DBUS_EXCEPTION:
 		if (user_mode(regs)) {
-			printk(KERN_WARNING "Data bus error exception in user mode.\n");
+			pr_debug(KERN_WARNING "Data bus error exception " \
+							"in user mode.\n");
 			_exception(SIGBUS, regs, BUS_ADRERR, addr);
 			return;
 		}
-		printk(KERN_WARNING "Data bus error exception in kernel mode.\n");
+		printk(KERN_WARNING "Data bus error exception " \
+							"in kernel mode.\n");
 		die("bus exception", regs, SIGBUS);
 		break;
 	case MICROBLAZE_DIV_ZERO_EXCEPTION:
 		if (user_mode(regs)) {
-			printk(KERN_WARNING "Divide by zero exception in user mode\n");
-			_exception(SIGILL, regs, ILL_ILLOPC, addr);
+			pr_debug(KERN_WARNING "Divide by zero exception " \
+							"in user mode\n");
+			_exception(SIGILL, regs, FPE_INTDIV, addr);
 			return;
 		}
-		printk(KERN_WARNING "Divide by zero exception in kernel mode.\n");
+		printk(KERN_WARNING "Divide by zero exception " \
+							"in kernel mode.\n");
 		die("Divide by exception", regs, SIGBUS);
 		break;
 	case MICROBLAZE_FPU_EXCEPTION:
-		printk(KERN_WARNING "FPU exception\n");
+		pr_debug(KERN_WARNING "FPU exception\n");
 		/* IEEE FP exception */
 		/* I removed fsr variable and use code var for storing fsr */
 		if (fsr & FSR_IO)
@@ -133,7 +142,7 @@ asmlinkage void full_exception(struct pt_regs *regs, unsigned int type,
 
 #ifdef CONFIG_MMU
 	case MICROBLAZE_PRIVILEGED_EXCEPTION:
-		printk(KERN_WARNING "Privileged exception\n");
+		pr_debug(KERN_WARNING "Privileged exception\n");
 		/* "brk r0,r0" - used as debug breakpoint */
 		if (get_user(code, (unsigned long *)regs->pc) == 0
 			&& code == 0x980c0000) {
diff --git a/arch/microblaze/kernel/head.S b/arch/microblaze/kernel/head.S
index e41c6ce2a7b..697ce3007f3 100644
--- a/arch/microblaze/kernel/head.S
+++ b/arch/microblaze/kernel/head.S
@@ -54,6 +54,16 @@ ENTRY(_start)
 	mfs	r1, rmsr
 	andi	r1, r1, ~2
 	mts	rmsr, r1
+/*
+ * Here is checking mechanism which check if Microblaze has msr instructions
+ * We load msr and compare it with previous r1 value - if is the same,
+ * msr instructions works if not - cpu don't have them.
+ */
+	/* r8=0 - I have msr instr, 1 - I don't have them */
+	rsubi	r0, r0, 1	/* set the carry bit */
+	msrclr	r0, 0x4		/* try to clear it */
+	/* read the carry bit, r8 will be '0' if msrclr exists */
+	addik	r8, r0, 0
 
 /* r7 may point to an FDT, or there may be one linked in.
    if it's in r7, we've got to save it away ASAP.
@@ -209,8 +219,8 @@ start_here:
 	 * Please see $(ARCH)/mach-$(SUBARCH)/setup.c for
 	 * the function.
 	 */
-	la	r8, r0, machine_early_init
-	brald	r15, r8
+	la	r9, r0, machine_early_init
+	brald	r15, r9
 	nop
 
 #ifndef CONFIG_MMU
diff --git a/arch/microblaze/kernel/hw_exception_handler.S b/arch/microblaze/kernel/hw_exception_handler.S
index 3288c973767..6b0288ebccd 100644
--- a/arch/microblaze/kernel/hw_exception_handler.S
+++ b/arch/microblaze/kernel/hw_exception_handler.S
@@ -84,9 +84,10 @@
 #define NUM_TO_REG(num)		r ## num
 
 #ifdef CONFIG_MMU
-/* FIXME you can't change first load of MSR because there is
- * hardcoded jump bri 4 */
 	#define RESTORE_STATE			\
+		lwi	r5, r1, 0;		\
+		mts	rmsr, r5;		\
+		nop;				\
 		lwi	r3, r1, PT_R3;		\
 		lwi	r4, r1, PT_R4;		\
 		lwi	r5, r1, PT_R5;		\
@@ -309,6 +310,9 @@ _hw_exception_handler:
 	lwi	r31, r0, TOPHYS(PER_CPU(CURRENT_SAVE)) /* get saved current */
 #endif
 
+	mfs	r5, rmsr;
+	nop
+	swi	r5, r1, 0;
 	mfs	r3, resr
 	nop
 	mfs	r4, rear;
@@ -380,6 +384,8 @@ handle_other_ex: /* Handle Other exceptions here */
 	addk	r8, r17, r0; /* Load exception address */
 	bralid	r15, full_exception; /* Branch to the handler */
 	nop;
+	mts	r0, rfsr;	/* Clear sticky fsr */
+	nop
 
 	/*
 	 * Trigger execution of the signal handler by enabling
diff --git a/arch/microblaze/kernel/init_task.c b/arch/microblaze/kernel/init_task.c
index 67da22579b6..b5d711f94ff 100644
--- a/arch/microblaze/kernel/init_task.c
+++ b/arch/microblaze/kernel/init_task.c
@@ -19,9 +19,8 @@
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 struct task_struct init_task = INIT_TASK(init_task);
 EXPORT_SYMBOL(init_task);
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index 00b12c6d532..4201c743cc9 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -235,6 +235,7 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long usp)
 	regs->pc = pc;
 	regs->r1 = usp;
 	regs->pt_mode = 0;
+	regs->msr |= MSR_UMS;
 }
 
 #ifdef CONFIG_MMU
diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c
index 53ff39af6a5..4b3ac32754d 100644
--- a/arch/microblaze/kernel/ptrace.c
+++ b/arch/microblaze/kernel/ptrace.c
@@ -29,6 +29,10 @@
 #include <linux/sched.h>
 #include <linux/ptrace.h>
 #include <linux/signal.h>
+#include <linux/elf.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/tracehook.h>
 
 #include <linux/errno.h>
 #include <asm/processor.h>
@@ -174,6 +178,64 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	return rval;
 }
 
+asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
+{
+	long ret = 0;
+
+	secure_computing(regs->r12);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
+	    tracehook_report_syscall_entry(regs))
+		/*
+		 * Tracing decided this syscall should not happen.
+		 * We'll return a bogus call number to get an ENOSYS
+		 * error, but leave the original number in regs->regs[0].
+		 */
+		ret = -1L;
+
+	if (unlikely(current->audit_context))
+		audit_syscall_entry(EM_XILINX_MICROBLAZE, regs->r12,
+				    regs->r5, regs->r6,
+				    regs->r7, regs->r8);
+
+	return ret ?: regs->r12;
+}
+
+asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
+{
+	int step;
+
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3);
+
+	step = test_thread_flag(TIF_SINGLESTEP);
+	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
+		tracehook_report_syscall_exit(regs, step);
+}
+
+#if 0
+static asmlinkage void syscall_trace(void)
+{
+	if (!test_thread_flag(TIF_SYSCALL_TRACE))
+		return;
+	if (!(current->ptrace & PT_PTRACED))
+		return;
+	/* The 0x80 provides a way for the tracing parent to distinguish
+	 between a syscall stop and SIGTRAP delivery */
+	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
+				? 0x80 : 0));
+	/*
+	 * this isn't the same as continuing with a signal, but it will do
+	 * for normal use. strace only continues with a signal if the
+	 * stopping signal is not SIGTRAP. -brl
+	 */
+	if (current->exit_code) {
+		send_sig(current->exit_code, current, 1);
+		current->exit_code = 0;
+	}
+}
+#endif
+
 void ptrace_disable(struct task_struct *child)
 {
 	/* nothing to do */
diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c
index 2a97bf513b6..8c1e0f4dcf1 100644
--- a/arch/microblaze/kernel/setup.c
+++ b/arch/microblaze/kernel/setup.c
@@ -94,7 +94,7 @@ inline unsigned get_romfs_len(unsigned *addr)
 #endif	/* CONFIG_MTD_UCLINUX_EBSS */
 
 void __init machine_early_init(const char *cmdline, unsigned int ram,
-		unsigned int fdt)
+		unsigned int fdt, unsigned int msr)
 {
 	unsigned long *src, *dst = (unsigned long *)0x0;
 
@@ -157,6 +157,16 @@ void __init machine_early_init(const char *cmdline, unsigned int ram,
 	early_printk("New klimit: 0x%08x\n", (unsigned)klimit);
 #endif
 
+#if CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR
+	if (msr)
+		early_printk("!!!Your kernel has setup MSR instruction but "
+				"CPU don't have it %d\n", msr);
+#else
+	if (!msr)
+		early_printk("!!!Your kernel not setup MSR instruction but "
+				"CPU have it %d\n", msr);
+#endif
+
 	for (src = __ivt_start; src < __ivt_end; src++, dst++)
 		*dst = *src;
 
diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c
index b96f1682bb2..07cabed4b94 100644
--- a/arch/microblaze/kernel/sys_microblaze.c
+++ b/arch/microblaze/kernel/sys_microblaze.c
@@ -23,7 +23,6 @@
 #include <linux/mman.h>
 #include <linux/sys.h>
 #include <linux/ipc.h>
-#include <linux/utsname.h>
 #include <linux/file.h>
 #include <linux/module.h>
 #include <linux/err.h>
diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S
index ec5fa91a48d..e704188d785 100644
--- a/arch/microblaze/kernel/vmlinux.lds.S
+++ b/arch/microblaze/kernel/vmlinux.lds.S
@@ -12,13 +12,16 @@ OUTPUT_FORMAT("elf32-microblaze", "elf32-microblaze", "elf32-microblaze")
 OUTPUT_ARCH(microblaze)
 ENTRY(_start)
 
+#include <asm/page.h>
 #include <asm-generic/vmlinux.lds.h>
+#include <asm/thread_info.h>
 
 jiffies = jiffies_64 + 4;
 
 SECTIONS {
 	. = CONFIG_KERNEL_START;
-	.text : {
+	_start = CONFIG_KERNEL_BASE_ADDR;
+	.text : AT(ADDR(.text) - LOAD_OFFSET) {
 		_text = . ;
 		_stext = . ;
 		*(.text .text.*)
@@ -33,24 +36,22 @@ SECTIONS {
 	}
 
 	. = ALIGN (4) ;
-	_fdt_start = . ; /* place for fdt blob */
-	. = . + 0x4000;
-	_fdt_end = . ;
+	__fdt_blob : AT(ADDR(__fdt_blob) - LOAD_OFFSET) {
+		_fdt_start = . ;		/* place for fdt blob */
+		*(__fdt_blob) ;			/* Any link-placed DTB */
+	        . = _fdt_start + 0x4000;	/* Pad up to 16kbyte */
+		_fdt_end = . ;
+	}
 
 	. = ALIGN(16);
 	RODATA
-	. = ALIGN(16);
-	__ex_table : {
-		__start___ex_table = .;
-		*(__ex_table)
-		__stop___ex_table = .;
-	}
+	EXCEPTION_TABLE(16)
 
 	/*
 	 * sdata2 section can go anywhere, but must be word aligned
 	 * and SDA2_BASE must point to the middle of it
 	 */
-	.sdata2 : {
+	.sdata2 : AT(ADDR(.sdata2) - LOAD_OFFSET) {
 		_ssrw = .;
 		. = ALIGN(4096); /* page aligned when MMU used - origin 0x8 */
 		*(.sdata2)
@@ -61,12 +62,7 @@ SECTIONS {
 	}
 
 	_sdata = . ;
-	.data ALIGN (4096) : { /* page aligned when MMU used - origin 0x4 */
-		DATA_DATA
-		CONSTRUCTORS
-	}
-	. = ALIGN(32);
-	.data.cacheline_aligned : { *(.data.cacheline_aligned) }
+	RW_DATA_SECTION(32, PAGE_SIZE, THREAD_SIZE)
 	_edata = . ;
 
 	/* Reserve some low RAM for r0 based memory references */
@@ -74,18 +70,14 @@ SECTIONS {
 	r0_ram = . ;
 	. = . +  4096;	/* a page should be enough */
 
-	/* The initial task */
-	. = ALIGN(8192);
-	.data.init_task : { *(.data.init_task) }
-
 	/* Under the microblaze ABI, .sdata and .sbss must be contiguous */
 	. = ALIGN(8);
-	.sdata : {
+	.sdata : AT(ADDR(.sdata) - LOAD_OFFSET) {
 		_ssro = .;
 		*(.sdata)
 	}
 
-	.sbss :	{
+	.sbss :	AT(ADDR(.sbss) - LOAD_OFFSET) {
 		_ssbss = .;
 		*(.sbss)
 		_esbss = .;
@@ -96,47 +88,36 @@ SECTIONS {
 
 	__init_begin = .;
 
-	. = ALIGN(4096);
-	.init.text : {
-		_sinittext = . ;
-		INIT_TEXT
-		_einittext = .;
-	}
+	INIT_TEXT_SECTION(PAGE_SIZE)
 
-	.init.data : {
+	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
 		INIT_DATA
 	}
 
 	. = ALIGN(4);
-	.init.ivt : {
+	.init.ivt : AT(ADDR(.init.ivt) - LOAD_OFFSET) {
 		__ivt_start = .;
 		*(.init.ivt)
 		__ivt_end = .;
 	}
 
-	.init.setup : {
-		__setup_start = .;
-		*(.init.setup)
-		__setup_end = .;
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		INIT_SETUP(0)
 	}
 
-	.initcall.init : {
-		__initcall_start = .;
-		INITCALLS
-		__initcall_end = .;
+	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET ) {
+		INIT_CALLS
 	}
 
-	.con_initcall.init : {
-		__con_initcall_start = .;
-		*(.con_initcall.init)
-		__con_initcall_end = .;
+	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		CON_INITCALL
 	}
 
 	SECURITY_INIT
 
 	__init_end_before_initramfs = .;
 
-	.init.ramfs ALIGN(4096) : {
+	.init.ramfs ALIGN(4096) : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
 		__initramfs_start = .;
 		*(.init.ramfs)
 		__initramfs_end = .;
@@ -152,7 +133,8 @@ SECTIONS {
 	}
 	__init_end = .;
 
-	.bss ALIGN (4096) : { /* page aligned when MMU used */
+	.bss ALIGN (4096) : AT(ADDR(.bss) - LOAD_OFFSET) {
+		/* page aligned when MMU used */
 		__bss_start = . ;
 			*(.bss*)
 			*(COMMON)
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 1110784eb3f..a44892e7cd5 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -180,7 +180,8 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		totalram_pages++;
 		pages++;
 	}
-	printk(KERN_NOTICE "Freeing initrd memory: %dk freed\n", pages);
+	printk(KERN_NOTICE "Freeing initrd memory: %dk freed\n",
+					(int)(pages * (PAGE_SIZE / 1024)));
 }
 #endif
 
diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index c825b14b4ed..77f5021218d 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -627,16 +627,6 @@ endif
 cflags-y			+= -I$(srctree)/arch/mips/include/asm/mach-generic
 drivers-$(CONFIG_PCI)		+= arch/mips/pci/
 
-ifdef CONFIG_32BIT
-ifdef CONFIG_CPU_LITTLE_ENDIAN
-JIFFIES			= jiffies_64
-else
-JIFFIES			= jiffies_64 + 4
-endif
-else
-JIFFIES			= jiffies_64
-endif
-
 #
 # Automatically detect the build format. By default we choose
 # the elf format according to the load address.
@@ -660,8 +650,9 @@ ifdef CONFIG_64BIT
 endif
 
 KBUILD_AFLAGS	+= $(cflags-y)
-KBUILD_CFLAGS	+= $(cflags-y) \
-			-D"VMLINUX_LOAD_ADDRESS=$(load-y)"
+KBUILD_CFLAGS	+= $(cflags-y)
+KBUILD_CPPFLAGS += -D"VMLINUX_LOAD_ADDRESS=$(load-y)"
+KBUILD_CPPFLAGS += -D"DATAOFFSET=$(if $(dataoffset-y),$(dataoffset-y),0)"
 
 LDFLAGS			+= -m $(ld-emul)
 
@@ -676,18 +667,6 @@ endif
 
 OBJCOPYFLAGS		+= --remove-section=.reginfo
 
-#
-# Choosing incompatible machines durings configuration will result in
-# error messages during linking.  Select a default linkscript if
-# none has been choosen above.
-#
-
-CPPFLAGS_vmlinux.lds := \
-	$(KBUILD_CFLAGS) \
-	-D"LOADADDR=$(load-y)" \
-	-D"JIFFIES=$(JIFFIES)" \
-	-D"DATAOFFSET=$(if $(dataoffset-y),$(dataoffset-y),0)"
-
 head-y := arch/mips/kernel/head.o arch/mips/kernel/init_task.o
 
 libs-y			+= arch/mips/lib/
diff --git a/arch/mips/alchemy/common/time.c b/arch/mips/alchemy/common/time.c
index f34ff860194..379a664809b 100644
--- a/arch/mips/alchemy/common/time.c
+++ b/arch/mips/alchemy/common/time.c
@@ -88,7 +88,7 @@ static struct clock_event_device au1x_rtcmatch2_clockdev = {
 	.irq		= AU1000_RTC_MATCH2_INT,
 	.set_next_event	= au1x_rtcmatch2_set_next_event,
 	.set_mode	= au1x_rtcmatch2_set_mode,
-	.cpumask	= CPU_MASK_ALL_PTR,
+	.cpumask	= cpu_all_mask,
 };
 
 static struct irqaction au1x_rtcmatch2_irqaction = {
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 23059170700..f6837422fe6 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -24,12 +24,10 @@ extern struct cpuinfo_ip27 sn_cpu_info[NR_CPUS];
 
 #define cpu_to_node(cpu)	(sn_cpu_info[(cpu)].p_nodeid)
 #define parent_node(node)	(node)
-#define node_to_cpumask(node)	(hub_data(node)->h_cpus)
 #define cpumask_of_node(node)	(&hub_data(node)->h_cpus)
 struct pci_bus;
 extern int pcibus_to_node(struct pci_bus *);
 
-#define pcibus_to_cpumask(bus)	(cpu_online_map)
 #define cpumask_of_pcibus(bus)	(cpu_online_mask)
 
 extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h
index d3bea88d874..d9743536a62 100644
--- a/arch/mips/include/asm/mmu_context.h
+++ b/arch/mips/include/asm/mmu_context.h
@@ -178,8 +178,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	 * Mark current->active_mm as not "active" anymore.
 	 * We don't want to mislead possible IPI tlb flush routines.
 	 */
-	cpu_clear(cpu, prev->cpu_vm_mask);
-	cpu_set(cpu, next->cpu_vm_mask);
+	cpumask_clear_cpu(cpu, mm_cpumask(prev));
+	cpumask_set_cpu(cpu, mm_cpumask(next));
 
 	local_irq_restore(flags);
 }
@@ -235,8 +235,8 @@ activate_mm(struct mm_struct *prev, struct mm_struct *next)
 	TLBMISS_HANDLER_SETUP_PGD(next->pgd);
 
 	/* mark mmu ownership change */
-	cpu_clear(cpu, prev->cpu_vm_mask);
-	cpu_set(cpu, next->cpu_vm_mask);
+	cpumask_clear_cpu(cpu, mm_cpumask(prev));
+	cpumask_set_cpu(cpu, mm_cpumask(next));
 
 	local_irq_restore(flags);
 }
@@ -258,7 +258,7 @@ drop_mmu_context(struct mm_struct *mm, unsigned cpu)
 
 	local_irq_save(flags);
 
-	if (cpu_isset(cpu, mm->cpu_vm_mask))  {
+	if (cpumask_test_cpu(cpu, mm_cpumask(mm)))  {
 		get_new_mmu_context(mm, cpu);
 #ifdef CONFIG_MIPS_MT_SMTC
 		/* See comments for similar code above */
diff --git a/arch/mips/include/asm/smp-ops.h b/arch/mips/include/asm/smp-ops.h
index fd545547b8a..9e09af34c8a 100644
--- a/arch/mips/include/asm/smp-ops.h
+++ b/arch/mips/include/asm/smp-ops.h
@@ -19,7 +19,7 @@ struct task_struct;
 
 struct plat_smp_ops {
 	void (*send_ipi_single)(int cpu, unsigned int action);
-	void (*send_ipi_mask)(cpumask_t mask, unsigned int action);
+	void (*send_ipi_mask)(const struct cpumask *mask, unsigned int action);
 	void (*init_secondary)(void);
 	void (*smp_finish)(void);
 	void (*cpus_done)(void);
diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h
index aaa2d4ab26d..e15f11a0931 100644
--- a/arch/mips/include/asm/smp.h
+++ b/arch/mips/include/asm/smp.h
@@ -78,6 +78,6 @@ extern void play_dead(void);
 extern asmlinkage void smp_call_function_interrupt(void);
 
 extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi(cpumask_t mask);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 
 #endif /* __ASM_SMP_H */
diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c
index 5b457a40c78..6d6ca530589 100644
--- a/arch/mips/kernel/init_task.c
+++ b/arch/mips/kernel/init_task.c
@@ -21,9 +21,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  *
  * The things we do for performance..
  */
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"),
-	               __aligned__(THREAD_SIZE))) =
+union thread_union init_thread_union __init_task_data
+	__attribute__((__aligned__(THREAD_SIZE))) =
 		{ INIT_THREAD_INFO(init_task) };
 
 /*
diff --git a/arch/mips/kernel/smp-cmp.c b/arch/mips/kernel/smp-cmp.c
index ad0ff5dc4d5..cc81771b882 100644
--- a/arch/mips/kernel/smp-cmp.c
+++ b/arch/mips/kernel/smp-cmp.c
@@ -80,11 +80,11 @@ void cmp_send_ipi_single(int cpu, unsigned int action)
 	local_irq_restore(flags);
 }
 
-static void cmp_send_ipi_mask(cpumask_t mask, unsigned int action)
+static void cmp_send_ipi_mask(const struct cpumask *mask, unsigned int action)
 {
 	unsigned int i;
 
-	for_each_cpu_mask(i, mask)
+	for_each_cpu(i, mask)
 		cmp_send_ipi_single(i, action);
 }
 
@@ -171,7 +171,7 @@ void __init cmp_smp_setup(void)
 
 	for (i = 1; i < NR_CPUS; i++) {
 		if (amon_cpu_avail(i)) {
-			cpu_set(i, cpu_possible_map);
+			set_cpu_possible(i, true);
 			__cpu_number_map[i]	= ++ncpu;
 			__cpu_logical_map[ncpu]	= i;
 		}
diff --git a/arch/mips/kernel/smp-mt.c b/arch/mips/kernel/smp-mt.c
index 6f7ee5ac46e..43e7cdc5ded 100644
--- a/arch/mips/kernel/smp-mt.c
+++ b/arch/mips/kernel/smp-mt.c
@@ -70,7 +70,7 @@ static unsigned int __init smvp_vpe_init(unsigned int tc, unsigned int mvpconf0,
 		write_vpe_c0_vpeconf0(tmp);
 
 		/* Record this as available CPU */
-		cpu_set(tc, cpu_possible_map);
+		set_cpu_possible(tc, true);
 		__cpu_number_map[tc]	= ++ncpu;
 		__cpu_logical_map[ncpu]	= tc;
 	}
@@ -141,11 +141,11 @@ static void vsmp_send_ipi_single(int cpu, unsigned int action)
 	local_irq_restore(flags);
 }
 
-static void vsmp_send_ipi_mask(cpumask_t mask, unsigned int action)
+static void vsmp_send_ipi_mask(const struct cpumask *mask, unsigned int action)
 {
 	unsigned int i;
 
-	for_each_cpu_mask(i, mask)
+	for_each_cpu(i, mask)
 		vsmp_send_ipi_single(i, action);
 }
 
diff --git a/arch/mips/kernel/smp-up.c b/arch/mips/kernel/smp-up.c
index 2508d55d68f..00500fea275 100644
--- a/arch/mips/kernel/smp-up.c
+++ b/arch/mips/kernel/smp-up.c
@@ -18,7 +18,8 @@ static void up_send_ipi_single(int cpu, unsigned int action)
 	panic(KERN_ERR "%s called", __func__);
 }
 
-static inline void up_send_ipi_mask(cpumask_t mask, unsigned int action)
+static inline void up_send_ipi_mask(const struct cpumask *mask,
+				    unsigned int action)
 {
 	panic(KERN_ERR "%s called", __func__);
 }
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 64668a93248..4eb106c6a3e 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -128,7 +128,7 @@ asmlinkage __cpuinit void start_secondary(void)
 	cpu_idle();
 }
 
-void arch_send_call_function_ipi(cpumask_t mask)
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 {
 	mp_ops->send_ipi_mask(mask, SMP_CALL_FUNCTION);
 }
@@ -183,15 +183,15 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	mp_ops->prepare_cpus(max_cpus);
 	set_cpu_sibling_map(0);
 #ifndef CONFIG_HOTPLUG_CPU
-	cpu_present_map = cpu_possible_map;
+	init_cpu_present(&cpu_possible_map);
 #endif
 }
 
 /* preload SMP state for boot cpu */
 void __devinit smp_prepare_boot_cpu(void)
 {
-	cpu_set(0, cpu_possible_map);
-	cpu_set(0, cpu_online_map);
+	set_cpu_possible(0, true);
+	set_cpu_online(0, true);
 	cpu_set(0, cpu_callin_map);
 }
 
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index 1a466baf0ed..67153a0dc26 100644
--- a/arch/mips/kernel/smtc.c
+++ b/arch/mips/kernel/smtc.c
@@ -305,7 +305,7 @@ int __init smtc_build_cpu_map(int start_cpu_slot)
 	 */
 	ntcs = ((read_c0_mvpconf0() & MVPCONF0_PTC) >> MVPCONF0_PTC_SHIFT) + 1;
 	for (i=start_cpu_slot; i<NR_CPUS && i<ntcs; i++) {
-		cpu_set(i, cpu_possible_map);
+		set_cpu_possible(i, true);
 		__cpu_number_map[i] = i;
 		__cpu_logical_map[i] = i;
 	}
@@ -525,8 +525,8 @@ void smtc_prepare_cpus(int cpus)
 	 * Pull any physically present but unused TCs out of circulation.
 	 */
 	while (tc < (((val & MVPCONF0_PTC) >> MVPCONF0_PTC_SHIFT) + 1)) {
-		cpu_clear(tc, cpu_possible_map);
-		cpu_clear(tc, cpu_present_map);
+		set_cpu_possible(tc, false);
+		set_cpu_present(tc, false);
 		tc++;
 	}
 
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index 2769bed3d2a..9bf0e3df7c5 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -10,7 +10,16 @@ PHDRS {
 	text PT_LOAD FLAGS(7);	/* RWX */
 	note PT_NOTE FLAGS(4);	/* R__ */
 }
-jiffies = JIFFIES;
+
+ifdef CONFIG_32BIT
+	ifdef CONFIG_CPU_LITTLE_ENDIAN
+		jiffies  = jiffies_64;
+	else
+		jiffies  = jiffies_64 + 4;
+	endif
+else
+	jiffies  = jiffies_64;
+endif
 
 SECTIONS
 {
@@ -29,7 +38,7 @@ SECTIONS
 	/* . = 0xa800000000300000; */
 	. = 0xffffffff80300000;
 #endif
-	. = LOADADDR;
+	. = VMLINUX_LOAD_ADDRESS;
 	/* read-only */
 	_text = .;	/* Text and read-only data */
 	.text : {
diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c
index 3f04d4c406b..b3deed8db61 100644
--- a/arch/mips/lasat/sysctl.c
+++ b/arch/mips/lasat/sysctl.c
@@ -56,12 +56,12 @@ int sysctl_lasatstring(ctl_table *table,
 
 
 /* And the same for proc */
-int proc_dolasatstring(ctl_table *table, int write, struct file *filp,
+int proc_dolasatstring(ctl_table *table, int write,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int r;
 
-	r = proc_dostring(table, write, filp, buffer, lenp, ppos);
+	r = proc_dostring(table, write, buffer, lenp, ppos);
 	if ((!write) || r)
 		return r;
 
@@ -71,12 +71,12 @@ int proc_dolasatstring(ctl_table *table, int write, struct file *filp,
 }
 
 /* proc function to write EEPROM after changing int entry */
-int proc_dolasatint(ctl_table *table, int write, struct file *filp,
+int proc_dolasatint(ctl_table *table, int write,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int r;
 
-	r = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	r = proc_dointvec(table, write, buffer, lenp, ppos);
 	if ((!write) || r)
 		return r;
 
@@ -89,7 +89,7 @@ int proc_dolasatint(ctl_table *table, int write, struct file *filp,
 static int rtctmp;
 
 /* proc function to read/write RealTime Clock */
-int proc_dolasatrtc(ctl_table *table, int write, struct file *filp,
+int proc_dolasatrtc(ctl_table *table, int write,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct timespec ts;
@@ -102,7 +102,7 @@ int proc_dolasatrtc(ctl_table *table, int write, struct file *filp,
 		if (rtctmp < 0)
 			rtctmp = 0;
 	}
-	r = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	r = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (r)
 		return r;
 
@@ -154,7 +154,7 @@ int sysctl_lasat_rtc(ctl_table *table,
 #endif
 
 #ifdef CONFIG_INET
-int proc_lasat_ip(ctl_table *table, int write, struct file *filp,
+int proc_lasat_ip(ctl_table *table, int write,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	unsigned int ip;
@@ -231,12 +231,12 @@ static int sysctl_lasat_prid(ctl_table *table,
 	return 0;
 }
 
-int proc_lasat_prid(ctl_table *table, int write, struct file *filp,
+int proc_lasat_prid(ctl_table *table, int write,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int r;
 
-	r = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	r = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (r < 0)
 		return r;
 	if (write) {
diff --git a/arch/mips/mipssim/sim_smtc.c b/arch/mips/mipssim/sim_smtc.c
index d6e4f656ad1..5da30b6a65b 100644
--- a/arch/mips/mipssim/sim_smtc.c
+++ b/arch/mips/mipssim/sim_smtc.c
@@ -43,11 +43,12 @@ static void ssmtc_send_ipi_single(int cpu, unsigned int action)
 	/* "CPU" may be TC of same VPE, VPE of same CPU, or different CPU */
 }
 
-static inline void ssmtc_send_ipi_mask(cpumask_t mask, unsigned int action)
+static inline void ssmtc_send_ipi_mask(const struct cpumask *mask,
+				       unsigned int action)
 {
 	unsigned int i;
 
-	for_each_cpu_mask(i, mask)
+	for_each_cpu(i, mask)
 		ssmtc_send_ipi_single(i, action);
 }
 
diff --git a/arch/mips/mm/c-octeon.c b/arch/mips/mm/c-octeon.c
index 10ab69f7183..94e05e5733c 100644
--- a/arch/mips/mm/c-octeon.c
+++ b/arch/mips/mm/c-octeon.c
@@ -79,7 +79,7 @@ static void octeon_flush_icache_all_cores(struct vm_area_struct *vma)
 	 * cores it has been used on
 	 */
 	if (vma)
-		mask = vma->vm_mm->cpu_vm_mask;
+		mask = *mm_cpumask(vma->vm_mm);
 	else
 		mask = cpu_online_map;
 	cpu_clear(cpu, mask);
diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c
index 499ffe5475d..192cfd2a539 100644
--- a/arch/mips/mti-malta/malta-smtc.c
+++ b/arch/mips/mti-malta/malta-smtc.c
@@ -21,11 +21,11 @@ static void msmtc_send_ipi_single(int cpu, unsigned int action)
 	smtc_send_ipi(cpu, LINUX_SMP_IPI, action);
 }
 
-static void msmtc_send_ipi_mask(cpumask_t mask, unsigned int action)
+static void msmtc_send_ipi_mask(const struct cpumask *mask, unsigned int action)
 {
 	unsigned int i;
 
-	for_each_cpu_mask(i, mask)
+	for_each_cpu(i, mask)
 		msmtc_send_ipi_single(i, action);
 }
 
diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c
index 8ace2771623..326fe7a392e 100644
--- a/arch/mips/pmc-sierra/yosemite/smp.c
+++ b/arch/mips/pmc-sierra/yosemite/smp.c
@@ -97,11 +97,11 @@ static void yos_send_ipi_single(int cpu, unsigned int action)
 	}
 }
 
-static void yos_send_ipi_mask(cpumask_t mask, unsigned int action)
+static void yos_send_ipi_mask(const struct cpumask *mask, unsigned int action)
 {
 	unsigned int i;
 
-	for_each_cpu_mask(i, mask)
+	for_each_cpu(i, mask)
 		yos_send_ipi_single(i, action);
 }
 
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index 060d853d7b3..f61c164d1e6 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -421,7 +421,7 @@ static void __init node_mem_init(cnodeid_t node)
 
 /*
  * A node with nothing.  We use it to avoid any special casing in
- * node_to_cpumask
+ * cpumask_of_node
  */
 static struct node_data null_node = {
 	.hub = {
diff --git a/arch/mips/sgi-ip27/ip27-smp.c b/arch/mips/sgi-ip27/ip27-smp.c
index cbcd7eb83bd..9aa8f2951df 100644
--- a/arch/mips/sgi-ip27/ip27-smp.c
+++ b/arch/mips/sgi-ip27/ip27-smp.c
@@ -165,11 +165,11 @@ static void ip27_send_ipi_single(int destid, unsigned int action)
 	REMOTE_HUB_SEND_INTR(COMPACT_TO_NASID_NODEID(cpu_to_node(destid)), irq);
 }
 
-static void ip27_send_ipi_mask(cpumask_t mask, unsigned int action)
+static void ip27_send_ipi(const struct cpumask *mask, unsigned int action)
 {
 	unsigned int i;
 
-	for_each_cpu_mask(i, mask)
+	for_each_cpu(i, mask)
 		ip27_send_ipi_single(i, action);
 }
 
diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c
index 314691648c9..47b347c992e 100644
--- a/arch/mips/sibyte/bcm1480/smp.c
+++ b/arch/mips/sibyte/bcm1480/smp.c
@@ -82,11 +82,12 @@ static void bcm1480_send_ipi_single(int cpu, unsigned int action)
 	__raw_writeq((((u64)action)<< 48), mailbox_0_set_regs[cpu]);
 }
 
-static void bcm1480_send_ipi_mask(cpumask_t mask, unsigned int action)
+static void bcm1480_send_ipi_mask(const struct cpumask *mask,
+				  unsigned int action)
 {
 	unsigned int i;
 
-	for_each_cpu_mask(i, mask)
+	for_each_cpu(i, mask)
 		bcm1480_send_ipi_single(i, action);
 }
 
diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c
index cad14003b84..c00a5cb1128 100644
--- a/arch/mips/sibyte/sb1250/smp.c
+++ b/arch/mips/sibyte/sb1250/smp.c
@@ -70,11 +70,12 @@ static void sb1250_send_ipi_single(int cpu, unsigned int action)
 	__raw_writeq((((u64)action) << 48), mailbox_set_regs[cpu]);
 }
 
-static inline void sb1250_send_ipi_mask(cpumask_t mask, unsigned int action)
+static inline void sb1250_send_ipi_mask(const struct cpumask *mask,
+					unsigned int action)
 {
 	unsigned int i;
 
-	for_each_cpu_mask(i, mask)
+	for_each_cpu(i, mask)
 		sb1250_send_ipi_single(i, action);
 }
 
diff --git a/arch/mn10300/include/asm/mmu_context.h b/arch/mn10300/include/asm/mmu_context.h
index a9e2e34f69b..cb294c244de 100644
--- a/arch/mn10300/include/asm/mmu_context.h
+++ b/arch/mn10300/include/asm/mmu_context.h
@@ -38,13 +38,13 @@ extern unsigned long mmu_context_cache[NR_CPUS];
 #define enter_lazy_tlb(mm, tsk)	do {} while (0)
 
 #ifdef CONFIG_SMP
-#define cpu_ran_vm(cpu, task) \
-	cpu_set((cpu), (task)->cpu_vm_mask)
-#define cpu_maybe_ran_vm(cpu, task) \
-	cpu_test_and_set((cpu), (task)->cpu_vm_mask)
+#define cpu_ran_vm(cpu, mm) \
+	cpumask_set_cpu((cpu), mm_cpumask(mm))
+#define cpu_maybe_ran_vm(cpu, mm) \
+	cpumask_test_and_set_cpu((cpu), mm_cpumask(mm))
 #else
-#define cpu_ran_vm(cpu, task)		do {} while (0)
-#define cpu_maybe_ran_vm(cpu, task)	true
+#define cpu_ran_vm(cpu, mm)		do {} while (0)
+#define cpu_maybe_ran_vm(cpu, mm)	true
 #endif /* CONFIG_SMP */
 
 /*
diff --git a/arch/mn10300/kernel/init_task.c b/arch/mn10300/kernel/init_task.c
index 80d423b80af..a481b043bea 100644
--- a/arch/mn10300/kernel/init_task.c
+++ b/arch/mn10300/kernel/init_task.c
@@ -27,9 +27,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 /*
  * Initial task structure.
diff --git a/arch/mn10300/kernel/sys_mn10300.c b/arch/mn10300/kernel/sys_mn10300.c
index 3e52a105432..8ca5af00334 100644
--- a/arch/mn10300/kernel/sys_mn10300.c
+++ b/arch/mn10300/kernel/sys_mn10300.c
@@ -19,7 +19,6 @@
 #include <linux/stat.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/tty.h>
 
 #include <asm/uaccess.h>
diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile
index da6f66901c9..55cca1dac43 100644
--- a/arch/parisc/Makefile
+++ b/arch/parisc/Makefile
@@ -118,8 +118,8 @@ define archhelp
 	@echo  '* vmlinux	- Uncompressed kernel image (./vmlinux)'
 	@echo  '  palo		- Bootable image (./lifimage)'
 	@echo  '  install	- Install kernel using'
-	@echo  '		  (your) ~/bin/installkernel or'
-	@echo  '		  (distribution) /sbin/installkernel or'
+	@echo  '		  (your) ~/bin/$(INSTALLKERNEL) or'
+	@echo  '		  (distribution) /sbin/$(INSTALLKERNEL) or'
 	@echo  '		  copy to $$(INSTALL_PATH)'
 endef
 
diff --git a/arch/parisc/include/asm/fcntl.h b/arch/parisc/include/asm/fcntl.h
index 1e1c824764e..5f39d5597ce 100644
--- a/arch/parisc/include/asm/fcntl.h
+++ b/arch/parisc/include/asm/fcntl.h
@@ -28,6 +28,8 @@
 #define F_SETOWN	12	/*  for sockets. */
 #define F_SETSIG	13	/*  for sockets. */
 #define F_GETSIG	14	/*  for sockets. */
+#define F_GETOWN_EX	15
+#define F_SETOWN_EX	16
 
 /* for posix fcntl() and lockf() */
 #define F_RDLCK		01
diff --git a/arch/parisc/include/asm/smp.h b/arch/parisc/include/asm/smp.h
index 21eb45a5262..2e73623feb6 100644
--- a/arch/parisc/include/asm/smp.h
+++ b/arch/parisc/include/asm/smp.h
@@ -30,7 +30,6 @@ extern void smp_send_all_nop(void);
 
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
-#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
 
 #endif /* !ASSEMBLY */
 
diff --git a/arch/parisc/install.sh b/arch/parisc/install.sh
index 9632b3e164c..e593fc8d58b 100644
--- a/arch/parisc/install.sh
+++ b/arch/parisc/install.sh
@@ -21,8 +21,8 @@
 
 # User may have a custom install script
 
-if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi
-if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi
+if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
+if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
 
 # Default install
 
diff --git a/arch/parisc/kernel/init_task.c b/arch/parisc/kernel/init_task.c
index 82974b20fc1..d020eae6525 100644
--- a/arch/parisc/kernel/init_task.c
+++ b/arch/parisc/kernel/init_task.c
@@ -43,8 +43,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union
-	__attribute__((aligned(128))) __attribute__((__section__(".data.init_task"))) =
+union thread_union init_thread_union __init_task_data
+	__attribute__((aligned(128))) =
 		{ INIT_THREAD_INFO(init_task) };
 
 #if PT_NLEVELS == 3
diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c
index 92a0acaa0d1..561388b17c9 100644
--- a/arch/parisc/kernel/sys_parisc32.c
+++ b/arch/parisc/kernel/sys_parisc32.c
@@ -18,7 +18,6 @@
 #include <linux/signal.h>
 #include <linux/resource.h>
 #include <linux/times.h>
-#include <linux/utsname.h>
 #include <linux/time.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 952a3963e9e..aacf629c1a9 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -158,8 +158,6 @@ drivers-$(CONFIG_OPROFILE)	+= arch/powerpc/oprofile/
 # Default to zImage, override when needed
 all: zImage
 
-CPPFLAGS_vmlinux.lds	:= -Upowerpc
-
 BOOT_TARGETS = zImage zImage.initrd uImage zImage% dtbImage% treeImage.% cuImage.% simpleImage.%
 
 PHONY += $(BOOT_TARGETS)
@@ -182,8 +180,8 @@ define archhelp
   @echo '  simpleImage.<dt> - Firmware independent image.'
   @echo '  treeImage.<dt>  - Support for older IBM 4xx firmware (not U-Boot)'
   @echo '  install         - Install kernel using'
-  @echo '                    (your) ~/bin/installkernel or'
-  @echo '                    (distribution) /sbin/installkernel or'
+  @echo '                    (your) ~/bin/$(INSTALLKERNEL) or'
+  @echo '                    (distribution) /sbin/$(INSTALLKERNEL) or'
   @echo '                    install to $$(INSTALL_PATH) and run lilo'
   @echo '  *_defconfig     - Select default config from arch/$(ARCH)/configs'
   @echo ''
diff --git a/arch/powerpc/boot/install.sh b/arch/powerpc/boot/install.sh
index 98312d169c8..b6a256bc96e 100644
--- a/arch/powerpc/boot/install.sh
+++ b/arch/powerpc/boot/install.sh
@@ -23,8 +23,8 @@ set -e
 
 # User may have a custom install script
 
-if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi
-if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi
+if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
+if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
 
 # Default install
 
diff --git a/arch/powerpc/include/asm/fsldma.h b/arch/powerpc/include/asm/fsldma.h
new file mode 100644
index 00000000000..a67aeed17d4
--- /dev/null
+++ b/arch/powerpc/include/asm/fsldma.h
@@ -0,0 +1,136 @@
+/*
+ * Freescale MPC83XX / MPC85XX DMA Controller
+ *
+ * Copyright (c) 2009 Ira W. Snyder <iws@ovro.caltech.edu>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#ifndef __ARCH_POWERPC_ASM_FSLDMA_H__
+#define __ARCH_POWERPC_ASM_FSLDMA_H__
+
+#include <linux/dmaengine.h>
+
+/*
+ * Definitions for the Freescale DMA controller's DMA_SLAVE implemention
+ *
+ * The Freescale DMA_SLAVE implementation was designed to handle many-to-many
+ * transfers. An example usage would be an accelerated copy between two
+ * scatterlists. Another example use would be an accelerated copy from
+ * multiple non-contiguous device buffers into a single scatterlist.
+ *
+ * A DMA_SLAVE transaction is defined by a struct fsl_dma_slave. This
+ * structure contains a list of hardware addresses that should be copied
+ * to/from the scatterlist passed into device_prep_slave_sg(). The structure
+ * also has some fields to enable hardware-specific features.
+ */
+
+/**
+ * struct fsl_dma_hw_addr
+ * @entry: linked list entry
+ * @address: the hardware address
+ * @length: length to transfer
+ *
+ * Holds a single physical hardware address / length pair for use
+ * with the DMAEngine DMA_SLAVE API.
+ */
+struct fsl_dma_hw_addr {
+	struct list_head entry;
+
+	dma_addr_t address;
+	size_t length;
+};
+
+/**
+ * struct fsl_dma_slave
+ * @addresses: a linked list of struct fsl_dma_hw_addr structures
+ * @request_count: value for DMA request count
+ * @src_loop_size: setup and enable constant source-address DMA transfers
+ * @dst_loop_size: setup and enable constant destination address DMA transfers
+ * @external_start: enable externally started DMA transfers
+ * @external_pause: enable externally paused DMA transfers
+ *
+ * Holds a list of address / length pairs for use with the DMAEngine
+ * DMA_SLAVE API implementation for the Freescale DMA controller.
+ */
+struct fsl_dma_slave {
+
+	/* List of hardware address/length pairs */
+	struct list_head addresses;
+
+	/* Support for extra controller features */
+	unsigned int request_count;
+	unsigned int src_loop_size;
+	unsigned int dst_loop_size;
+	bool external_start;
+	bool external_pause;
+};
+
+/**
+ * fsl_dma_slave_append - add an address/length pair to a struct fsl_dma_slave
+ * @slave: the &struct fsl_dma_slave to add to
+ * @address: the hardware address to add
+ * @length: the length of bytes to transfer from @address
+ *
+ * Add a hardware address/length pair to a struct fsl_dma_slave. Returns 0 on
+ * success, -ERRNO otherwise.
+ */
+static inline int fsl_dma_slave_append(struct fsl_dma_slave *slave,
+				       dma_addr_t address, size_t length)
+{
+	struct fsl_dma_hw_addr *addr;
+
+	addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
+	if (!addr)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&addr->entry);
+	addr->address = address;
+	addr->length = length;
+
+	list_add_tail(&addr->entry, &slave->addresses);
+	return 0;
+}
+
+/**
+ * fsl_dma_slave_free - free a struct fsl_dma_slave
+ * @slave: the struct fsl_dma_slave to free
+ *
+ * Free a struct fsl_dma_slave and all associated address/length pairs
+ */
+static inline void fsl_dma_slave_free(struct fsl_dma_slave *slave)
+{
+	struct fsl_dma_hw_addr *addr, *tmp;
+
+	if (slave) {
+		list_for_each_entry_safe(addr, tmp, &slave->addresses, entry) {
+			list_del(&addr->entry);
+			kfree(addr);
+		}
+
+		kfree(slave);
+	}
+}
+
+/**
+ * fsl_dma_slave_alloc - allocate a struct fsl_dma_slave
+ * @gfp: the flags to pass to kmalloc when allocating this structure
+ *
+ * Allocate a struct fsl_dma_slave for use by the DMA_SLAVE API. Returns a new
+ * struct fsl_dma_slave on success, or NULL on failure.
+ */
+static inline struct fsl_dma_slave *fsl_dma_slave_alloc(gfp_t gfp)
+{
+	struct fsl_dma_slave *slave;
+
+	slave = kzalloc(sizeof(*slave), gfp);
+	if (!slave)
+		return NULL;
+
+	INIT_LIST_HEAD(&slave->addresses);
+	return slave;
+}
+
+#endif /* __ARCH_POWERPC_ASM_FSLDMA_H__ */
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index c0d3b8af931..d9ea8d39c34 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -146,7 +146,7 @@ extern void smp_generic_take_timebase(void);
 extern struct smp_ops_t *smp_ops;
 
 extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi(cpumask_t mask);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 
 /* Definitions relative to the secondary CPU spin loop
  * and entry point. Not all of them exist on both 32 and
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 394edcbcce7..22f738d12ad 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -17,11 +17,6 @@ static inline int cpu_to_node(int cpu)
 
 #define parent_node(node)	(node)
 
-static inline cpumask_t node_to_cpumask(int node)
-{
-	return numa_cpumask_lookup_table[node];
-}
-
 #define cpumask_of_node(node) (&numa_cpumask_lookup_table[node])
 
 int of_node_to_nid(struct device_node *device);
@@ -36,11 +31,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 }
 #endif
 
-#define pcibus_to_cpumask(bus)	(pcibus_to_node(bus) == -1 ? \
-					CPU_MASK_ALL : \
-					node_to_cpumask(pcibus_to_node(bus)) \
-				)
-
 #define cpumask_of_pcibus(bus)	(pcibus_to_node(bus) == -1 ?		\
 				 cpu_all_mask :				\
 				 cpumask_of_node(pcibus_to_node(bus)))
@@ -104,8 +94,6 @@ static inline void sysfs_remove_device_from_node(struct sys_device *dev,
 #ifdef CONFIG_PPC64
 #include <asm/smp.h>
 
-#define topology_thread_siblings(cpu)	(per_cpu(cpu_sibling_map, cpu))
-#define topology_core_siblings(cpu)	(per_cpu(cpu_core_map, cpu))
 #define topology_thread_cpumask(cpu)	(&per_cpu(cpu_sibling_map, cpu))
 #define topology_core_cpumask(cpu)	(&per_cpu(cpu_core_map, cpu))
 #define topology_core_id(cpu)		(cpu_to_core_id(cpu))
diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c
index ffc4253fef5..2375b7eb1c7 100644
--- a/arch/powerpc/kernel/init_task.c
+++ b/arch/powerpc/kernel/init_task.c
@@ -16,9 +16,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union 
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 /*
  * Initial task structure.
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 49e705fcee6..040bd1de8d9 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -13,6 +13,7 @@
 #include <linux/kexec.h>
 #include <linux/smp.h>
 #include <linux/thread_info.h>
+#include <linux/init_task.h>
 #include <linux/errno.h>
 
 #include <asm/page.h>
@@ -249,8 +250,8 @@ static void kexec_prepare_cpus(void)
  * We could use a smaller stack if we don't care about anything using
  * current, but that audit has not been performed.
  */
-static union thread_union kexec_stack
-	__attribute__((__section__(".data.init_task"))) = { };
+static union thread_union kexec_stack __init_task_data =
+	{ };
 
 /* Our assembly helper, in kexec_stub.S */
 extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start,
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 1d5570a1e45..4271f7a655a 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -24,7 +24,6 @@
 #include <linux/seq_file.h>
 #include <linux/ioport.h>
 #include <linux/console.h>
-#include <linux/utsname.h>
 #include <linux/screen_info.h>
 #include <linux/root_dev.h>
 #include <linux/notifier.h>
@@ -432,9 +431,9 @@ void __init smp_setup_cpu_maps(void)
 		for (j = 0; j < nthreads && cpu < NR_CPUS; j++) {
 			DBG("    thread %d -> cpu %d (hard id %d)\n",
 			    j, cpu, intserv[j]);
-			cpu_set(cpu, cpu_present_map);
+			set_cpu_present(cpu, true);
 			set_hard_smp_processor_id(cpu, intserv[j]);
-			cpu_set(cpu, cpu_possible_map);
+			set_cpu_possible(cpu, true);
 			cpu++;
 		}
 	}
@@ -480,7 +479,7 @@ void __init smp_setup_cpu_maps(void)
 			       maxcpus);
 
 		for (cpu = 0; cpu < maxcpus; cpu++)
-			cpu_set(cpu, cpu_possible_map);
+			set_cpu_possible(cpu, true);
 	out:
 		of_node_put(dn);
 	}
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index d387b3937cc..9b86a74d281 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -189,11 +189,11 @@ void arch_send_call_function_single_ipi(int cpu)
 	smp_ops->message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
 }
 
-void arch_send_call_function_ipi(cpumask_t mask)
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 {
 	unsigned int cpu;
 
-	for_each_cpu_mask(cpu, mask)
+	for_each_cpu(cpu, mask)
 		smp_ops->message_pass(cpu, PPC_MSG_CALL_FUNCTION);
 }
 
@@ -287,7 +287,7 @@ void __devinit smp_prepare_boot_cpu(void)
 {
 	BUG_ON(smp_processor_id() != boot_cpuid);
 
-	cpu_set(boot_cpuid, cpu_online_map);
+	set_cpu_online(boot_cpuid, true);
 	cpu_set(boot_cpuid, per_cpu(cpu_sibling_map, boot_cpuid));
 	cpu_set(boot_cpuid, per_cpu(cpu_core_map, boot_cpuid));
 #ifdef CONFIG_PPC64
@@ -307,7 +307,7 @@ int generic_cpu_disable(void)
 	if (cpu == boot_cpuid)
 		return -EBUSY;
 
-	cpu_clear(cpu, cpu_online_map);
+	set_cpu_online(cpu, false);
 #ifdef CONFIG_PPC64
 	vdso_data->processorCount--;
 	fixup_irqs(cpu_online_map);
@@ -361,7 +361,7 @@ void generic_mach_cpu_die(void)
 	smp_wmb();
 	while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
 		cpu_relax();
-	cpu_set(cpu, cpu_online_map);
+	set_cpu_online(cpu, true);
 	local_irq_enable();
 }
 #endif
@@ -508,7 +508,7 @@ int __devinit start_secondary(void *unused)
 
 	ipi_call_lock();
 	notify_cpu_starting(cpu);
-	cpu_set(cpu, cpu_online_map);
+	set_cpu_online(cpu, true);
 	/* Update sibling maps */
 	base = cpu_first_thread_in_core(cpu);
 	for (i = 0; i < threads_per_core; i++) {
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index 1cc5e9e5da9..b97c2d67f4a 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -22,7 +22,6 @@
 #include <linux/signal.h>
 #include <linux/resource.h>
 #include <linux/times.h>
-#include <linux/utsname.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index a0abce251d0..3faaf29bdb2 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -1,3 +1,4 @@
+
 /*
  *    Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp.
  *			 <benh@kernel.crashing.org>
@@ -74,7 +75,7 @@ static int vdso_ready;
 static union {
 	struct vdso_data	data;
 	u8			page[PAGE_SIZE];
-} vdso_data_store __attribute__((__section__(".data.page_aligned")));
+} vdso_data_store __page_aligned_data;
 struct vdso_data *vdso_data = &vdso_data_store.data;
 
 /* Format of the patch table */
diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
index b54b8168813..51ead52141b 100644
--- a/arch/powerpc/kernel/vdso32/Makefile
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -16,7 +16,7 @@ GCOV_PROFILE := n
 
 EXTRA_CFLAGS := -shared -fno-common -fno-builtin
 EXTRA_CFLAGS += -nostdlib -Wl,-soname=linux-vdso32.so.1 \
-		$(call ld-option, -Wl$(comma)--hash-style=sysv)
+		$(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
 EXTRA_AFLAGS := -D__VDSO32__ -s
 
 obj-y += vdso32_wrapper.o
diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
index 556f0caa5d8..6e8f507ed32 100644
--- a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
+++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
@@ -1,7 +1,8 @@
 #include <linux/init.h>
+#include <linux/linkage.h>
 #include <asm/page.h>
 
-	.section ".data.page_aligned"
+	__PAGE_ALIGNED_DATA
 
 	.globl vdso32_start, vdso32_end
 	.balign PAGE_SIZE
diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile
index dd0c8e93677..79da65d44a2 100644
--- a/arch/powerpc/kernel/vdso64/Makefile
+++ b/arch/powerpc/kernel/vdso64/Makefile
@@ -11,7 +11,7 @@ GCOV_PROFILE := n
 
 EXTRA_CFLAGS := -shared -fno-common -fno-builtin
 EXTRA_CFLAGS += -nostdlib -Wl,-soname=linux-vdso64.so.1 \
-		$(call ld-option, -Wl$(comma)--hash-style=sysv)
+		$(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
 EXTRA_AFLAGS := -D__VDSO64__ -s
 
 obj-y += vdso64_wrapper.o
diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
index 0529cb9e3b9..b8553d62b79 100644
--- a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
+++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
@@ -1,7 +1,8 @@
 #include <linux/init.h>
+#include <linux/linkage.h>
 #include <asm/page.h>
 
-	.section ".data.page_aligned"
+	__PAGE_ALIGNED_DATA
 
 	.globl vdso64_start, vdso64_end
 	.balign PAGE_SIZE
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index 937a38e7317..b40c22d697f 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -320,7 +320,7 @@ static int __init smp_psurge_probe(void)
 	if (ncpus > NR_CPUS)
 		ncpus = NR_CPUS;
 	for (i = 1; i < ncpus ; ++i)
-		cpu_set(i, cpu_present_map);
+		set_cpu_present(i, true);
 
 	if (ppc_md.progress) ppc_md.progress("smp_psurge_probe - done", 0x352);
 
@@ -867,7 +867,7 @@ static void __devinit smp_core99_setup_cpu(int cpu_nr)
 
 int smp_core99_cpu_disable(void)
 {
-	cpu_clear(smp_processor_id(), cpu_online_map);
+	set_cpu_online(smp_processor_id(), false);
 
 	/* XXX reset cpu affinity here */
 	mpic_cpu_set_priority(0xf);
@@ -952,7 +952,7 @@ void __init pmac_setup_smp(void)
 		int cpu;
 
 		for (cpu = 1; cpu < 4 && cpu < NR_CPUS; ++cpu)
-			cpu_set(cpu, cpu_possible_map);
+			set_cpu_possible(cpu, true);
 		smp_ops = &psurge_smp_ops;
 	}
 #endif /* CONFIG_PPC32 */
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index a20ead87153..ebff6d9a4e3 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -94,7 +94,7 @@ static int pseries_cpu_disable(void)
 {
 	int cpu = smp_processor_id();
 
-	cpu_clear(cpu, cpu_online_map);
+	set_cpu_online(cpu, false);
 	vdso_data->processorCount--;
 
 	/*fix boot_cpuid here*/
@@ -185,7 +185,7 @@ static int pseries_add_processor(struct device_node *np)
 
 	for_each_cpu_mask(cpu, tmp) {
 		BUG_ON(cpu_isset(cpu, cpu_present_map));
-		cpu_set(cpu, cpu_present_map);
+		set_cpu_present(cpu, true);
 		set_hard_smp_processor_id(cpu, *intserv++);
 	}
 	err = 0;
@@ -217,7 +217,7 @@ static void pseries_remove_processor(struct device_node *np)
 			if (get_hard_smp_processor_id(cpu) != intserv[i])
 				continue;
 			BUG_ON(cpu_online(cpu));
-			cpu_clear(cpu, cpu_present_map);
+			set_cpu_present(cpu, false);
 			set_hard_smp_processor_id(cpu, -1);
 			break;
 		}
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index 264528e4f58..b55fd7ed1c3 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -50,10 +50,9 @@ static struct platform_device *appldata_pdev;
  * /proc entries (sysctl)
  */
 static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata";
-static int appldata_timer_handler(ctl_table *ctl, int write, struct file *filp,
+static int appldata_timer_handler(ctl_table *ctl, int write,
 				  void __user *buffer, size_t *lenp, loff_t *ppos);
 static int appldata_interval_handler(ctl_table *ctl, int write,
-					 struct file *filp,
 					 void __user *buffer,
 					 size_t *lenp, loff_t *ppos);
 
@@ -247,7 +246,7 @@ __appldata_vtimer_setup(int cmd)
  * Start/Stop timer, show status of timer (0 = not active, 1 = active)
  */
 static int
-appldata_timer_handler(ctl_table *ctl, int write, struct file *filp,
+appldata_timer_handler(ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int len;
@@ -289,7 +288,7 @@ out:
  * current timer interval.
  */
 static int
-appldata_interval_handler(ctl_table *ctl, int write, struct file *filp,
+appldata_interval_handler(ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int len, interval;
@@ -335,7 +334,7 @@ out:
  * monitoring (0 = not in process, 1 = in process)
  */
 static int
-appldata_generic_handler(ctl_table *ctl, int write, struct file *filp,
+appldata_generic_handler(ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct appldata_ops *ops = NULL, *tmp_ops;
diff --git a/arch/s390/boot/install.sh b/arch/s390/boot/install.sh
index d4026f62cb0..aed3069699b 100644
--- a/arch/s390/boot/install.sh
+++ b/arch/s390/boot/install.sh
@@ -21,8 +21,8 @@
 
 # User may have a custom install script
 
-if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi
-if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi
+if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
+if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
 
 # Default install - same as make zlilo
 
diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h
index c991fe6473c..a868b272c25 100644
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -62,7 +62,7 @@ extern struct mutex smp_cpu_state_mutex;
 extern int smp_cpu_polarization[];
 
 extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi(cpumask_t mask);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 
 #endif
 
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index 5e0ad618dc4..6e7211abd95 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -9,7 +9,6 @@ const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
 
 extern cpumask_t cpu_core_map[NR_CPUS];
 
-#define topology_core_siblings(cpu)	(cpu_core_map[cpu])
 #define topology_core_cpumask(cpu)	(&cpu_core_map[cpu])
 
 int topology_set_cpu_management(int fc);
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 5519cb74510..0debcec23a3 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -24,7 +24,6 @@
 #include <linux/signal.h>
 #include <linux/resource.h>
 #include <linux/times.h>
-#include <linux/utsname.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 4c512561687..20f282c911c 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -881,11 +881,11 @@ static int debug_active=1;
  * if debug_active is already off
  */
 static int
-s390dbf_procactive(ctl_table *table, int write, struct file *filp,
+s390dbf_procactive(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	if (!write || debug_stoppable || !debug_active)
-		return proc_dointvec(table, write, filp, buffer, lenp, ppos);
+		return proc_dointvec(table, write, buffer, lenp, ppos);
 	else
 		return 0;
 }
diff --git a/arch/s390/kernel/init_task.c b/arch/s390/kernel/init_task.c
index fe787f9e5f3..4d1c9fb0b54 100644
--- a/arch/s390/kernel/init_task.c
+++ b/arch/s390/kernel/init_task.c
@@ -25,9 +25,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union 
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 /*
  * Initial task structure.
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 59fe6ecc6ed..5417eb57271 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -27,7 +27,6 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/notifier.h>
-#include <linux/utsname.h>
 #include <linux/tick.h>
 #include <linux/elfcore.h>
 #include <linux/kernel_stat.h>
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b4b6396e6cf..c932caa5e85 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -147,11 +147,11 @@ static void smp_ext_bitcall(int cpu, ec_bit_sig sig)
 		udelay(10);
 }
 
-void arch_send_call_function_ipi(cpumask_t mask)
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 {
 	int cpu;
 
-	for_each_cpu_mask(cpu, mask)
+	for_each_cpu(cpu, mask)
 		smp_ext_bitcall(cpu, ec_call_function);
 }
 
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 45e1708b70f..45a3e9a7ae2 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -75,7 +75,7 @@ __setup("vdso=", vdso_setup);
 static union {
 	struct vdso_data	data;
 	u8			page[PAGE_SIZE];
-} vdso_data_store __attribute__((__section__(".data.page_aligned")));
+} vdso_data_store __page_aligned_data;
 struct vdso_data *vdso_data = &vdso_data_store.data;
 
 /*
diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile
index ca78ad60ba2..d13e8755a8c 100644
--- a/arch/s390/kernel/vdso32/Makefile
+++ b/arch/s390/kernel/vdso32/Makefile
@@ -13,7 +13,7 @@ KBUILD_AFLAGS_31 += -m31 -s
 KBUILD_CFLAGS_31 := $(filter-out -m64,$(KBUILD_CFLAGS))
 KBUILD_CFLAGS_31 += -m31 -fPIC -shared -fno-common -fno-builtin
 KBUILD_CFLAGS_31 += -nostdlib -Wl,-soname=linux-vdso32.so.1 \
-			$(call ld-option, -Wl$(comma)--hash-style=sysv)
+			$(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
 
 $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_31)
 $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_31)
diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S
index 61639a89e70..ae42f8ce350 100644
--- a/arch/s390/kernel/vdso32/vdso32_wrapper.S
+++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S
@@ -1,7 +1,8 @@
 #include <linux/init.h>
+#include <linux/linkage.h>
 #include <asm/page.h>
 
-	.section ".data.page_aligned"
+	__PAGE_ALIGNED_DATA
 
 	.globl vdso32_start, vdso32_end
 	.balign PAGE_SIZE
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index 6fc8e829258..449352dda9c 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -13,7 +13,7 @@ KBUILD_AFLAGS_64 += -m64 -s
 KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS))
 KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin
 KBUILD_CFLAGS_64 += -nostdlib -Wl,-soname=linux-vdso64.so.1 \
-			$(call ld-option, -Wl$(comma)--hash-style=sysv)
+			$(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
 
 $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64)
 $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64)
diff --git a/arch/s390/kernel/vdso64/vdso64_wrapper.S b/arch/s390/kernel/vdso64/vdso64_wrapper.S
index d8e2ac14d56..c245842b516 100644
--- a/arch/s390/kernel/vdso64/vdso64_wrapper.S
+++ b/arch/s390/kernel/vdso64/vdso64_wrapper.S
@@ -1,7 +1,8 @@
 #include <linux/init.h>
+#include <linux/linkage.h>
 #include <asm/page.h>
 
-	.section ".data.page_aligned"
+	__PAGE_ALIGNED_DATA
 
 	.globl vdso64_start, vdso64_end
 	.balign PAGE_SIZE
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index 413c240cbca..b201135cc18 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -262,7 +262,7 @@ cmm_skip_blanks(char *cp, char **endp)
 static struct ctl_table cmm_table[];
 
 static int
-cmm_pages_handler(ctl_table *ctl, int write, struct file *filp,
+cmm_pages_handler(ctl_table *ctl, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char buf[16], *p;
@@ -303,7 +303,7 @@ cmm_pages_handler(ctl_table *ctl, int write, struct file *filp,
 }
 
 static int
-cmm_timeout_handler(ctl_table *ctl, int write, struct file *filp,
+cmm_timeout_handler(ctl_table *ctl, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char buf[64], *p;
diff --git a/arch/score/kernel/init_task.c b/arch/score/kernel/init_task.c
index ff952f6c63f..baa03ee217d 100644
--- a/arch/score/kernel/init_task.c
+++ b/arch/score/kernel/init_task.c
@@ -34,9 +34,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"), __aligned__(THREAD_SIZE))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 /*
  * Initial task structure.
diff --git a/arch/sh/boot/compressed/install.sh b/arch/sh/boot/compressed/install.sh
index 90589f0fec1..f9f41818b17 100644
--- a/arch/sh/boot/compressed/install.sh
+++ b/arch/sh/boot/compressed/install.sh
@@ -23,8 +23,8 @@
 
 # User may have a custom install script
 
-if [ -x /sbin/installkernel ]; then
-  exec /sbin/installkernel "$@"
+if [ -x /sbin/${INSTALLKERNEL} ]; then
+  exec /sbin/${INSTALLKERNEL} "$@"
 fi
 
 if [ "$2" = "zImage" ]; then
diff --git a/arch/sh/drivers/dma/Kconfig b/arch/sh/drivers/dma/Kconfig
index b91fa8dbf04..4d58eb0973d 100644
--- a/arch/sh/drivers/dma/Kconfig
+++ b/arch/sh/drivers/dma/Kconfig
@@ -1,12 +1,9 @@
 menu "DMA support"
 
-config SH_DMA_API
-	bool
 
 config SH_DMA
 	bool "SuperH on-chip DMA controller (DMAC) support"
 	depends on CPU_SH3 || CPU_SH4
-	select SH_DMA_API
 	default n
 
 config SH_DMA_IRQ_MULTI
@@ -19,6 +16,15 @@ config SH_DMA_IRQ_MULTI
 		     CPU_SUBTYPE_SH7780  || CPU_SUBTYPE_SH7785  || \
 		     CPU_SUBTYPE_SH7760
 
+config SH_DMA_API
+	depends on SH_DMA
+	bool "SuperH DMA API support"
+	default n
+	help
+	  SH_DMA_API always enabled DMA API of used SuperH.
+	  If you want to use DMA ENGINE, you must not enable this.
+	  Please enable DMA_ENGINE and SH_DMAE.
+
 config NR_ONCHIP_DMA_CHANNELS
 	int
 	depends on SH_DMA
diff --git a/arch/sh/drivers/dma/Makefile b/arch/sh/drivers/dma/Makefile
index c6068137b46..d88c9484762 100644
--- a/arch/sh/drivers/dma/Makefile
+++ b/arch/sh/drivers/dma/Makefile
@@ -2,8 +2,7 @@
 # Makefile for the SuperH DMA specific kernel interface routines under Linux.
 #
 
-obj-$(CONFIG_SH_DMA_API)	+= dma-api.o dma-sysfs.o
-obj-$(CONFIG_SH_DMA)		+= dma-sh.o
+obj-$(CONFIG_SH_DMA_API)	+= dma-sh.o dma-api.o dma-sysfs.o
 obj-$(CONFIG_PVR2_DMA)		+= dma-pvr2.o
 obj-$(CONFIG_G2_DMA)		+= dma-g2.o
 obj-$(CONFIG_SH_DMABRG)		+= dmabrg.o
diff --git a/arch/sh/include/asm/dma-sh.h b/arch/sh/include/asm/dma-sh.h
index 68a5f4cb034..78eed3e0bdf 100644
--- a/arch/sh/include/asm/dma-sh.h
+++ b/arch/sh/include/asm/dma-sh.h
@@ -116,4 +116,17 @@ static u32 dma_base_addr[] __maybe_unused = {
 #define CHCR    0x0C
 #define DMAOR	0x40
 
+/*
+ * for dma engine
+ *
+ * SuperH DMA mode
+ */
+#define SHDMA_MIX_IRQ	(1 << 1)
+#define SHDMA_DMAOR1	(1 << 2)
+#define SHDMA_DMAE1		(1 << 3)
+
+struct sh_dmae_pdata {
+	unsigned int mode;
+};
+
 #endif /* __DMA_SH_H */
diff --git a/arch/sh/include/asm/smp.h b/arch/sh/include/asm/smp.h
index ca64f43abe6..53ef26ced75 100644
--- a/arch/sh/include/asm/smp.h
+++ b/arch/sh/include/asm/smp.h
@@ -44,7 +44,6 @@ void plat_send_ipi(unsigned int cpu, unsigned int message);
 
 void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
-#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
 
 #else
 
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index f8c40cc6505..65e7bd2f224 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -31,7 +31,6 @@
 #define cpu_to_node(cpu)	((void)(cpu),0)
 #define parent_node(node)	((void)(node),0)
 
-#define node_to_cpumask(node)	((void)node, cpu_online_map)
 #define cpumask_of_node(node)	((void)node, cpu_online_mask)
 
 #define pcibus_to_node(bus)	((void)(bus), -1)
diff --git a/arch/sh/kernel/init_task.c b/arch/sh/kernel/init_task.c
index 1719957c0a6..11f2ea556a6 100644
--- a/arch/sh/kernel/init_task.c
+++ b/arch/sh/kernel/init_task.c
@@ -17,9 +17,8 @@ struct pt_regs fake_swapper_regs;
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 /*
  * Initial task structure.
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index 60f8af4497c..7cb933ba495 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -165,11 +165,9 @@ asmlinkage int do_IRQ(unsigned int irq, struct pt_regs *regs)
 }
 
 #ifdef CONFIG_IRQSTACKS
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
-		__attribute__((__section__(".bss.page_aligned")));
+static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
 
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
-		__attribute__((__section__(".bss.page_aligned")));
+static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
 
 /*
  * allocate per-cpu stacks for hardirq and for softirq processing
diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c
index 63ba12836ea..eb68bfdd86e 100644
--- a/arch/sh/kernel/sys_sh32.c
+++ b/arch/sh/kernel/sys_sh32.c
@@ -9,7 +9,6 @@
 #include <linux/syscalls.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/ipc.h>
diff --git a/arch/sh/kernel/sys_sh64.c b/arch/sh/kernel/sys_sh64.c
index 91fb8445a5a..287235768bc 100644
--- a/arch/sh/kernel/sys_sh64.c
+++ b/arch/sh/kernel/sys_sh64.c
@@ -23,7 +23,6 @@
 #include <linux/stat.h>
 #include <linux/mman.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/syscalls.h>
 #include <linux/ipc.h>
 #include <asm/uaccess.h>
diff --git a/arch/sh/kernel/vsyscall/Makefile b/arch/sh/kernel/vsyscall/Makefile
index 4bbce1cfa35..8f0ea5fc835 100644
--- a/arch/sh/kernel/vsyscall/Makefile
+++ b/arch/sh/kernel/vsyscall/Makefile
@@ -15,7 +15,7 @@ quiet_cmd_syscall = SYSCALL $@
 export CPPFLAGS_vsyscall.lds += -P -C -Ush
 
 vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
-		$(call ld-option, -Wl$(comma)--hash-style=sysv)
+		$(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
 
 SYSCFLAGS_vsyscall-trapa.so	= $(vsyscall-flags)
 
diff --git a/arch/sparc/Makefile b/arch/sparc/Makefile
index 467221dd570..dfe272d1446 100644
--- a/arch/sparc/Makefile
+++ b/arch/sparc/Makefile
@@ -31,7 +31,6 @@ export BITS    := 32
 #KBUILD_CFLAGS += -g -pipe -fcall-used-g5 -fcall-used-g7
 KBUILD_CFLAGS += -m32 -pipe -mno-fpu -fcall-used-g5 -fcall-used-g7
 KBUILD_AFLAGS += -m32
-CPPFLAGS_vmlinux.lds += -m32
 
 #LDFLAGS_vmlinux = -N -Ttext 0xf0004000
 #  Since 2.5.40, the first stage is left not btfix-ed.
@@ -45,9 +44,6 @@ else
 
 CHECKFLAGS      += -D__sparc__ -D__sparc_v9__ -D__arch64__ -m64
 
-# Undefine sparc when processing vmlinux.lds - it is used
-# And teach CPP we are doing 64 bit builds (for this case)
-CPPFLAGS_vmlinux.lds += -m64 -Usparc
 LDFLAGS              := -m elf64_sparc
 export BITS          := 64
 
diff --git a/arch/sparc/include/asm/smp_64.h b/arch/sparc/include/asm/smp_64.h
index becb6bf353a..f49e11cd4de 100644
--- a/arch/sparc/include/asm/smp_64.h
+++ b/arch/sparc/include/asm/smp_64.h
@@ -36,7 +36,6 @@ extern int sparc64_multi_core;
 
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
-#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
 
 /*
  *	General functions that each host system must provide.
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 26cd25c0839..600a79035fa 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -12,22 +12,8 @@ static inline int cpu_to_node(int cpu)
 
 #define parent_node(node)	(node)
 
-static inline cpumask_t node_to_cpumask(int node)
-{
-	return numa_cpumask_lookup_table[node];
-}
 #define cpumask_of_node(node) (&numa_cpumask_lookup_table[node])
 
-/*
- * Returns a pointer to the cpumask of CPUs on Node 'node'.
- * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
- */
-#define node_to_cpumask_ptr(v, node)		\
-		cpumask_t *v = &(numa_cpumask_lookup_table[node])
-
-#define node_to_cpumask_ptr_next(v, node)	\
-			   v = &(numa_cpumask_lookup_table[node])
-
 struct pci_bus;
 #ifdef CONFIG_PCI
 extern int pcibus_to_node(struct pci_bus *pbus);
@@ -71,8 +57,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 #ifdef CONFIG_SMP
 #define topology_physical_package_id(cpu)	(cpu_data(cpu).proc_id)
 #define topology_core_id(cpu)			(cpu_data(cpu).core_id)
-#define topology_core_siblings(cpu)		(cpu_core_map[cpu])
-#define topology_thread_siblings(cpu)		(per_cpu(cpu_sibling_map, cpu))
 #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
 #define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
 #define mc_capable()				(sparc64_multi_core)
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 3a048fad7ee..5b47fab9966 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -7,7 +7,11 @@ ccflags-y := -Werror
 
 extra-y     := head_$(BITS).o
 extra-y     += init_task.o
-extra-y     += vmlinux.lds
+
+# Undefine sparc when processing vmlinux.lds - it is used
+# And teach CPP we are doing $(BITS) builds (for this case)
+CPPFLAGS_vmlinux.lds := -Usparc -m$(BITS)
+extra-y              += vmlinux.lds
 
 obj-$(CONFIG_SPARC32)   += entry.o wof.o wuf.o
 obj-$(CONFIG_SPARC32)   += etrap_32.o
diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c
index 28125c5b3d3..5fe3d65581f 100644
--- a/arch/sparc/kernel/init_task.c
+++ b/arch/sparc/kernel/init_task.c
@@ -18,6 +18,5 @@ EXPORT_SYMBOL(init_task);
  * If this is not aligned on a 8k boundry, then you should change code
  * in etrap.S which assumes it.
  */
-union thread_union init_thread_union
-	__attribute__((section (".data.init_task")))
-	= { INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index f5000a460c0..04e28b2671c 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -16,7 +16,6 @@
 #include <linux/signal.h>
 #include <linux/resource.h>
 #include <linux/times.h>
-#include <linux/utsname.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
diff --git a/arch/sparc/kernel/systbls.h b/arch/sparc/kernel/systbls.h
index 15c2d752b2b..a63c5d2d984 100644
--- a/arch/sparc/kernel/systbls.h
+++ b/arch/sparc/kernel/systbls.h
@@ -3,10 +3,11 @@
 
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/utsname.h>
 #include <asm/utrap.h>
 #include <asm/signal.h>
 
+struct new_utsname;
+
 extern asmlinkage unsigned long sys_getpagesize(void);
 extern asmlinkage unsigned long sparc_brk(unsigned long brk);
 extern asmlinkage long sparc_pipe(struct pt_regs *regs);
diff --git a/arch/um/Makefile b/arch/um/Makefile
index 0728def3223..fc633dbacf8 100644
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -96,11 +96,10 @@ CFLAGS_NO_HARDENING := $(call cc-option, -fno-PIC,) $(call cc-option, -fno-pic,)
 	$(call cc-option, -fno-stack-protector,) \
 	$(call cc-option, -fno-stack-protector-all,)
 
-CONFIG_KERNEL_STACK_ORDER ?= 2
-STACK_SIZE := $(shell echo $$[ 4096 * (1 << $(CONFIG_KERNEL_STACK_ORDER)) ] )
-
-CPPFLAGS_vmlinux.lds = -U$(SUBARCH) -DSTART=$(START) -DELF_ARCH=$(ELF_ARCH) \
-	-DELF_FORMAT="$(ELF_FORMAT)" -DKERNEL_STACK_SIZE=$(STACK_SIZE)
+# Options used by linker script
+export LDS_START      := $(START)
+export LDS_ELF_ARCH   := $(ELF_ARCH)
+export LDS_ELF_FORMAT := $(ELF_FORMAT)
 
 # The wrappers will select whether using "malloc" or the kernel allocator.
 LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index 54f42e8b010..34d813011b7 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -35,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	unsigned cpu = smp_processor_id();
 
 	if(prev != next){
-		cpu_clear(cpu, prev->cpu_vm_mask);
-		cpu_set(cpu, next->cpu_vm_mask);
+		cpumask_clear_cpu(cpu, mm_cpumask(prev));
+		cpumask_set_cpu(cpu, mm_cpumask(next));
 		if(next != &init_mm)
 			__switch_mm(&next->context.id);
 	}
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index 388ec0a3ea9..1119233597a 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -3,6 +3,9 @@
 # Licensed under the GPL
 #
 
+CPPFLAGS_vmlinux.lds := -U$(SUBARCH) -DSTART=$(LDS_START) \
+                        -DELF_ARCH=$(LDS_ELF_ARCH)        \
+                        -DELF_FORMAT=$(LDS_ELF_FORMAT)
 extra-y := vmlinux.lds
 clean-files :=
 
diff --git a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c
index b25121b537d..8aa77b61a5f 100644
--- a/arch/um/kernel/init_task.c
+++ b/arch/um/kernel/init_task.c
@@ -30,9 +30,8 @@ EXPORT_SYMBOL(init_task);
  * "init_task" linker map entry..
  */
 
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 union thread_union cpu0_irqstack
 	__attribute__((__section__(".data.init_irqstack"))) =
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index 98351c78bc8..106bf27e2a9 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -111,7 +111,7 @@ void smp_prepare_cpus(unsigned int maxcpus)
 	int i;
 
 	for (i = 0; i < ncpus; ++i)
-		cpu_set(i, cpu_possible_map);
+		set_cpu_possible(i, true);
 
 	cpu_clear(me, cpu_online_map);
 	cpu_set(me, cpu_online_map);
diff --git a/arch/um/kernel/vmlinux.lds.S b/arch/um/kernel/vmlinux.lds.S
index f8aeb448aab..16e49bfa2b4 100644
--- a/arch/um/kernel/vmlinux.lds.S
+++ b/arch/um/kernel/vmlinux.lds.S
@@ -1,3 +1,6 @@
+
+KERNEL_STACK_SIZE = 4096 * (1 << CONFIG_KERNEL_STACK_ORDER);
+
 #ifdef CONFIG_LD_SCRIPT_STATIC
 #include "uml.lds.S"
 #else
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 7983c420eaf..a012ee8ef80 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -179,8 +179,8 @@ archclean:
 define archhelp
   echo  '* bzImage      - Compressed kernel image (arch/x86/boot/bzImage)'
   echo  '  install      - Install kernel using'
-  echo  '                  (your) ~/bin/installkernel or'
-  echo  '                  (distribution) /sbin/installkernel or'
+  echo  '                  (your) ~/bin/$(INSTALLKERNEL) or'
+  echo  '                  (distribution) /sbin/$(INSTALLKERNEL) or'
   echo  '                  install to $$(INSTALL_PATH) and run lilo'
   echo  '  fdimage      - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
   echo  '  fdimage144   - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh
index 8d60ee15dfd..d13ec1c3864 100644
--- a/arch/x86/boot/install.sh
+++ b/arch/x86/boot/install.sh
@@ -33,8 +33,8 @@ verify "$3"
 
 # User may have a custom install script
 
-if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi
-if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi
+if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
+if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
 
 # Default install - same as make zlilo
 
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 5d367caa0e3..549860d3be8 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_CACHE_H
 #define _ASM_X86_CACHE_H
 
+#include <linux/linkage.h>
+
 /* L1 cache line size */
 #define L1_CACHE_SHIFT	(CONFIG_X86_L1_CACHE_SHIFT)
 #define L1_CACHE_BYTES	(1 << L1_CACHE_SHIFT)
@@ -13,7 +15,7 @@
 #ifdef CONFIG_SMP
 #define __cacheline_aligned_in_smp					\
 	__attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT))))	\
-	__attribute__((__section__(".data.page_aligned")))
+	__page_aligned_data
 #endif
 #endif
 
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index f923203dc39..4a2d4e0c18d 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -37,12 +37,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 
 	if (likely(prev != next)) {
 		/* stop flush ipis for the previous mm */
-		cpu_clear(cpu, prev->cpu_vm_mask);
+		cpumask_clear_cpu(cpu, mm_cpumask(prev));
 #ifdef CONFIG_SMP
 		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
 		percpu_write(cpu_tlbstate.active_mm, next);
 #endif
-		cpu_set(cpu, next->cpu_vm_mask);
+		cpumask_set_cpu(cpu, mm_cpumask(next));
 
 		/* Re-load page tables */
 		load_cr3(next->pgd);
@@ -58,7 +58,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
 		BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
 
-		if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+		if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
 			/* We were in lazy tlb mode and leave_mm disabled
 			 * tlb flush IPI delivery. We must reload CR3
 			 * to make sure to use no freed page tables.
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index e63cf7d441e..139d4c1a33a 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -40,8 +40,7 @@ extern unsigned int nmi_watchdog;
 #define NMI_INVALID	3
 
 struct ctl_table;
-struct file;
-extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
+extern int proc_nmi_enabled(struct ctl_table *, int ,
 			void __user *, size_t *, loff_t *);
 extern int unknown_nmi_panic;
 
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index f76a162c082..ada8c201d51 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -143,7 +143,11 @@ static inline int __pcibus_to_node(const struct pci_bus *bus)
 static inline const struct cpumask *
 cpumask_of_pcibus(const struct pci_bus *bus)
 {
-	return cpumask_of_node(__pcibus_to_node(bus));
+	int node;
+
+	node = __pcibus_to_node(bus);
+	return (node == -1) ? cpu_online_mask :
+			      cpumask_of_node(node);
 }
 #endif
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 6a84ed166ae..1e796782cd7 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -121,7 +121,6 @@ static inline void arch_send_call_function_single_ipi(int cpu)
 	smp_ops.send_call_func_single_ipi(cpu);
 }
 
-#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
 static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 {
 	smp_ops.send_call_func_ipi(mask);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 64970b9885f..dc69f28489f 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -227,17 +227,14 @@ static struct irq_cfg *get_one_free_irq_cfg(int node)
 
 	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
 	if (cfg) {
-		if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
+		if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
 			kfree(cfg);
 			cfg = NULL;
-		} else if (!alloc_cpumask_var_node(&cfg->old_domain,
+		} else if (!zalloc_cpumask_var_node(&cfg->old_domain,
 							  GFP_ATOMIC, node)) {
 			free_cpumask_var(cfg->domain);
 			kfree(cfg);
 			cfg = NULL;
-		} else {
-			cpumask_clear(cfg->domain);
-			cpumask_clear(cfg->old_domain);
 		}
 	}
 
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index cb66a22d98a..7ff61d6a188 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -508,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 /*
  * proc handler for /proc/sys/kernel/nmi
  */
-int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+int proc_nmi_enabled(struct ctl_table *table, int write,
 			void __user *buffer, size_t *length, loff_t *ppos)
 {
 	int old_state;
 
 	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
 	old_state = nmi_watchdog_enabled;
-	proc_dointvec(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, buffer, length, ppos);
 	if (!!old_state == !!nmi_watchdog_enabled)
 		return 0;
 
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bca5fba91c9..f7dd2a7c3bf 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -5,7 +5,6 @@
 #include <linux/kallsyms.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
-#include <linux/utsname.h>
 #include <linux/hardirq.h>
 #include <linux/kdebug.h>
 #include <linux/module.h>
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 54b0a327676..a071e6be177 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -5,7 +5,6 @@
 #include <linux/kallsyms.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
-#include <linux/utsname.h>
 #include <linux/hardirq.h>
 #include <linux/kdebug.h>
 #include <linux/module.h>
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index b766e8c7252..218aad7ee76 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -608,7 +608,7 @@ ENTRY(initial_code)
 /*
  * BSS section
  */
-.section ".bss.page_aligned","wa"
+__PAGE_ALIGNED_BSS
 	.align PAGE_SIZE_asm
 #ifdef CONFIG_X86_PAE
 swapper_pg_pmd:
@@ -626,7 +626,7 @@ ENTRY(empty_zero_page)
  * This starts the data section.
  */
 #ifdef CONFIG_X86_PAE
-.section ".data.page_aligned","wa"
+__PAGE_ALIGNED_DATA
 	/* Page-aligned for the benefit of paravirt? */
 	.align PAGE_SIZE_asm
 ENTRY(swapper_pg_dir)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index fa54f78e2a0..d0bc0a13a43 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -418,7 +418,7 @@ ENTRY(phys_base)
 ENTRY(idt_table)
 	.skip IDT_ENTRIES * 16
 
-	.section .bss.page_aligned, "aw", @nobits
+	__PAGE_ALIGNED_BSS
 	.align PAGE_SIZE
 ENTRY(empty_zero_page)
 	.skip PAGE_SIZE
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 270ff83efc1..3a54dcb9cd0 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 /*
  * Initial task structure.
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 71f1d99a635..ec6ef60cbd1 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -67,8 +67,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 #ifdef CONFIG_SMP
 		preempt_disable();
 		load_LDT(pc);
-		if (!cpus_equal(current->mm->cpu_vm_mask,
-				cpumask_of_cpu(smp_processor_id())))
+		if (!cpumask_equal(mm_cpumask(current->mm),
+				   cpumask_of(smp_processor_id())))
 			smp_call_function(flush_ldt, current->mm, 1);
 		preempt_enable();
 #else
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 847ab416031..5284cd2b577 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -555,10 +555,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 void __init init_c1e_mask(void)
 {
 	/* If we're using c1e_idle, we need to allocate c1e_mask. */
-	if (pm_idle == c1e_idle) {
-		alloc_cpumask_var(&c1e_mask, GFP_KERNEL);
-		cpumask_clear(c1e_mask);
-	}
+	if (pm_idle == c1e_idle)
+		zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
 }
 
 static int __init idle_setup(char *str)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 09c5e077dff..565ebc65920 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1059,12 +1059,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 #endif
 	current_thread_info()->cpu = 0;  /* needed? */
 	for_each_possible_cpu(i) {
-		alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
-		alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
-		alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
-		cpumask_clear(per_cpu(cpu_core_map, i));
-		cpumask_clear(per_cpu(cpu_sibling_map, i));
-		cpumask_clear(cpu_data(i).llc_shared_map);
+		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
 	}
 	set_cpu_sibling_map(0);
 
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index e293ac56c72..dcb00d27851 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -93,7 +93,6 @@ static struct irqaction irq0  = {
 
 void __init setup_default_timer_irq(void)
 {
-	irq0.mask = cpumask_of_cpu(0);
 	setup_irq(0, &irq0);
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9346e102338..a665c71352b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -14,7 +14,6 @@
 #include <linux/spinlock.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
-#include <linux/utsname.h>
 #include <linux/kdebug.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index cf53a78e2dc..8cb4974ff59 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -228,19 +228,11 @@ static long __vsyscall(3) venosys_1(void)
 }
 
 #ifdef CONFIG_SYSCTL
-
-static int
-vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
-		       void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
-}
-
 static ctl_table kernel_table2[] = {
 	{ .procname = "vsyscall64",
 	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
 	  .mode = 0644,
-	  .proc_handler = vsyscall_sysctl_change },
+	  .proc_handler = proc_dointvec },
 	{}
 };
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 82728f2c6d5..f4cee9028cf 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -167,6 +167,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
 	info.si_errno	= 0;
 	info.si_code	= si_code;
 	info.si_addr	= (void __user *)address;
+	info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
 
 	force_sig_info(si_signo, &info, tsk);
 }
@@ -790,10 +791,12 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code,
 }
 
 static void
-do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+	  unsigned int fault)
 {
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
+	int code = BUS_ADRERR;
 
 	up_read(&mm->mmap_sem);
 
@@ -809,7 +812,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 	tsk->thread.error_code	= error_code;
 	tsk->thread.trap_no	= 14;
 
-	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+#ifdef CONFIG_MEMORY_FAILURE
+	if (fault & VM_FAULT_HWPOISON) {
+		printk(KERN_ERR
+	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
+			tsk->comm, tsk->pid, address);
+		code = BUS_MCEERR_AR;
+	}
+#endif
+	force_sig_info_fault(SIGBUS, code, address, tsk);
 }
 
 static noinline void
@@ -819,8 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	if (fault & VM_FAULT_OOM) {
 		out_of_memory(regs, error_code, address);
 	} else {
-		if (fault & VM_FAULT_SIGBUS)
-			do_sigbus(regs, error_code, address);
+		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+			do_sigbus(regs, error_code, address, fault);
 		else
 			BUG();
 	}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 24952fdc7e4..dd38bfbefd1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -144,6 +144,7 @@ void clflush_cache_range(void *vaddr, unsigned int size)
 
 	mb();
 }
+EXPORT_SYMBOL_GPL(clflush_cache_range);
 
 static void __cpa_flush_all(void *arg)
 {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c814e144a3f..36fe08eeb5c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -59,7 +59,8 @@ void leave_mm(int cpu)
 {
 	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
 		BUG();
-	cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
+	cpumask_clear_cpu(cpu,
+			  mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
 	load_cr3(swapper_pg_dir);
 }
 EXPORT_SYMBOL_GPL(leave_mm);
@@ -234,8 +235,8 @@ void flush_tlb_current_task(void)
 	preempt_disable();
 
 	local_flush_tlb();
-	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
 	preempt_enable();
 }
 
@@ -249,8 +250,8 @@ void flush_tlb_mm(struct mm_struct *mm)
 		else
 			leave_mm(smp_processor_id());
 	}
-	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
 
 	preempt_enable();
 }
@@ -268,8 +269,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 			leave_mm(smp_processor_id());
 	}
 
-	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(&mm->cpu_vm_mask, mm, va);
+	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(mm_cpumask(mm), mm, va);
 
 	preempt_enable();
 }
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 5db96d4304d..1331fcf2614 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -646,7 +646,7 @@ int get_mp_bus_to_node(int busnum)
 
 #else /* CONFIG_X86_32 */
 
-static unsigned char mp_bus_to_node[BUS_NR] = {
+static int mp_bus_to_node[BUS_NR] = {
 	[0 ... BUS_NR - 1] = -1
 };
 
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 88112b49f02..6b4ffedb93c 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -122,7 +122,7 @@ quiet_cmd_vdso = VDSO    $@
 		       $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
 		       -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
 
-VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv)
+VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
 GCOV_PROFILE := n
 
 #
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 093dd59b538..3bf7b1d250c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1165,14 +1165,14 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
 	/* Get the "official" set of cpus referring to our pagetable. */
 	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
 		for_each_online_cpu(cpu) {
-			if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
+			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
 			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
 				continue;
 			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
 		}
 		return;
 	}
-	cpumask_copy(mask, &mm->cpu_vm_mask);
+	cpumask_copy(mask, mm_cpumask(mm));
 
 	/* It's possible that a vcpu may have a stale reference to our
 	   cr3, because its in lazy mode, and it hasn't yet flushed
diff --git a/arch/xtensa/kernel/Makefile b/arch/xtensa/kernel/Makefile
index fe3186de6a3..6f56d95f2c1 100644
--- a/arch/xtensa/kernel/Makefile
+++ b/arch/xtensa/kernel/Makefile
@@ -27,7 +27,8 @@ sed-y = -e 's/(\(\.[a-z]*it\|\.ref\|\)\.text)/(\1.literal \1.text)/g'	\
 	-e 's/(\(\.text\.[a-z]*\))/(\1.literal \1)/g'
 
 quiet_cmd__cpp_lds_S = LDS     $@
-      cmd__cpp_lds_S = $(CPP) $(cpp_flags) -D__ASSEMBLY__ $< | sed $(sed-y) >$@
+      cmd__cpp_lds_S = $(CPP) $(cpp_flags) -P -C -Uxtensa -D__ASSEMBLY__ $< \
+                       | sed $(sed-y) >$@
 
 $(obj)/vmlinux.lds: $(src)/vmlinux.lds.S FORCE
 	$(call if_changed_dep,_cpp_lds_S)
diff --git a/arch/xtensa/kernel/head.S b/arch/xtensa/kernel/head.S
index d9ddc1ba761..d215adcfd4e 100644
--- a/arch/xtensa/kernel/head.S
+++ b/arch/xtensa/kernel/head.S
@@ -235,7 +235,7 @@ should_never_return:
  * BSS section
  */
 	
-.section ".bss.page_aligned", "w"
+__PAGE_ALIGNED_BSS
 #ifdef CONFIG_MMU
 ENTRY(swapper_pg_dir)
 	.fill	PAGE_SIZE, 1, 0
diff --git a/arch/xtensa/kernel/init_task.c b/arch/xtensa/kernel/init_task.c
index c4302f0e4ba..cd122fb7e48 100644
--- a/arch/xtensa/kernel/init_task.c
+++ b/arch/xtensa/kernel/init_task.c
@@ -23,9 +23,8 @@
 
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 struct task_struct init_task = INIT_TASK(init_task);
 
diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig
index d8fb3914598..e5aeb2b79e6 100644
--- a/crypto/async_tx/Kconfig
+++ b/crypto/async_tx/Kconfig
@@ -14,3 +14,12 @@ config ASYNC_MEMSET
 	tristate
 	select ASYNC_CORE
 
+config ASYNC_PQ
+	tristate
+	select ASYNC_CORE
+
+config ASYNC_RAID6_RECOV
+	tristate
+	select ASYNC_CORE
+	select ASYNC_PQ
+
diff --git a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile
index 27baa7d52fb..d1e0e6f72bc 100644
--- a/crypto/async_tx/Makefile
+++ b/crypto/async_tx/Makefile
@@ -2,3 +2,6 @@ obj-$(CONFIG_ASYNC_CORE) += async_tx.o
 obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o
 obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o
 obj-$(CONFIG_ASYNC_XOR) += async_xor.o
+obj-$(CONFIG_ASYNC_PQ) += async_pq.o
+obj-$(CONFIG_ASYNC_RAID6_RECOV) += async_raid6_recov.o
+obj-$(CONFIG_ASYNC_RAID6_TEST) += raid6test.o
diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c
index ddccfb01c41..0ec1fb69d4e 100644
--- a/crypto/async_tx/async_memcpy.c
+++ b/crypto/async_tx/async_memcpy.c
@@ -33,28 +33,31 @@
  * async_memcpy - attempt to copy memory with a dma engine.
  * @dest: destination page
  * @src: src page
- * @offset: offset in pages to start transaction
+ * @dest_offset: offset into 'dest' to start transaction
+ * @src_offset: offset into 'src' to start transaction
  * @len: length in bytes
- * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK,
- * @depend_tx: memcpy depends on the result of this transaction
- * @cb_fn: function to call when the memcpy completes
- * @cb_param: parameter to pass to the callback routine
+ * @submit: submission / completion modifiers
+ *
+ * honored flags: ASYNC_TX_ACK
  */
 struct dma_async_tx_descriptor *
 async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
-	unsigned int src_offset, size_t len, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_param)
+	     unsigned int src_offset, size_t len,
+	     struct async_submit_ctl *submit)
 {
-	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMCPY,
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMCPY,
 						      &dest, 1, &src, 1, len);
 	struct dma_device *device = chan ? chan->device : NULL;
 	struct dma_async_tx_descriptor *tx = NULL;
 
-	if (device) {
+	if (device && is_dma_copy_aligned(device, src_offset, dest_offset, len)) {
 		dma_addr_t dma_dest, dma_src;
-		unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
+		unsigned long dma_prep_flags = 0;
 
+		if (submit->cb_fn)
+			dma_prep_flags |= DMA_PREP_INTERRUPT;
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_prep_flags |= DMA_PREP_FENCE;
 		dma_dest = dma_map_page(device->dev, dest, dest_offset, len,
 					DMA_FROM_DEVICE);
 
@@ -67,13 +70,13 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
 
 	if (tx) {
 		pr_debug("%s: (async) len: %zu\n", __func__, len);
-		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+		async_tx_submit(chan, tx, submit);
 	} else {
 		void *dest_buf, *src_buf;
 		pr_debug("%s: (sync) len: %zu\n", __func__, len);
 
 		/* wait for any prerequisite operations */
-		async_tx_quiesce(&depend_tx);
+		async_tx_quiesce(&submit->depend_tx);
 
 		dest_buf = kmap_atomic(dest, KM_USER0) + dest_offset;
 		src_buf = kmap_atomic(src, KM_USER1) + src_offset;
@@ -83,26 +86,13 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
 		kunmap_atomic(dest_buf, KM_USER0);
 		kunmap_atomic(src_buf, KM_USER1);
 
-		async_tx_sync_epilog(cb_fn, cb_param);
+		async_tx_sync_epilog(submit);
 	}
 
 	return tx;
 }
 EXPORT_SYMBOL_GPL(async_memcpy);
 
-static int __init async_memcpy_init(void)
-{
-	return 0;
-}
-
-static void __exit async_memcpy_exit(void)
-{
-	do { } while (0);
-}
-
-module_init(async_memcpy_init);
-module_exit(async_memcpy_exit);
-
 MODULE_AUTHOR("Intel Corporation");
 MODULE_DESCRIPTION("asynchronous memcpy api");
 MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_memset.c b/crypto/async_tx/async_memset.c
index 5b5eb99bb24..58e4a8752ae 100644
--- a/crypto/async_tx/async_memset.c
+++ b/crypto/async_tx/async_memset.c
@@ -35,26 +35,26 @@
  * @val: fill value
  * @offset: offset in pages to start transaction
  * @len: length in bytes
- * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
- * @depend_tx: memset depends on the result of this transaction
- * @cb_fn: function to call when the memcpy completes
- * @cb_param: parameter to pass to the callback routine
+ *
+ * honored flags: ASYNC_TX_ACK
  */
 struct dma_async_tx_descriptor *
-async_memset(struct page *dest, int val, unsigned int offset,
-	size_t len, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_param)
+async_memset(struct page *dest, int val, unsigned int offset, size_t len,
+	     struct async_submit_ctl *submit)
 {
-	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMSET,
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMSET,
 						      &dest, 1, NULL, 0, len);
 	struct dma_device *device = chan ? chan->device : NULL;
 	struct dma_async_tx_descriptor *tx = NULL;
 
-	if (device) {
+	if (device && is_dma_fill_aligned(device, offset, 0, len)) {
 		dma_addr_t dma_dest;
-		unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
+		unsigned long dma_prep_flags = 0;
 
+		if (submit->cb_fn)
+			dma_prep_flags |= DMA_PREP_INTERRUPT;
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_prep_flags |= DMA_PREP_FENCE;
 		dma_dest = dma_map_page(device->dev, dest, offset, len,
 					DMA_FROM_DEVICE);
 
@@ -64,38 +64,25 @@ async_memset(struct page *dest, int val, unsigned int offset,
 
 	if (tx) {
 		pr_debug("%s: (async) len: %zu\n", __func__, len);
-		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+		async_tx_submit(chan, tx, submit);
 	} else { /* run the memset synchronously */
 		void *dest_buf;
 		pr_debug("%s: (sync) len: %zu\n", __func__, len);
 
-		dest_buf = (void *) (((char *) page_address(dest)) + offset);
+		dest_buf = page_address(dest) + offset;
 
 		/* wait for any prerequisite operations */
-		async_tx_quiesce(&depend_tx);
+		async_tx_quiesce(&submit->depend_tx);
 
 		memset(dest_buf, val, len);
 
-		async_tx_sync_epilog(cb_fn, cb_param);
+		async_tx_sync_epilog(submit);
 	}
 
 	return tx;
 }
 EXPORT_SYMBOL_GPL(async_memset);
 
-static int __init async_memset_init(void)
-{
-	return 0;
-}
-
-static void __exit async_memset_exit(void)
-{
-	do { } while (0);
-}
-
-module_init(async_memset_init);
-module_exit(async_memset_exit);
-
 MODULE_AUTHOR("Intel Corporation");
 MODULE_DESCRIPTION("asynchronous memset api");
 MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
new file mode 100644
index 00000000000..b88db6d1dc6
--- /dev/null
+++ b/crypto/async_tx/async_pq.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright(c) 2007 Yuri Tikhonov <yur@emcraft.com>
+ * Copyright(c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/dma-mapping.h>
+#include <linux/raid/pq.h>
+#include <linux/async_tx.h>
+
+/**
+ * scribble - space to hold throwaway P buffer for synchronous gen_syndrome
+ */
+static struct page *scribble;
+
+static bool is_raid6_zero_block(struct page *p)
+{
+	return p == (void *) raid6_empty_zero_page;
+}
+
+/* the struct page *blocks[] parameter passed to async_gen_syndrome()
+ * and async_syndrome_val() contains the 'P' destination address at
+ * blocks[disks-2] and the 'Q' destination address at blocks[disks-1]
+ *
+ * note: these are macros as they are used as lvalues
+ */
+#define P(b, d) (b[d-2])
+#define Q(b, d) (b[d-1])
+
+/**
+ * do_async_gen_syndrome - asynchronously calculate P and/or Q
+ */
+static __async_inline struct dma_async_tx_descriptor *
+do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks,
+		      const unsigned char *scfs, unsigned int offset, int disks,
+		      size_t len, dma_addr_t *dma_src,
+		      struct async_submit_ctl *submit)
+{
+	struct dma_async_tx_descriptor *tx = NULL;
+	struct dma_device *dma = chan->device;
+	enum dma_ctrl_flags dma_flags = 0;
+	enum async_tx_flags flags_orig = submit->flags;
+	dma_async_tx_callback cb_fn_orig = submit->cb_fn;
+	dma_async_tx_callback cb_param_orig = submit->cb_param;
+	int src_cnt = disks - 2;
+	unsigned char coefs[src_cnt];
+	unsigned short pq_src_cnt;
+	dma_addr_t dma_dest[2];
+	int src_off = 0;
+	int idx;
+	int i;
+
+	/* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */
+	if (P(blocks, disks))
+		dma_dest[0] = dma_map_page(dma->dev, P(blocks, disks), offset,
+					   len, DMA_BIDIRECTIONAL);
+	else
+		dma_flags |= DMA_PREP_PQ_DISABLE_P;
+	if (Q(blocks, disks))
+		dma_dest[1] = dma_map_page(dma->dev, Q(blocks, disks), offset,
+					   len, DMA_BIDIRECTIONAL);
+	else
+		dma_flags |= DMA_PREP_PQ_DISABLE_Q;
+
+	/* convert source addresses being careful to collapse 'empty'
+	 * sources and update the coefficients accordingly
+	 */
+	for (i = 0, idx = 0; i < src_cnt; i++) {
+		if (is_raid6_zero_block(blocks[i]))
+			continue;
+		dma_src[idx] = dma_map_page(dma->dev, blocks[i], offset, len,
+					    DMA_TO_DEVICE);
+		coefs[idx] = scfs[i];
+		idx++;
+	}
+	src_cnt = idx;
+
+	while (src_cnt > 0) {
+		submit->flags = flags_orig;
+		pq_src_cnt = min(src_cnt, dma_maxpq(dma, dma_flags));
+		/* if we are submitting additional pqs, leave the chain open,
+		 * clear the callback parameters, and leave the destination
+		 * buffers mapped
+		 */
+		if (src_cnt > pq_src_cnt) {
+			submit->flags &= ~ASYNC_TX_ACK;
+			submit->flags |= ASYNC_TX_FENCE;
+			dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP;
+			submit->cb_fn = NULL;
+			submit->cb_param = NULL;
+		} else {
+			dma_flags &= ~DMA_COMPL_SKIP_DEST_UNMAP;
+			submit->cb_fn = cb_fn_orig;
+			submit->cb_param = cb_param_orig;
+			if (cb_fn_orig)
+				dma_flags |= DMA_PREP_INTERRUPT;
+		}
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_flags |= DMA_PREP_FENCE;
+
+		/* Since we have clobbered the src_list we are committed
+		 * to doing this asynchronously.  Drivers force forward
+		 * progress in case they can not provide a descriptor
+		 */
+		for (;;) {
+			tx = dma->device_prep_dma_pq(chan, dma_dest,
+						     &dma_src[src_off],
+						     pq_src_cnt,
+						     &coefs[src_off], len,
+						     dma_flags);
+			if (likely(tx))
+				break;
+			async_tx_quiesce(&submit->depend_tx);
+			dma_async_issue_pending(chan);
+		}
+
+		async_tx_submit(chan, tx, submit);
+		submit->depend_tx = tx;
+
+		/* drop completed sources */
+		src_cnt -= pq_src_cnt;
+		src_off += pq_src_cnt;
+
+		dma_flags |= DMA_PREP_CONTINUE;
+	}
+
+	return tx;
+}
+
+/**
+ * do_sync_gen_syndrome - synchronously calculate a raid6 syndrome
+ */
+static void
+do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
+		     size_t len, struct async_submit_ctl *submit)
+{
+	void **srcs;
+	int i;
+
+	if (submit->scribble)
+		srcs = submit->scribble;
+	else
+		srcs = (void **) blocks;
+
+	for (i = 0; i < disks; i++) {
+		if (is_raid6_zero_block(blocks[i])) {
+			BUG_ON(i > disks - 3); /* P or Q can't be zero */
+			srcs[i] = blocks[i];
+		} else
+			srcs[i] = page_address(blocks[i]) + offset;
+	}
+	raid6_call.gen_syndrome(disks, len, srcs);
+	async_tx_sync_epilog(submit);
+}
+
+/**
+ * async_gen_syndrome - asynchronously calculate a raid6 syndrome
+ * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1
+ * @offset: common offset into each block (src and dest) to start transaction
+ * @disks: number of blocks (including missing P or Q, see below)
+ * @len: length of operation in bytes
+ * @submit: submission/completion modifiers
+ *
+ * General note: This routine assumes a field of GF(2^8) with a
+ * primitive polynomial of 0x11d and a generator of {02}.
+ *
+ * 'disks' note: callers can optionally omit either P or Q (but not
+ * both) from the calculation by setting blocks[disks-2] or
+ * blocks[disks-1] to NULL.  When P or Q is omitted 'len' must be <=
+ * PAGE_SIZE as a temporary buffer of this size is used in the
+ * synchronous path.  'disks' always accounts for both destination
+ * buffers.
+ *
+ * 'blocks' note: if submit->scribble is NULL then the contents of
+ * 'blocks' may be overridden
+ */
+struct dma_async_tx_descriptor *
+async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
+		   size_t len, struct async_submit_ctl *submit)
+{
+	int src_cnt = disks - 2;
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
+						      &P(blocks, disks), 2,
+						      blocks, src_cnt, len);
+	struct dma_device *device = chan ? chan->device : NULL;
+	dma_addr_t *dma_src = NULL;
+
+	BUG_ON(disks > 255 || !(P(blocks, disks) || Q(blocks, disks)));
+
+	if (submit->scribble)
+		dma_src = submit->scribble;
+	else if (sizeof(dma_addr_t) <= sizeof(struct page *))
+		dma_src = (dma_addr_t *) blocks;
+
+	if (dma_src && device &&
+	    (src_cnt <= dma_maxpq(device, 0) ||
+	     dma_maxpq(device, DMA_PREP_CONTINUE) > 0) &&
+	    is_dma_pq_aligned(device, offset, 0, len)) {
+		/* run the p+q asynchronously */
+		pr_debug("%s: (async) disks: %d len: %zu\n",
+			 __func__, disks, len);
+		return do_async_gen_syndrome(chan, blocks, raid6_gfexp, offset,
+					     disks, len, dma_src, submit);
+	}
+
+	/* run the pq synchronously */
+	pr_debug("%s: (sync) disks: %d len: %zu\n", __func__, disks, len);
+
+	/* wait for any prerequisite operations */
+	async_tx_quiesce(&submit->depend_tx);
+
+	if (!P(blocks, disks)) {
+		P(blocks, disks) = scribble;
+		BUG_ON(len + offset > PAGE_SIZE);
+	}
+	if (!Q(blocks, disks)) {
+		Q(blocks, disks) = scribble;
+		BUG_ON(len + offset > PAGE_SIZE);
+	}
+	do_sync_gen_syndrome(blocks, offset, disks, len, submit);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(async_gen_syndrome);
+
+/**
+ * async_syndrome_val - asynchronously validate a raid6 syndrome
+ * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1
+ * @offset: common offset into each block (src and dest) to start transaction
+ * @disks: number of blocks (including missing P or Q, see below)
+ * @len: length of operation in bytes
+ * @pqres: on val failure SUM_CHECK_P_RESULT and/or SUM_CHECK_Q_RESULT are set
+ * @spare: temporary result buffer for the synchronous case
+ * @submit: submission / completion modifiers
+ *
+ * The same notes from async_gen_syndrome apply to the 'blocks',
+ * and 'disks' parameters of this routine.  The synchronous path
+ * requires a temporary result buffer and submit->scribble to be
+ * specified.
+ */
+struct dma_async_tx_descriptor *
+async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
+		   size_t len, enum sum_check_flags *pqres, struct page *spare,
+		   struct async_submit_ctl *submit)
+{
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ_VAL,
+						      NULL, 0,  blocks, disks,
+						      len);
+	struct dma_device *device = chan ? chan->device : NULL;
+	struct dma_async_tx_descriptor *tx;
+	enum dma_ctrl_flags dma_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
+	dma_addr_t *dma_src = NULL;
+
+	BUG_ON(disks < 4);
+
+	if (submit->scribble)
+		dma_src = submit->scribble;
+	else if (sizeof(dma_addr_t) <= sizeof(struct page *))
+		dma_src = (dma_addr_t *) blocks;
+
+	if (dma_src && device && disks <= dma_maxpq(device, 0) &&
+	    is_dma_pq_aligned(device, offset, 0, len)) {
+		struct device *dev = device->dev;
+		dma_addr_t *pq = &dma_src[disks-2];
+		int i;
+
+		pr_debug("%s: (async) disks: %d len: %zu\n",
+			 __func__, disks, len);
+		if (!P(blocks, disks))
+			dma_flags |= DMA_PREP_PQ_DISABLE_P;
+		if (!Q(blocks, disks))
+			dma_flags |= DMA_PREP_PQ_DISABLE_Q;
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_flags |= DMA_PREP_FENCE;
+		for (i = 0; i < disks; i++)
+			if (likely(blocks[i])) {
+				BUG_ON(is_raid6_zero_block(blocks[i]));
+				dma_src[i] = dma_map_page(dev, blocks[i],
+							  offset, len,
+							  DMA_TO_DEVICE);
+			}
+
+		for (;;) {
+			tx = device->device_prep_dma_pq_val(chan, pq, dma_src,
+							    disks - 2,
+							    raid6_gfexp,
+							    len, pqres,
+							    dma_flags);
+			if (likely(tx))
+				break;
+			async_tx_quiesce(&submit->depend_tx);
+			dma_async_issue_pending(chan);
+		}
+		async_tx_submit(chan, tx, submit);
+
+		return tx;
+	} else {
+		struct page *p_src = P(blocks, disks);
+		struct page *q_src = Q(blocks, disks);
+		enum async_tx_flags flags_orig = submit->flags;
+		dma_async_tx_callback cb_fn_orig = submit->cb_fn;
+		void *scribble = submit->scribble;
+		void *cb_param_orig = submit->cb_param;
+		void *p, *q, *s;
+
+		pr_debug("%s: (sync) disks: %d len: %zu\n",
+			 __func__, disks, len);
+
+		/* caller must provide a temporary result buffer and
+		 * allow the input parameters to be preserved
+		 */
+		BUG_ON(!spare || !scribble);
+
+		/* wait for any prerequisite operations */
+		async_tx_quiesce(&submit->depend_tx);
+
+		/* recompute p and/or q into the temporary buffer and then
+		 * check to see the result matches the current value
+		 */
+		tx = NULL;
+		*pqres = 0;
+		if (p_src) {
+			init_async_submit(submit, ASYNC_TX_XOR_ZERO_DST, NULL,
+					  NULL, NULL, scribble);
+			tx = async_xor(spare, blocks, offset, disks-2, len, submit);
+			async_tx_quiesce(&tx);
+			p = page_address(p_src) + offset;
+			s = page_address(spare) + offset;
+			*pqres |= !!memcmp(p, s, len) << SUM_CHECK_P;
+		}
+
+		if (q_src) {
+			P(blocks, disks) = NULL;
+			Q(blocks, disks) = spare;
+			init_async_submit(submit, 0, NULL, NULL, NULL, scribble);
+			tx = async_gen_syndrome(blocks, offset, disks, len, submit);
+			async_tx_quiesce(&tx);
+			q = page_address(q_src) + offset;
+			s = page_address(spare) + offset;
+			*pqres |= !!memcmp(q, s, len) << SUM_CHECK_Q;
+		}
+
+		/* restore P, Q and submit */
+		P(blocks, disks) = p_src;
+		Q(blocks, disks) = q_src;
+
+		submit->cb_fn = cb_fn_orig;
+		submit->cb_param = cb_param_orig;
+		submit->flags = flags_orig;
+		async_tx_sync_epilog(submit);
+
+		return NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(async_syndrome_val);
+
+static int __init async_pq_init(void)
+{
+	scribble = alloc_page(GFP_KERNEL);
+
+	if (scribble)
+		return 0;
+
+	pr_err("%s: failed to allocate required spare page\n", __func__);
+
+	return -ENOMEM;
+}
+
+static void __exit async_pq_exit(void)
+{
+	put_page(scribble);
+}
+
+module_init(async_pq_init);
+module_exit(async_pq_exit);
+
+MODULE_DESCRIPTION("asynchronous raid6 syndrome generation/validation");
+MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c
new file mode 100644
index 00000000000..6d73dde4786
--- /dev/null
+++ b/crypto/async_tx/async_raid6_recov.c
@@ -0,0 +1,468 @@
+/*
+ * Asynchronous RAID-6 recovery calculations ASYNC_TX API.
+ * Copyright(c) 2009 Intel Corporation
+ *
+ * based on raid6recov.c:
+ *   Copyright 2002 H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/dma-mapping.h>
+#include <linux/raid/pq.h>
+#include <linux/async_tx.h>
+
+static struct dma_async_tx_descriptor *
+async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
+		  size_t len, struct async_submit_ctl *submit)
+{
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
+						      &dest, 1, srcs, 2, len);
+	struct dma_device *dma = chan ? chan->device : NULL;
+	const u8 *amul, *bmul;
+	u8 ax, bx;
+	u8 *a, *b, *c;
+
+	if (dma) {
+		dma_addr_t dma_dest[2];
+		dma_addr_t dma_src[2];
+		struct device *dev = dma->dev;
+		struct dma_async_tx_descriptor *tx;
+		enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
+
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_flags |= DMA_PREP_FENCE;
+		dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
+		dma_src[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE);
+		dma_src[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE);
+		tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 2, coef,
+					     len, dma_flags);
+		if (tx) {
+			async_tx_submit(chan, tx, submit);
+			return tx;
+		}
+
+		/* could not get a descriptor, unmap and fall through to
+		 * the synchronous path
+		 */
+		dma_unmap_page(dev, dma_dest[1], len, DMA_BIDIRECTIONAL);
+		dma_unmap_page(dev, dma_src[0], len, DMA_TO_DEVICE);
+		dma_unmap_page(dev, dma_src[1], len, DMA_TO_DEVICE);
+	}
+
+	/* run the operation synchronously */
+	async_tx_quiesce(&submit->depend_tx);
+	amul = raid6_gfmul[coef[0]];
+	bmul = raid6_gfmul[coef[1]];
+	a = page_address(srcs[0]);
+	b = page_address(srcs[1]);
+	c = page_address(dest);
+
+	while (len--) {
+		ax    = amul[*a++];
+		bx    = bmul[*b++];
+		*c++ = ax ^ bx;
+	}
+
+	return NULL;
+}
+
+static struct dma_async_tx_descriptor *
+async_mult(struct page *dest, struct page *src, u8 coef, size_t len,
+	   struct async_submit_ctl *submit)
+{
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
+						      &dest, 1, &src, 1, len);
+	struct dma_device *dma = chan ? chan->device : NULL;
+	const u8 *qmul; /* Q multiplier table */
+	u8 *d, *s;
+
+	if (dma) {
+		dma_addr_t dma_dest[2];
+		dma_addr_t dma_src[1];
+		struct device *dev = dma->dev;
+		struct dma_async_tx_descriptor *tx;
+		enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
+
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_flags |= DMA_PREP_FENCE;
+		dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
+		dma_src[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE);
+		tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 1, &coef,
+					     len, dma_flags);
+		if (tx) {
+			async_tx_submit(chan, tx, submit);
+			return tx;
+		}
+
+		/* could not get a descriptor, unmap and fall through to
+		 * the synchronous path
+		 */
+		dma_unmap_page(dev, dma_dest[1], len, DMA_BIDIRECTIONAL);
+		dma_unmap_page(dev, dma_src[0], len, DMA_TO_DEVICE);
+	}
+
+	/* no channel available, or failed to allocate a descriptor, so
+	 * perform the operation synchronously
+	 */
+	async_tx_quiesce(&submit->depend_tx);
+	qmul  = raid6_gfmul[coef];
+	d = page_address(dest);
+	s = page_address(src);
+
+	while (len--)
+		*d++ = qmul[*s++];
+
+	return NULL;
+}
+
+static struct dma_async_tx_descriptor *
+__2data_recov_4(size_t bytes, int faila, int failb, struct page **blocks,
+	      struct async_submit_ctl *submit)
+{
+	struct dma_async_tx_descriptor *tx = NULL;
+	struct page *p, *q, *a, *b;
+	struct page *srcs[2];
+	unsigned char coef[2];
+	enum async_tx_flags flags = submit->flags;
+	dma_async_tx_callback cb_fn = submit->cb_fn;
+	void *cb_param = submit->cb_param;
+	void *scribble = submit->scribble;
+
+	p = blocks[4-2];
+	q = blocks[4-1];
+
+	a = blocks[faila];
+	b = blocks[failb];
+
+	/* in the 4 disk case P + Pxy == P and Q + Qxy == Q */
+	/* Dx = A*(P+Pxy) + B*(Q+Qxy) */
+	srcs[0] = p;
+	srcs[1] = q;
+	coef[0] = raid6_gfexi[failb-faila];
+	coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+	tx = async_sum_product(b, srcs, coef, bytes, submit);
+
+	/* Dy = P+Pxy+Dx */
+	srcs[0] = p;
+	srcs[1] = b;
+	init_async_submit(submit, flags | ASYNC_TX_XOR_ZERO_DST, tx, cb_fn,
+			  cb_param, scribble);
+	tx = async_xor(a, srcs, 0, 2, bytes, submit);
+
+	return tx;
+
+}
+
+static struct dma_async_tx_descriptor *
+__2data_recov_5(size_t bytes, int faila, int failb, struct page **blocks,
+	      struct async_submit_ctl *submit)
+{
+	struct dma_async_tx_descriptor *tx = NULL;
+	struct page *p, *q, *g, *dp, *dq;
+	struct page *srcs[2];
+	unsigned char coef[2];
+	enum async_tx_flags flags = submit->flags;
+	dma_async_tx_callback cb_fn = submit->cb_fn;
+	void *cb_param = submit->cb_param;
+	void *scribble = submit->scribble;
+	int uninitialized_var(good);
+	int i;
+
+	for (i = 0; i < 3; i++) {
+		if (i == faila || i == failb)
+			continue;
+		else {
+			good = i;
+			break;
+		}
+	}
+	BUG_ON(i >= 3);
+
+	p = blocks[5-2];
+	q = blocks[5-1];
+	g = blocks[good];
+
+	/* Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for delta p and
+	 * delta q
+	 */
+	dp = blocks[faila];
+	dq = blocks[failb];
+
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+	tx = async_memcpy(dp, g, 0, 0, bytes, submit);
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+	tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
+
+	/* compute P + Pxy */
+	srcs[0] = dp;
+	srcs[1] = p;
+	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  NULL, NULL, scribble);
+	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+
+	/* compute Q + Qxy */
+	srcs[0] = dq;
+	srcs[1] = q;
+	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  NULL, NULL, scribble);
+	tx = async_xor(dq, srcs, 0, 2, bytes, submit);
+
+	/* Dx = A*(P+Pxy) + B*(Q+Qxy) */
+	srcs[0] = dp;
+	srcs[1] = dq;
+	coef[0] = raid6_gfexi[failb-faila];
+	coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+	tx = async_sum_product(dq, srcs, coef, bytes, submit);
+
+	/* Dy = P+Pxy+Dx */
+	srcs[0] = dp;
+	srcs[1] = dq;
+	init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
+			  cb_param, scribble);
+	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+
+	return tx;
+}
+
+static struct dma_async_tx_descriptor *
+__2data_recov_n(int disks, size_t bytes, int faila, int failb,
+	      struct page **blocks, struct async_submit_ctl *submit)
+{
+	struct dma_async_tx_descriptor *tx = NULL;
+	struct page *p, *q, *dp, *dq;
+	struct page *srcs[2];
+	unsigned char coef[2];
+	enum async_tx_flags flags = submit->flags;
+	dma_async_tx_callback cb_fn = submit->cb_fn;
+	void *cb_param = submit->cb_param;
+	void *scribble = submit->scribble;
+
+	p = blocks[disks-2];
+	q = blocks[disks-1];
+
+	/* Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for
+	 * delta p and delta q
+	 */
+	dp = blocks[faila];
+	blocks[faila] = (void *)raid6_empty_zero_page;
+	blocks[disks-2] = dp;
+	dq = blocks[failb];
+	blocks[failb] = (void *)raid6_empty_zero_page;
+	blocks[disks-1] = dq;
+
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+	tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
+
+	/* Restore pointer table */
+	blocks[faila]   = dp;
+	blocks[failb]   = dq;
+	blocks[disks-2] = p;
+	blocks[disks-1] = q;
+
+	/* compute P + Pxy */
+	srcs[0] = dp;
+	srcs[1] = p;
+	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  NULL, NULL, scribble);
+	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+
+	/* compute Q + Qxy */
+	srcs[0] = dq;
+	srcs[1] = q;
+	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  NULL, NULL, scribble);
+	tx = async_xor(dq, srcs, 0, 2, bytes, submit);
+
+	/* Dx = A*(P+Pxy) + B*(Q+Qxy) */
+	srcs[0] = dp;
+	srcs[1] = dq;
+	coef[0] = raid6_gfexi[failb-faila];
+	coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+	tx = async_sum_product(dq, srcs, coef, bytes, submit);
+
+	/* Dy = P+Pxy+Dx */
+	srcs[0] = dp;
+	srcs[1] = dq;
+	init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
+			  cb_param, scribble);
+	tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+
+	return tx;
+}
+
+/**
+ * async_raid6_2data_recov - asynchronously calculate two missing data blocks
+ * @disks: number of disks in the RAID-6 array
+ * @bytes: block size
+ * @faila: first failed drive index
+ * @failb: second failed drive index
+ * @blocks: array of source pointers where the last two entries are p and q
+ * @submit: submission/completion modifiers
+ */
+struct dma_async_tx_descriptor *
+async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+			struct page **blocks, struct async_submit_ctl *submit)
+{
+	BUG_ON(faila == failb);
+	if (failb < faila)
+		swap(faila, failb);
+
+	pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes);
+
+	/* we need to preserve the contents of 'blocks' for the async
+	 * case, so punt to synchronous if a scribble buffer is not available
+	 */
+	if (!submit->scribble) {
+		void **ptrs = (void **) blocks;
+		int i;
+
+		async_tx_quiesce(&submit->depend_tx);
+		for (i = 0; i < disks; i++)
+			ptrs[i] = page_address(blocks[i]);
+
+		raid6_2data_recov(disks, bytes, faila, failb, ptrs);
+
+		async_tx_sync_epilog(submit);
+
+		return NULL;
+	}
+
+	switch (disks) {
+	case 4:
+		/* dma devices do not uniformly understand a zero source pq
+		 * operation (in contrast to the synchronous case), so
+		 * explicitly handle the 4 disk special case
+		 */
+		return __2data_recov_4(bytes, faila, failb, blocks, submit);
+	case 5:
+		/* dma devices do not uniformly understand a single
+		 * source pq operation (in contrast to the synchronous
+		 * case), so explicitly handle the 5 disk special case
+		 */
+		return __2data_recov_5(bytes, faila, failb, blocks, submit);
+	default:
+		return __2data_recov_n(disks, bytes, faila, failb, blocks, submit);
+	}
+}
+EXPORT_SYMBOL_GPL(async_raid6_2data_recov);
+
+/**
+ * async_raid6_datap_recov - asynchronously calculate a data and the 'p' block
+ * @disks: number of disks in the RAID-6 array
+ * @bytes: block size
+ * @faila: failed drive index
+ * @blocks: array of source pointers where the last two entries are p and q
+ * @submit: submission/completion modifiers
+ */
+struct dma_async_tx_descriptor *
+async_raid6_datap_recov(int disks, size_t bytes, int faila,
+			struct page **blocks, struct async_submit_ctl *submit)
+{
+	struct dma_async_tx_descriptor *tx = NULL;
+	struct page *p, *q, *dq;
+	u8 coef;
+	enum async_tx_flags flags = submit->flags;
+	dma_async_tx_callback cb_fn = submit->cb_fn;
+	void *cb_param = submit->cb_param;
+	void *scribble = submit->scribble;
+	struct page *srcs[2];
+
+	pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes);
+
+	/* we need to preserve the contents of 'blocks' for the async
+	 * case, so punt to synchronous if a scribble buffer is not available
+	 */
+	if (!scribble) {
+		void **ptrs = (void **) blocks;
+		int i;
+
+		async_tx_quiesce(&submit->depend_tx);
+		for (i = 0; i < disks; i++)
+			ptrs[i] = page_address(blocks[i]);
+
+		raid6_datap_recov(disks, bytes, faila, ptrs);
+
+		async_tx_sync_epilog(submit);
+
+		return NULL;
+	}
+
+	p = blocks[disks-2];
+	q = blocks[disks-1];
+
+	/* Compute syndrome with zero for the missing data page
+	 * Use the dead data page as temporary storage for delta q
+	 */
+	dq = blocks[faila];
+	blocks[faila] = (void *)raid6_empty_zero_page;
+	blocks[disks-1] = dq;
+
+	/* in the 4 disk case we only need to perform a single source
+	 * multiplication
+	 */
+	if (disks == 4) {
+		int good = faila == 0 ? 1 : 0;
+		struct page *g = blocks[good];
+
+		init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
+				  scribble);
+		tx = async_memcpy(p, g, 0, 0, bytes, submit);
+
+		init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
+				  scribble);
+		tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
+	} else {
+		init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
+				  scribble);
+		tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
+	}
+
+	/* Restore pointer table */
+	blocks[faila]   = dq;
+	blocks[disks-1] = q;
+
+	/* calculate g^{-faila} */
+	coef = raid6_gfinv[raid6_gfexp[faila]];
+
+	srcs[0] = dq;
+	srcs[1] = q;
+	init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  NULL, NULL, scribble);
+	tx = async_xor(dq, srcs, 0, 2, bytes, submit);
+
+	init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+	tx = async_mult(dq, dq, coef, bytes, submit);
+
+	srcs[0] = p;
+	srcs[1] = dq;
+	init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
+			  cb_param, scribble);
+	tx = async_xor(p, srcs, 0, 2, bytes, submit);
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_raid6_datap_recov);
+
+MODULE_AUTHOR("Dan Williams <dan.j.williams@intel.com>");
+MODULE_DESCRIPTION("asynchronous RAID-6 recovery api");
+MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c
index 06eb6cc09fe..f9cdf04fe7c 100644
--- a/crypto/async_tx/async_tx.c
+++ b/crypto/async_tx/async_tx.c
@@ -42,16 +42,21 @@ static void __exit async_tx_exit(void)
 	async_dmaengine_put();
 }
 
+module_init(async_tx_init);
+module_exit(async_tx_exit);
+
 /**
  * __async_tx_find_channel - find a channel to carry out the operation or let
  *	the transaction execute synchronously
- * @depend_tx: transaction dependency
+ * @submit: transaction dependency and submission modifiers
  * @tx_type: transaction type
  */
 struct dma_chan *
-__async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
-	enum dma_transaction_type tx_type)
+__async_tx_find_channel(struct async_submit_ctl *submit,
+			enum dma_transaction_type tx_type)
 {
+	struct dma_async_tx_descriptor *depend_tx = submit->depend_tx;
+
 	/* see if we can keep the chain on one channel */
 	if (depend_tx &&
 	    dma_has_cap(tx_type, depend_tx->chan->device->cap_mask))
@@ -59,17 +64,6 @@ __async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
 	return async_dma_find_channel(tx_type);
 }
 EXPORT_SYMBOL_GPL(__async_tx_find_channel);
-#else
-static int __init async_tx_init(void)
-{
-	printk(KERN_INFO "async_tx: api initialized (sync-only)\n");
-	return 0;
-}
-
-static void __exit async_tx_exit(void)
-{
-	do { } while (0);
-}
 #endif
 
 
@@ -83,10 +77,14 @@ static void
 async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
 			struct dma_async_tx_descriptor *tx)
 {
-	struct dma_chan *chan;
-	struct dma_device *device;
+	struct dma_chan *chan = depend_tx->chan;
+	struct dma_device *device = chan->device;
 	struct dma_async_tx_descriptor *intr_tx = (void *) ~0;
 
+	#ifdef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH
+	BUG();
+	#endif
+
 	/* first check to see if we can still append to depend_tx */
 	spin_lock_bh(&depend_tx->lock);
 	if (depend_tx->parent && depend_tx->chan == tx->chan) {
@@ -96,11 +94,11 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
 	}
 	spin_unlock_bh(&depend_tx->lock);
 
-	if (!intr_tx)
+	/* attached dependency, flush the parent channel */
+	if (!intr_tx) {
+		device->device_issue_pending(chan);
 		return;
-
-	chan = depend_tx->chan;
-	device = chan->device;
+	}
 
 	/* see if we can schedule an interrupt
 	 * otherwise poll for completion
@@ -134,6 +132,7 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
 			intr_tx->tx_submit(intr_tx);
 			async_tx_ack(intr_tx);
 		}
+		device->device_issue_pending(chan);
 	} else {
 		if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR)
 			panic("%s: DMA_ERROR waiting for depend_tx\n",
@@ -144,13 +143,14 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
 
 
 /**
- * submit_disposition - while holding depend_tx->lock we must avoid submitting
- * 	new operations to prevent a circular locking dependency with
- * 	drivers that already hold a channel lock when calling
- * 	async_tx_run_dependencies.
+ * submit_disposition - flags for routing an incoming operation
  * @ASYNC_TX_SUBMITTED: we were able to append the new operation under the lock
  * @ASYNC_TX_CHANNEL_SWITCH: when the lock is dropped schedule a channel switch
  * @ASYNC_TX_DIRECT_SUBMIT: when the lock is dropped submit directly
+ *
+ * while holding depend_tx->lock we must avoid submitting new operations
+ * to prevent a circular locking dependency with drivers that already
+ * hold a channel lock when calling async_tx_run_dependencies.
  */
 enum submit_disposition {
 	ASYNC_TX_SUBMITTED,
@@ -160,11 +160,12 @@ enum submit_disposition {
 
 void
 async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
-	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_param)
+		struct async_submit_ctl *submit)
 {
-	tx->callback = cb_fn;
-	tx->callback_param = cb_param;
+	struct dma_async_tx_descriptor *depend_tx = submit->depend_tx;
+
+	tx->callback = submit->cb_fn;
+	tx->callback_param = submit->cb_param;
 
 	if (depend_tx) {
 		enum submit_disposition s;
@@ -220,30 +221,29 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
 		tx->tx_submit(tx);
 	}
 
-	if (flags & ASYNC_TX_ACK)
+	if (submit->flags & ASYNC_TX_ACK)
 		async_tx_ack(tx);
 
-	if (depend_tx && (flags & ASYNC_TX_DEP_ACK))
+	if (depend_tx)
 		async_tx_ack(depend_tx);
 }
 EXPORT_SYMBOL_GPL(async_tx_submit);
 
 /**
- * async_trigger_callback - schedules the callback function to be run after
- * any dependent operations have been completed.
- * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
- * @depend_tx: 'callback' requires the completion of this transaction
- * @cb_fn: function to call after depend_tx completes
- * @cb_param: parameter to pass to the callback routine
+ * async_trigger_callback - schedules the callback function to be run
+ * @submit: submission and completion parameters
+ *
+ * honored flags: ASYNC_TX_ACK
+ *
+ * The callback is run after any dependent operations have completed.
  */
 struct dma_async_tx_descriptor *
-async_trigger_callback(enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_param)
+async_trigger_callback(struct async_submit_ctl *submit)
 {
 	struct dma_chan *chan;
 	struct dma_device *device;
 	struct dma_async_tx_descriptor *tx;
+	struct dma_async_tx_descriptor *depend_tx = submit->depend_tx;
 
 	if (depend_tx) {
 		chan = depend_tx->chan;
@@ -262,14 +262,14 @@ async_trigger_callback(enum async_tx_flags flags,
 	if (tx) {
 		pr_debug("%s: (async)\n", __func__);
 
-		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+		async_tx_submit(chan, tx, submit);
 	} else {
 		pr_debug("%s: (sync)\n", __func__);
 
 		/* wait for any prerequisite operations */
-		async_tx_quiesce(&depend_tx);
+		async_tx_quiesce(&submit->depend_tx);
 
-		async_tx_sync_epilog(cb_fn, cb_param);
+		async_tx_sync_epilog(submit);
 	}
 
 	return tx;
@@ -295,9 +295,6 @@ void async_tx_quiesce(struct dma_async_tx_descriptor **tx)
 }
 EXPORT_SYMBOL_GPL(async_tx_quiesce);
 
-module_init(async_tx_init);
-module_exit(async_tx_exit);
-
 MODULE_AUTHOR("Intel Corporation");
 MODULE_DESCRIPTION("Asynchronous Bulk Memory Transactions API");
 MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c
index 90dd3f8bd28..b459a9034aa 100644
--- a/crypto/async_tx/async_xor.c
+++ b/crypto/async_tx/async_xor.c
@@ -33,19 +33,16 @@
 /* do_async_xor - dma map the pages and perform the xor with an engine */
 static __async_inline struct dma_async_tx_descriptor *
 do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
-	     unsigned int offset, int src_cnt, size_t len,
-	     enum async_tx_flags flags,
-	     struct dma_async_tx_descriptor *depend_tx,
-	     dma_async_tx_callback cb_fn, void *cb_param)
+	     unsigned int offset, int src_cnt, size_t len, dma_addr_t *dma_src,
+	     struct async_submit_ctl *submit)
 {
 	struct dma_device *dma = chan->device;
-	dma_addr_t *dma_src = (dma_addr_t *) src_list;
 	struct dma_async_tx_descriptor *tx = NULL;
 	int src_off = 0;
 	int i;
-	dma_async_tx_callback _cb_fn;
-	void *_cb_param;
-	enum async_tx_flags async_flags;
+	dma_async_tx_callback cb_fn_orig = submit->cb_fn;
+	void *cb_param_orig = submit->cb_param;
+	enum async_tx_flags flags_orig = submit->flags;
 	enum dma_ctrl_flags dma_flags;
 	int xor_src_cnt;
 	dma_addr_t dma_dest;
@@ -63,25 +60,27 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 	}
 
 	while (src_cnt) {
-		async_flags = flags;
+		submit->flags = flags_orig;
 		dma_flags = 0;
-		xor_src_cnt = min(src_cnt, dma->max_xor);
+		xor_src_cnt = min(src_cnt, (int)dma->max_xor);
 		/* if we are submitting additional xors, leave the chain open,
 		 * clear the callback parameters, and leave the destination
 		 * buffer mapped
 		 */
 		if (src_cnt > xor_src_cnt) {
-			async_flags &= ~ASYNC_TX_ACK;
+			submit->flags &= ~ASYNC_TX_ACK;
+			submit->flags |= ASYNC_TX_FENCE;
 			dma_flags = DMA_COMPL_SKIP_DEST_UNMAP;
-			_cb_fn = NULL;
-			_cb_param = NULL;
+			submit->cb_fn = NULL;
+			submit->cb_param = NULL;
 		} else {
-			_cb_fn = cb_fn;
-			_cb_param = cb_param;
+			submit->cb_fn = cb_fn_orig;
+			submit->cb_param = cb_param_orig;
 		}
-		if (_cb_fn)
+		if (submit->cb_fn)
 			dma_flags |= DMA_PREP_INTERRUPT;
-
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_flags |= DMA_PREP_FENCE;
 		/* Since we have clobbered the src_list we are committed
 		 * to doing this asynchronously.  Drivers force forward progress
 		 * in case they can not provide a descriptor
@@ -90,7 +89,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 					      xor_src_cnt, len, dma_flags);
 
 		if (unlikely(!tx))
-			async_tx_quiesce(&depend_tx);
+			async_tx_quiesce(&submit->depend_tx);
 
 		/* spin wait for the preceeding transactions to complete */
 		while (unlikely(!tx)) {
@@ -101,11 +100,8 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 						      dma_flags);
 		}
 
-		async_tx_submit(chan, tx, async_flags, depend_tx, _cb_fn,
-				_cb_param);
-
-		depend_tx = tx;
-		flags |= ASYNC_TX_DEP_ACK;
+		async_tx_submit(chan, tx, submit);
+		submit->depend_tx = tx;
 
 		if (src_cnt > xor_src_cnt) {
 			/* drop completed sources */
@@ -124,23 +120,27 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 
 static void
 do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
-	    int src_cnt, size_t len, enum async_tx_flags flags,
-	    dma_async_tx_callback cb_fn, void *cb_param)
+	    int src_cnt, size_t len, struct async_submit_ctl *submit)
 {
 	int i;
 	int xor_src_cnt;
 	int src_off = 0;
 	void *dest_buf;
-	void **srcs = (void **) src_list;
+	void **srcs;
+
+	if (submit->scribble)
+		srcs = submit->scribble;
+	else
+		srcs = (void **) src_list;
 
-	/* reuse the 'src_list' array to convert to buffer pointers */
+	/* convert to buffer pointers */
 	for (i = 0; i < src_cnt; i++)
 		srcs[i] = page_address(src_list[i]) + offset;
 
 	/* set destination address */
 	dest_buf = page_address(dest) + offset;
 
-	if (flags & ASYNC_TX_XOR_ZERO_DST)
+	if (submit->flags & ASYNC_TX_XOR_ZERO_DST)
 		memset(dest_buf, 0, len);
 
 	while (src_cnt > 0) {
@@ -153,61 +153,70 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
 		src_off += xor_src_cnt;
 	}
 
-	async_tx_sync_epilog(cb_fn, cb_param);
+	async_tx_sync_epilog(submit);
 }
 
 /**
  * async_xor - attempt to xor a set of blocks with a dma engine.
- *	xor_blocks always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST
- *	flag must be set to not include dest data in the calculation.  The
- *	assumption with dma eninges is that they only use the destination
- *	buffer as a source when it is explicity specified in the source list.
  * @dest: destination page
- * @src_list: array of source pages (if the dest is also a source it must be
- *	at index zero).  The contents of this array may be overwritten.
- * @offset: offset in pages to start transaction
+ * @src_list: array of source pages
+ * @offset: common src/dst offset to start transaction
  * @src_cnt: number of source pages
  * @len: length in bytes
- * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DEST,
- *	ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
- * @depend_tx: xor depends on the result of this transaction.
- * @cb_fn: function to call when the xor completes
- * @cb_param: parameter to pass to the callback routine
+ * @submit: submission / completion modifiers
+ *
+ * honored flags: ASYNC_TX_ACK, ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DST
+ *
+ * xor_blocks always uses the dest as a source so the
+ * ASYNC_TX_XOR_ZERO_DST flag must be set to not include dest data in
+ * the calculation.  The assumption with dma eninges is that they only
+ * use the destination buffer as a source when it is explicity specified
+ * in the source list.
+ *
+ * src_list note: if the dest is also a source it must be at index zero.
+ * The contents of this array will be overwritten if a scribble region
+ * is not specified.
  */
 struct dma_async_tx_descriptor *
 async_xor(struct page *dest, struct page **src_list, unsigned int offset,
-	int src_cnt, size_t len, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_param)
+	  int src_cnt, size_t len, struct async_submit_ctl *submit)
 {
-	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_XOR,
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR,
 						      &dest, 1, src_list,
 						      src_cnt, len);
+	dma_addr_t *dma_src = NULL;
+
 	BUG_ON(src_cnt <= 1);
 
-	if (chan) {
+	if (submit->scribble)
+		dma_src = submit->scribble;
+	else if (sizeof(dma_addr_t) <= sizeof(struct page *))
+		dma_src = (dma_addr_t *) src_list;
+
+	if (dma_src && chan && is_dma_xor_aligned(chan->device, offset, 0, len)) {
 		/* run the xor asynchronously */
 		pr_debug("%s (async): len: %zu\n", __func__, len);
 
 		return do_async_xor(chan, dest, src_list, offset, src_cnt, len,
-				    flags, depend_tx, cb_fn, cb_param);
+				    dma_src, submit);
 	} else {
 		/* run the xor synchronously */
 		pr_debug("%s (sync): len: %zu\n", __func__, len);
+		WARN_ONCE(chan, "%s: no space for dma address conversion\n",
+			  __func__);
 
 		/* in the sync case the dest is an implied source
 		 * (assumes the dest is the first source)
 		 */
-		if (flags & ASYNC_TX_XOR_DROP_DST) {
+		if (submit->flags & ASYNC_TX_XOR_DROP_DST) {
 			src_cnt--;
 			src_list++;
 		}
 
 		/* wait for any prerequisite operations */
-		async_tx_quiesce(&depend_tx);
+		async_tx_quiesce(&submit->depend_tx);
 
-		do_sync_xor(dest, src_list, offset, src_cnt, len,
-			    flags, cb_fn, cb_param);
+		do_sync_xor(dest, src_list, offset, src_cnt, len, submit);
 
 		return NULL;
 	}
@@ -222,104 +231,94 @@ static int page_is_zero(struct page *p, unsigned int offset, size_t len)
 }
 
 /**
- * async_xor_zero_sum - attempt a xor parity check with a dma engine.
+ * async_xor_val - attempt a xor parity check with a dma engine.
  * @dest: destination page used if the xor is performed synchronously
- * @src_list: array of source pages.  The dest page must be listed as a source
- * 	at index zero.  The contents of this array may be overwritten.
+ * @src_list: array of source pages
  * @offset: offset in pages to start transaction
  * @src_cnt: number of source pages
  * @len: length in bytes
  * @result: 0 if sum == 0 else non-zero
- * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
- * @depend_tx: xor depends on the result of this transaction.
- * @cb_fn: function to call when the xor completes
- * @cb_param: parameter to pass to the callback routine
+ * @submit: submission / completion modifiers
+ *
+ * honored flags: ASYNC_TX_ACK
+ *
+ * src_list note: if the dest is also a source it must be at index zero.
+ * The contents of this array will be overwritten if a scribble region
+ * is not specified.
  */
 struct dma_async_tx_descriptor *
-async_xor_zero_sum(struct page *dest, struct page **src_list,
-	unsigned int offset, int src_cnt, size_t len,
-	u32 *result, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_param)
+async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
+	      int src_cnt, size_t len, enum sum_check_flags *result,
+	      struct async_submit_ctl *submit)
 {
-	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_ZERO_SUM,
+	struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR_VAL,
 						      &dest, 1, src_list,
 						      src_cnt, len);
 	struct dma_device *device = chan ? chan->device : NULL;
 	struct dma_async_tx_descriptor *tx = NULL;
+	dma_addr_t *dma_src = NULL;
 
 	BUG_ON(src_cnt <= 1);
 
-	if (device && src_cnt <= device->max_xor) {
-		dma_addr_t *dma_src = (dma_addr_t *) src_list;
-		unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
+	if (submit->scribble)
+		dma_src = submit->scribble;
+	else if (sizeof(dma_addr_t) <= sizeof(struct page *))
+		dma_src = (dma_addr_t *) src_list;
+
+	if (dma_src && device && src_cnt <= device->max_xor &&
+	    is_dma_xor_aligned(device, offset, 0, len)) {
+		unsigned long dma_prep_flags = 0;
 		int i;
 
 		pr_debug("%s: (async) len: %zu\n", __func__, len);
 
+		if (submit->cb_fn)
+			dma_prep_flags |= DMA_PREP_INTERRUPT;
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_prep_flags |= DMA_PREP_FENCE;
 		for (i = 0; i < src_cnt; i++)
 			dma_src[i] = dma_map_page(device->dev, src_list[i],
 						  offset, len, DMA_TO_DEVICE);
 
-		tx = device->device_prep_dma_zero_sum(chan, dma_src, src_cnt,
-						      len, result,
-						      dma_prep_flags);
+		tx = device->device_prep_dma_xor_val(chan, dma_src, src_cnt,
+						     len, result,
+						     dma_prep_flags);
 		if (unlikely(!tx)) {
-			async_tx_quiesce(&depend_tx);
+			async_tx_quiesce(&submit->depend_tx);
 
 			while (!tx) {
 				dma_async_issue_pending(chan);
-				tx = device->device_prep_dma_zero_sum(chan,
+				tx = device->device_prep_dma_xor_val(chan,
 					dma_src, src_cnt, len, result,
 					dma_prep_flags);
 			}
 		}
 
-		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+		async_tx_submit(chan, tx, submit);
 	} else {
-		unsigned long xor_flags = flags;
+		enum async_tx_flags flags_orig = submit->flags;
 
 		pr_debug("%s: (sync) len: %zu\n", __func__, len);
+		WARN_ONCE(device && src_cnt <= device->max_xor,
+			  "%s: no space for dma address conversion\n",
+			  __func__);
 
-		xor_flags |= ASYNC_TX_XOR_DROP_DST;
-		xor_flags &= ~ASYNC_TX_ACK;
+		submit->flags |= ASYNC_TX_XOR_DROP_DST;
+		submit->flags &= ~ASYNC_TX_ACK;
 
-		tx = async_xor(dest, src_list, offset, src_cnt, len, xor_flags,
-			depend_tx, NULL, NULL);
+		tx = async_xor(dest, src_list, offset, src_cnt, len, submit);
 
 		async_tx_quiesce(&tx);
 
-		*result = page_is_zero(dest, offset, len) ? 0 : 1;
+		*result = !page_is_zero(dest, offset, len) << SUM_CHECK_P;
 
-		async_tx_sync_epilog(cb_fn, cb_param);
+		async_tx_sync_epilog(submit);
+		submit->flags = flags_orig;
 	}
 
 	return tx;
 }
-EXPORT_SYMBOL_GPL(async_xor_zero_sum);
-
-static int __init async_xor_init(void)
-{
-	#ifdef CONFIG_ASYNC_TX_DMA
-	/* To conserve stack space the input src_list (array of page pointers)
-	 * is reused to hold the array of dma addresses passed to the driver.
-	 * This conversion is only possible when dma_addr_t is less than the
-	 * the size of a pointer.  HIGHMEM64G is known to violate this
-	 * assumption.
-	 */
-	BUILD_BUG_ON(sizeof(dma_addr_t) > sizeof(struct page *));
-	#endif
-
-	return 0;
-}
-
-static void __exit async_xor_exit(void)
-{
-	do { } while (0);
-}
-
-module_init(async_xor_init);
-module_exit(async_xor_exit);
+EXPORT_SYMBOL_GPL(async_xor_val);
 
 MODULE_AUTHOR("Intel Corporation");
 MODULE_DESCRIPTION("asynchronous xor/xor-zero-sum api");
diff --git a/crypto/async_tx/raid6test.c b/crypto/async_tx/raid6test.c
new file mode 100644
index 00000000000..3ec27c7e62e
--- /dev/null
+++ b/crypto/async_tx/raid6test.c
@@ -0,0 +1,240 @@
+/*
+ * asynchronous raid6 recovery self test
+ * Copyright (c) 2009, Intel Corporation.
+ *
+ * based on drivers/md/raid6test/test.c:
+ * 	Copyright 2002-2007 H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/async_tx.h>
+#include <linux/random.h>
+
+#undef pr
+#define pr(fmt, args...) pr_info("raid6test: " fmt, ##args)
+
+#define NDISKS 16 /* Including P and Q */
+
+static struct page *dataptrs[NDISKS];
+static addr_conv_t addr_conv[NDISKS];
+static struct page *data[NDISKS+3];
+static struct page *spare;
+static struct page *recovi;
+static struct page *recovj;
+
+static void callback(void *param)
+{
+	struct completion *cmp = param;
+
+	complete(cmp);
+}
+
+static void makedata(int disks)
+{
+	int i, j;
+
+	for (i = 0; i < disks; i++) {
+		for (j = 0; j < PAGE_SIZE/sizeof(u32); j += sizeof(u32)) {
+			u32 *p = page_address(data[i]) + j;
+
+			*p = random32();
+		}
+
+		dataptrs[i] = data[i];
+	}
+}
+
+static char disk_type(int d, int disks)
+{
+	if (d == disks - 2)
+		return 'P';
+	else if (d == disks - 1)
+		return 'Q';
+	else
+		return 'D';
+}
+
+/* Recover two failed blocks. */
+static void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, struct page **ptrs)
+{
+	struct async_submit_ctl submit;
+	struct completion cmp;
+	struct dma_async_tx_descriptor *tx = NULL;
+	enum sum_check_flags result = ~0;
+
+	if (faila > failb)
+		swap(faila, failb);
+
+	if (failb == disks-1) {
+		if (faila == disks-2) {
+			/* P+Q failure.  Just rebuild the syndrome. */
+			init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
+			tx = async_gen_syndrome(ptrs, 0, disks, bytes, &submit);
+		} else {
+			struct page *blocks[disks];
+			struct page *dest;
+			int count = 0;
+			int i;
+
+			/* data+Q failure.  Reconstruct data from P,
+			 * then rebuild syndrome
+			 */
+			for (i = disks; i-- ; ) {
+				if (i == faila || i == failb)
+					continue;
+				blocks[count++] = ptrs[i];
+			}
+			dest = ptrs[faila];
+			init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
+					  NULL, NULL, addr_conv);
+			tx = async_xor(dest, blocks, 0, count, bytes, &submit);
+
+			init_async_submit(&submit, 0, tx, NULL, NULL, addr_conv);
+			tx = async_gen_syndrome(ptrs, 0, disks, bytes, &submit);
+		}
+	} else {
+		if (failb == disks-2) {
+			/* data+P failure. */
+			init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
+			tx = async_raid6_datap_recov(disks, bytes, faila, ptrs, &submit);
+		} else {
+			/* data+data failure. */
+			init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
+			tx = async_raid6_2data_recov(disks, bytes, faila, failb, ptrs, &submit);
+		}
+	}
+	init_completion(&cmp);
+	init_async_submit(&submit, ASYNC_TX_ACK, tx, callback, &cmp, addr_conv);
+	tx = async_syndrome_val(ptrs, 0, disks, bytes, &result, spare, &submit);
+	async_tx_issue_pending(tx);
+
+	if (wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)) == 0)
+		pr("%s: timeout! (faila: %d failb: %d disks: %d)\n",
+		   __func__, faila, failb, disks);
+
+	if (result != 0)
+		pr("%s: validation failure! faila: %d failb: %d sum_check_flags: %x\n",
+		   __func__, faila, failb, result);
+}
+
+static int test_disks(int i, int j, int disks)
+{
+	int erra, errb;
+
+	memset(page_address(recovi), 0xf0, PAGE_SIZE);
+	memset(page_address(recovj), 0xba, PAGE_SIZE);
+
+	dataptrs[i] = recovi;
+	dataptrs[j] = recovj;
+
+	raid6_dual_recov(disks, PAGE_SIZE, i, j, dataptrs);
+
+	erra = memcmp(page_address(data[i]), page_address(recovi), PAGE_SIZE);
+	errb = memcmp(page_address(data[j]), page_address(recovj), PAGE_SIZE);
+
+	pr("%s(%d, %d): faila=%3d(%c)  failb=%3d(%c)  %s\n",
+	   __func__, i, j, i, disk_type(i, disks), j, disk_type(j, disks),
+	   (!erra && !errb) ? "OK" : !erra ? "ERRB" : !errb ? "ERRA" : "ERRAB");
+
+	dataptrs[i] = data[i];
+	dataptrs[j] = data[j];
+
+	return erra || errb;
+}
+
+static int test(int disks, int *tests)
+{
+	struct dma_async_tx_descriptor *tx;
+	struct async_submit_ctl submit;
+	struct completion cmp;
+	int err = 0;
+	int i, j;
+
+	recovi = data[disks];
+	recovj = data[disks+1];
+	spare  = data[disks+2];
+
+	makedata(disks);
+
+	/* Nuke syndromes */
+	memset(page_address(data[disks-2]), 0xee, PAGE_SIZE);
+	memset(page_address(data[disks-1]), 0xee, PAGE_SIZE);
+
+	/* Generate assumed good syndrome */
+	init_completion(&cmp);
+	init_async_submit(&submit, ASYNC_TX_ACK, NULL, callback, &cmp, addr_conv);
+	tx = async_gen_syndrome(dataptrs, 0, disks, PAGE_SIZE, &submit);
+	async_tx_issue_pending(tx);
+
+	if (wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)) == 0) {
+		pr("error: initial gen_syndrome(%d) timed out\n", disks);
+		return 1;
+	}
+
+	pr("testing the %d-disk case...\n", disks);
+	for (i = 0; i < disks-1; i++)
+		for (j = i+1; j < disks; j++) {
+			(*tests)++;
+			err += test_disks(i, j, disks);
+		}
+
+	return err;
+}
+
+
+static int raid6_test(void)
+{
+	int err = 0;
+	int tests = 0;
+	int i;
+
+	for (i = 0; i < NDISKS+3; i++) {
+		data[i] = alloc_page(GFP_KERNEL);
+		if (!data[i]) {
+			while (i--)
+				put_page(data[i]);
+			return -ENOMEM;
+		}
+	}
+
+	/* the 4-disk and 5-disk cases are special for the recovery code */
+	if (NDISKS > 4)
+		err += test(4, &tests);
+	if (NDISKS > 5)
+		err += test(5, &tests);
+	err += test(NDISKS, &tests);
+
+	pr("\n");
+	pr("complete (%d tests, %d failure%s)\n",
+	   tests, err, err == 1 ? "" : "s");
+
+	for (i = 0; i < NDISKS+3; i++)
+		put_page(data[i]);
+
+	return 0;
+}
+
+static void raid6_test_exit(void)
+{
+}
+
+/* when compiled-in wait for drivers to load first (assumes dma drivers
+ * are also compliled-in)
+ */
+late_initcall(raid6_test);
+module_exit(raid6_test_exit);
+MODULE_AUTHOR("Dan Williams <dan.j.williams@intel.com>");
+MODULE_DESCRIPTION("asynchronous RAID-6 recovery self tests");
+MODULE_LICENSE("GPL");
diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c
index d295bdccc09..9335b87c517 100644
--- a/drivers/acpi/button.c
+++ b/drivers/acpi/button.c
@@ -115,6 +115,9 @@ static const struct file_operations acpi_button_state_fops = {
 	.release = single_release,
 };
 
+static BLOCKING_NOTIFIER_HEAD(acpi_lid_notifier);
+static struct acpi_device *lid_device;
+
 /* --------------------------------------------------------------------------
                               FS Interface (/proc)
    -------------------------------------------------------------------------- */
@@ -231,11 +234,38 @@ static int acpi_button_remove_fs(struct acpi_device *device)
 /* --------------------------------------------------------------------------
                                 Driver Interface
    -------------------------------------------------------------------------- */
+int acpi_lid_notifier_register(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&acpi_lid_notifier, nb);
+}
+EXPORT_SYMBOL(acpi_lid_notifier_register);
+
+int acpi_lid_notifier_unregister(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&acpi_lid_notifier, nb);
+}
+EXPORT_SYMBOL(acpi_lid_notifier_unregister);
+
+int acpi_lid_open(void)
+{
+	acpi_status status;
+	unsigned long long state;
+
+	status = acpi_evaluate_integer(lid_device->handle, "_LID", NULL,
+				       &state);
+	if (ACPI_FAILURE(status))
+		return -ENODEV;
+
+	return !!state;
+}
+EXPORT_SYMBOL(acpi_lid_open);
+
 static int acpi_lid_send_state(struct acpi_device *device)
 {
 	struct acpi_button *button = acpi_driver_data(device);
 	unsigned long long state;
 	acpi_status status;
+	int ret;
 
 	status = acpi_evaluate_integer(device->handle, "_LID", NULL, &state);
 	if (ACPI_FAILURE(status))
@@ -244,7 +274,12 @@ static int acpi_lid_send_state(struct acpi_device *device)
 	/* input layer checks if event is redundant */
 	input_report_switch(button->input, SW_LID, !state);
 	input_sync(button->input);
-	return 0;
+
+	ret = blocking_notifier_call_chain(&acpi_lid_notifier, state, device);
+	if (ret == NOTIFY_DONE)
+		ret = blocking_notifier_call_chain(&acpi_lid_notifier, state,
+						   device);
+	return ret;
 }
 
 static void acpi_button_notify(struct acpi_device *device, u32 event)
@@ -366,8 +401,14 @@ static int acpi_button_add(struct acpi_device *device)
 	error = input_register_device(input);
 	if (error)
 		goto err_remove_fs;
-	if (button->type == ACPI_BUTTON_TYPE_LID)
+	if (button->type == ACPI_BUTTON_TYPE_LID) {
 		acpi_lid_send_state(device);
+		/*
+		 * This assumes there's only one lid device, or if there are
+		 * more we only care about the last one...
+		 */
+		lid_device = device;
+	}
 
 	if (device->wakeup.flags.valid) {
 		/* Button's GPE is run-wake GPE */
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 56071b67bed..5633b86e3ed 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -193,7 +193,7 @@ acpi_status __init acpi_os_initialize(void)
 
 static void bind_to_cpu0(struct work_struct *work)
 {
-	set_cpus_allowed(current, cpumask_of_cpu(0));
+	set_cpus_allowed_ptr(current, cpumask_of(0));
 	kfree(work);
 }
 
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 11088cf1031..8ba0ed0b9dd 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -511,7 +511,7 @@ int acpi_processor_preregister_performance(
 	struct acpi_processor *match_pr;
 	struct acpi_psd_package *match_pdomain;
 
-	if (!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))
+	if (!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))
 		return -ENOMEM;
 
 	mutex_lock(&performance_mutex);
@@ -558,7 +558,6 @@ int acpi_processor_preregister_performance(
 	 * Now that we have _PSD data from all CPUs, lets setup P-state 
 	 * domain info.
 	 */
-	cpumask_clear(covered_cpus);
 	for_each_possible_cpu(i) {
 		pr = per_cpu(processors, i);
 		if (!pr)
diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c
index ce7cf3bc510..4c6c14c1e30 100644
--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -77,7 +77,7 @@ static int acpi_processor_update_tsd_coord(void)
 	struct acpi_tsd_package *pdomain, *match_pdomain;
 	struct acpi_processor_throttling *pthrottling, *match_pthrottling;
 
-	if (!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))
+	if (!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))
 		return -ENOMEM;
 
 	/*
@@ -105,7 +105,6 @@ static int acpi_processor_update_tsd_coord(void)
 	if (retval)
 		goto err_ret;
 
-	cpumask_clear(covered_cpus);
 	for_each_possible_cpu(i) {
 		pr = per_cpu(processors, i);
 		if (!pr)
diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index 2de64065aa1..29e66d603d3 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -790,11 +790,15 @@ he_init_group(struct he_dev *he_dev, int group)
 	he_dev->rbps_base = pci_alloc_consistent(he_dev->pci_dev,
 		CONFIG_RBPS_SIZE * sizeof(struct he_rbp), &he_dev->rbps_phys);
 	if (he_dev->rbps_base == NULL) {
-		hprintk("failed to alloc rbps\n");
-		return -ENOMEM;
+		hprintk("failed to alloc rbps_base\n");
+		goto out_destroy_rbps_pool;
 	}
 	memset(he_dev->rbps_base, 0, CONFIG_RBPS_SIZE * sizeof(struct he_rbp));
 	he_dev->rbps_virt = kmalloc(CONFIG_RBPS_SIZE * sizeof(struct he_virt), GFP_KERNEL);
+	if (he_dev->rbps_virt == NULL) {
+		hprintk("failed to alloc rbps_virt\n");
+		goto out_free_rbps_base;
+	}
 
 	for (i = 0; i < CONFIG_RBPS_SIZE; ++i) {
 		dma_addr_t dma_handle;
@@ -802,7 +806,7 @@ he_init_group(struct he_dev *he_dev, int group)
 
 		cpuaddr = pci_pool_alloc(he_dev->rbps_pool, GFP_KERNEL|GFP_DMA, &dma_handle);
 		if (cpuaddr == NULL)
-			return -ENOMEM;
+			goto out_free_rbps_virt;
 
 		he_dev->rbps_virt[i].virt = cpuaddr;
 		he_dev->rbps_base[i].status = RBP_LOANED | RBP_SMALLBUF | (i << RBP_INDEX_OFF);
@@ -827,17 +831,21 @@ he_init_group(struct he_dev *he_dev, int group)
 			CONFIG_RBPL_BUFSIZE, 8, 0);
 	if (he_dev->rbpl_pool == NULL) {
 		hprintk("unable to create rbpl pool\n");
-		return -ENOMEM;
+		goto out_free_rbps_virt;
 	}
 
 	he_dev->rbpl_base = pci_alloc_consistent(he_dev->pci_dev,
 		CONFIG_RBPL_SIZE * sizeof(struct he_rbp), &he_dev->rbpl_phys);
 	if (he_dev->rbpl_base == NULL) {
-		hprintk("failed to alloc rbpl\n");
-		return -ENOMEM;
+		hprintk("failed to alloc rbpl_base\n");
+		goto out_destroy_rbpl_pool;
 	}
 	memset(he_dev->rbpl_base, 0, CONFIG_RBPL_SIZE * sizeof(struct he_rbp));
 	he_dev->rbpl_virt = kmalloc(CONFIG_RBPL_SIZE * sizeof(struct he_virt), GFP_KERNEL);
+	if (he_dev->rbpl_virt == NULL) {
+		hprintk("failed to alloc rbpl_virt\n");
+		goto out_free_rbpl_base;
+	}
 
 	for (i = 0; i < CONFIG_RBPL_SIZE; ++i) {
 		dma_addr_t dma_handle;
@@ -845,7 +853,7 @@ he_init_group(struct he_dev *he_dev, int group)
 
 		cpuaddr = pci_pool_alloc(he_dev->rbpl_pool, GFP_KERNEL|GFP_DMA, &dma_handle);
 		if (cpuaddr == NULL)
-			return -ENOMEM;
+			goto out_free_rbpl_virt;
 
 		he_dev->rbpl_virt[i].virt = cpuaddr;
 		he_dev->rbpl_base[i].status = RBP_LOANED | (i << RBP_INDEX_OFF);
@@ -870,7 +878,7 @@ he_init_group(struct he_dev *he_dev, int group)
 		CONFIG_RBRQ_SIZE * sizeof(struct he_rbrq), &he_dev->rbrq_phys);
 	if (he_dev->rbrq_base == NULL) {
 		hprintk("failed to allocate rbrq\n");
-		return -ENOMEM;
+		goto out_free_rbpl_virt;
 	}
 	memset(he_dev->rbrq_base, 0, CONFIG_RBRQ_SIZE * sizeof(struct he_rbrq));
 
@@ -894,7 +902,7 @@ he_init_group(struct he_dev *he_dev, int group)
 		CONFIG_TBRQ_SIZE * sizeof(struct he_tbrq), &he_dev->tbrq_phys);
 	if (he_dev->tbrq_base == NULL) {
 		hprintk("failed to allocate tbrq\n");
-		return -ENOMEM;
+		goto out_free_rbpq_base;
 	}
 	memset(he_dev->tbrq_base, 0, CONFIG_TBRQ_SIZE * sizeof(struct he_tbrq));
 
@@ -906,6 +914,39 @@ he_init_group(struct he_dev *he_dev, int group)
 	he_writel(he_dev, CONFIG_TBRQ_THRESH, G0_TBRQ_THRESH + (group * 16));
 
 	return 0;
+
+out_free_rbpq_base:
+	pci_free_consistent(he_dev->pci_dev, CONFIG_RBRQ_SIZE *
+			sizeof(struct he_rbrq), he_dev->rbrq_base,
+			he_dev->rbrq_phys);
+	i = CONFIG_RBPL_SIZE;
+out_free_rbpl_virt:
+	while (--i)
+		pci_pool_free(he_dev->rbps_pool, he_dev->rbpl_virt[i].virt,
+				he_dev->rbps_base[i].phys);
+	kfree(he_dev->rbpl_virt);
+
+out_free_rbpl_base:
+	pci_free_consistent(he_dev->pci_dev, CONFIG_RBPL_SIZE *
+			sizeof(struct he_rbp), he_dev->rbpl_base,
+			he_dev->rbpl_phys);
+out_destroy_rbpl_pool:
+	pci_pool_destroy(he_dev->rbpl_pool);
+
+	i = CONFIG_RBPL_SIZE;
+out_free_rbps_virt:
+	while (--i)
+		pci_pool_free(he_dev->rbpl_pool, he_dev->rbps_virt[i].virt,
+				he_dev->rbpl_base[i].phys);
+	kfree(he_dev->rbps_virt);
+
+out_free_rbps_base:
+	pci_free_consistent(he_dev->pci_dev, CONFIG_RBPS_SIZE *
+			sizeof(struct he_rbp), he_dev->rbps_base,
+			he_dev->rbps_phys);
+out_destroy_rbps_pool:
+	pci_pool_destroy(he_dev->rbps_pool);
+	return -ENOMEM;
 }
 
 static int __devinit
diff --git a/drivers/atm/solos-attrlist.c b/drivers/atm/solos-attrlist.c
index efa2808dd94..1a9332e4efe 100644
--- a/drivers/atm/solos-attrlist.c
+++ b/drivers/atm/solos-attrlist.c
@@ -25,6 +25,10 @@ SOLOS_ATTR_RO(RSCorrectedErrorsUp)
 SOLOS_ATTR_RO(RSUnCorrectedErrorsUp)
 SOLOS_ATTR_RO(InterleaveRDn)
 SOLOS_ATTR_RO(InterleaveRUp)
+SOLOS_ATTR_RO(BisRDn)
+SOLOS_ATTR_RO(BisRUp)
+SOLOS_ATTR_RO(INPdown)
+SOLOS_ATTR_RO(INPup)
 SOLOS_ATTR_RO(ShowtimeStart)
 SOLOS_ATTR_RO(ATURVendor)
 SOLOS_ATTR_RO(ATUCCountry)
@@ -62,6 +66,13 @@ SOLOS_ATTR_RW(Defaults)
 SOLOS_ATTR_RW(LineMode)
 SOLOS_ATTR_RW(Profile)
 SOLOS_ATTR_RW(DetectNoise)
+SOLOS_ATTR_RW(BisAForceSNRMarginDn)
+SOLOS_ATTR_RW(BisMForceSNRMarginDn)
+SOLOS_ATTR_RW(BisAMaxMargin)
+SOLOS_ATTR_RW(BisMMaxMargin)
+SOLOS_ATTR_RW(AnnexAForceSNRMarginDn)
+SOLOS_ATTR_RW(AnnexAMaxMargin)
+SOLOS_ATTR_RW(AnnexMMaxMargin)
 SOLOS_ATTR_RO(SupportedAnnexes)
 SOLOS_ATTR_RO(Status)
 SOLOS_ATTR_RO(TotalStart)
diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
index 307321b32cb..c5f5186d62a 100644
--- a/drivers/atm/solos-pci.c
+++ b/drivers/atm/solos-pci.c
@@ -59,21 +59,29 @@
 #define RX_DMA_ADDR(port)	(0x30 + (4 * (port)))
 
 #define DATA_RAM_SIZE	32768
-#define BUF_SIZE	4096
+#define BUF_SIZE	2048
+#define OLD_BUF_SIZE	4096 /* For FPGA versions <= 2*/
 #define FPGA_PAGE	528 /* FPGA flash page size*/
 #define SOLOS_PAGE	512 /* Solos flash page size*/
 #define FPGA_BLOCK	(FPGA_PAGE * 8) /* FPGA flash block size*/
 #define SOLOS_BLOCK	(SOLOS_PAGE * 8) /* Solos flash block size*/
 
-#define RX_BUF(card, nr) ((card->buffers) + (nr)*BUF_SIZE*2)
-#define TX_BUF(card, nr) ((card->buffers) + (nr)*BUF_SIZE*2 + BUF_SIZE)
+#define RX_BUF(card, nr) ((card->buffers) + (nr)*(card->buffer_size)*2)
+#define TX_BUF(card, nr) ((card->buffers) + (nr)*(card->buffer_size)*2 + (card->buffer_size))
+#define FLASH_BUF ((card->buffers) + 4*(card->buffer_size)*2)
 
 #define RX_DMA_SIZE	2048
 
+#define FPGA_VERSION(a,b) (((a) << 8) + (b))
+#define LEGACY_BUFFERS	2
+#define DMA_SUPPORTED	4
+
 static int reset = 0;
 static int atmdebug = 0;
 static int firmware_upgrade = 0;
 static int fpga_upgrade = 0;
+static int db_firmware_upgrade = 0;
+static int db_fpga_upgrade = 0;
 
 struct pkt_hdr {
 	__le16 size;
@@ -116,6 +124,8 @@ struct solos_card {
 	wait_queue_head_t param_wq;
 	wait_queue_head_t fw_wq;
 	int using_dma;
+	int fpga_version;
+	int buffer_size;
 };
 
 
@@ -136,10 +146,14 @@ MODULE_PARM_DESC(reset, "Reset Solos chips on startup");
 MODULE_PARM_DESC(atmdebug, "Print ATM data");
 MODULE_PARM_DESC(firmware_upgrade, "Initiate Solos firmware upgrade");
 MODULE_PARM_DESC(fpga_upgrade, "Initiate FPGA upgrade");
+MODULE_PARM_DESC(db_firmware_upgrade, "Initiate daughter board Solos firmware upgrade");
+MODULE_PARM_DESC(db_fpga_upgrade, "Initiate daughter board FPGA upgrade");
 module_param(reset, int, 0444);
 module_param(atmdebug, int, 0644);
 module_param(firmware_upgrade, int, 0444);
 module_param(fpga_upgrade, int, 0444);
+module_param(db_firmware_upgrade, int, 0444);
+module_param(db_fpga_upgrade, int, 0444);
 
 static void fpga_queue(struct solos_card *card, int port, struct sk_buff *skb,
 		       struct atm_vcc *vcc);
@@ -517,10 +531,32 @@ static int flash_upgrade(struct solos_card *card, int chip)
 	if (chip == 0) {
 		fw_name = "solos-FPGA.bin";
 		blocksize = FPGA_BLOCK;
-	} else {
+	} 
+	
+	if (chip == 1) {
 		fw_name = "solos-Firmware.bin";
 		blocksize = SOLOS_BLOCK;
 	}
+	
+	if (chip == 2){
+		if (card->fpga_version > LEGACY_BUFFERS){
+			fw_name = "solos-db-FPGA.bin";
+			blocksize = FPGA_BLOCK;
+		} else {
+			dev_info(&card->dev->dev, "FPGA version doesn't support daughter board upgrades\n");
+			return -EPERM;
+		}
+	}
+	
+	if (chip == 3){
+		if (card->fpga_version > LEGACY_BUFFERS){
+			fw_name = "solos-Firmware.bin";
+			blocksize = SOLOS_BLOCK;
+		} else {
+		dev_info(&card->dev->dev, "FPGA version doesn't support daughter board upgrades\n");
+		return -EPERM;
+		}
+	}
 
 	if (request_firmware(&fw, fw_name, &card->dev->dev))
 		return -ENOENT;
@@ -536,8 +572,10 @@ static int flash_upgrade(struct solos_card *card, int chip)
 	data32 = ioread32(card->config_regs + FPGA_MODE); 
 
 	/* Set mode to Chip Erase */
-	dev_info(&card->dev->dev, "Set FPGA Flash mode to %s Chip Erase\n",
-		 chip?"Solos":"FPGA");
+	if(chip == 0 || chip == 2)
+		dev_info(&card->dev->dev, "Set FPGA Flash mode to FPGA Chip Erase\n");
+	if(chip == 1 || chip == 3)
+		dev_info(&card->dev->dev, "Set FPGA Flash mode to Solos Chip Erase\n");
 	iowrite32((chip * 2), card->config_regs + FLASH_MODE);
 
 
@@ -557,7 +595,10 @@ static int flash_upgrade(struct solos_card *card, int chip)
 		/* Copy block to buffer, swapping each 16 bits */
 		for(i = 0; i < blocksize; i += 4) {
 			uint32_t word = swahb32p((uint32_t *)(fw->data + offset + i));
-			iowrite32(word, RX_BUF(card, 3) + i);
+			if(card->fpga_version > LEGACY_BUFFERS)
+				iowrite32(word, FLASH_BUF + i);
+			else
+				iowrite32(word, RX_BUF(card, 3) + i);
 		}
 
 		/* Specify block number and then trigger flash write */
@@ -630,6 +671,10 @@ void solos_bh(unsigned long card_arg)
 				memcpy_fromio(header, RX_BUF(card, port), sizeof(*header));
 
 				size = le16_to_cpu(header->size);
+				if (size > (card->buffer_size - sizeof(*header))){
+					dev_warn(&card->dev->dev, "Invalid buffer size\n");
+					continue;
+				}
 
 				skb = alloc_skb(size + 1, GFP_ATOMIC);
 				if (!skb) {
@@ -1094,12 +1139,18 @@ static int fpga_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	fpga_ver = (data32 & 0x0000FFFF);
 	major_ver = ((data32 & 0xFF000000) >> 24);
 	minor_ver = ((data32 & 0x00FF0000) >> 16);
+	card->fpga_version = FPGA_VERSION(major_ver,minor_ver);
+	if (card->fpga_version > LEGACY_BUFFERS)
+		card->buffer_size = BUF_SIZE;
+	else
+		card->buffer_size = OLD_BUF_SIZE;
 	dev_info(&dev->dev, "Solos FPGA Version %d.%02d svn-%d\n",
 		 major_ver, minor_ver, fpga_ver);
 
-	if (0 && fpga_ver > 27)
+	if (card->fpga_version >= DMA_SUPPORTED){
 		card->using_dma = 1;
-	else {
+	} else {
+		card->using_dma = 0;
 		/* Set RX empty flag for all ports */
 		iowrite32(0xF0, card->config_regs + FLAGS_ADDR);
 	}
@@ -1131,6 +1182,12 @@ static int fpga_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	if (firmware_upgrade)
 		flash_upgrade(card, 1);
 
+	if (db_fpga_upgrade)
+		flash_upgrade(card, 2);
+
+	if (db_firmware_upgrade)
+		flash_upgrade(card, 3);
+
 	err = atm_init(card);
 	if (err)
 		goto out_free_irq;
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 71d1b9bab70..614da5b8613 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -3412,7 +3412,7 @@ static int cdrom_print_info(const char *header, int val, char *info,
 	return 0;
 }
 
-static int cdrom_sysctl_info(ctl_table *ctl, int write, struct file * filp,
+static int cdrom_sysctl_info(ctl_table *ctl, int write,
                            void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int pos;
@@ -3489,7 +3489,7 @@ static int cdrom_sysctl_info(ctl_table *ctl, int write, struct file * filp,
 		goto done;
 doit:
 	mutex_unlock(&cdrom_mutex);
-	return proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	return proc_dostring(ctl, write, buffer, lenp, ppos);
 done:
 	printk(KERN_INFO "cdrom: info buffer too small\n");
 	goto doit;
@@ -3525,12 +3525,12 @@ static void cdrom_update_settings(void)
 	mutex_unlock(&cdrom_mutex);
 }
 
-static int cdrom_sysctl_handler(ctl_table *ctl, int write, struct file * filp,
+static int cdrom_sysctl_handler(ctl_table *ctl, int write,
 				void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 	
-	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write) {
 	
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 6a06913b01d..08a6f50ae79 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -1087,6 +1087,14 @@ config MMTIMER
 	  The mmtimer device allows direct userspace access to the
 	  Altix system timer.
 
+config UV_MMTIMER
+	tristate "UV_MMTIMER Memory mapped RTC for SGI UV"
+	depends on X86_UV
+	default m
+	help
+	  The uv_mmtimer device allows direct userspace access to the
+	  UV system timer.
+
 source "drivers/char/tpm/Kconfig"
 
 config TELCLOCK
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 66f779ad4f4..19a79dd79ee 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -58,6 +58,7 @@ obj-$(CONFIG_RAW_DRIVER)	+= raw.o
 obj-$(CONFIG_SGI_SNSC)		+= snsc.o snsc_event.o
 obj-$(CONFIG_MSPEC)		+= mspec.o
 obj-$(CONFIG_MMTIMER)		+= mmtimer.o
+obj-$(CONFIG_UV_MMTIMER)	+= uv_mmtimer.o
 obj-$(CONFIG_VIOTAPE)		+= viotape.o
 obj-$(CONFIG_HVCS)		+= hvcs.o
 obj-$(CONFIG_IBM_BSR)		+= bsr.o
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index 1540e693d91..4068467ce7b 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -46,6 +46,8 @@
 #define PCI_DEVICE_ID_INTEL_Q35_IG          0x29B2
 #define PCI_DEVICE_ID_INTEL_Q33_HB          0x29D0
 #define PCI_DEVICE_ID_INTEL_Q33_IG          0x29D2
+#define PCI_DEVICE_ID_INTEL_B43_HB          0x2E40
+#define PCI_DEVICE_ID_INTEL_B43_IG          0x2E42
 #define PCI_DEVICE_ID_INTEL_GM45_HB         0x2A40
 #define PCI_DEVICE_ID_INTEL_GM45_IG         0x2A42
 #define PCI_DEVICE_ID_INTEL_IGD_E_HB        0x2E00
@@ -91,6 +93,7 @@
 		agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_G45_HB || \
 		agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_GM45_HB || \
 		agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_G41_HB || \
+		agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_B43_HB || \
 		agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IGDNG_D_HB || \
 		agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IGDNG_M_HB || \
 		agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IGDNG_MA_HB)
@@ -804,23 +807,39 @@ static void intel_i830_setup_flush(void)
 	if (!intel_private.i8xx_page)
 		return;
 
-	/* make page uncached */
-	map_page_into_agp(intel_private.i8xx_page);
-
 	intel_private.i8xx_flush_page = kmap(intel_private.i8xx_page);
 	if (!intel_private.i8xx_flush_page)
 		intel_i830_fini_flush();
 }
 
+static void
+do_wbinvd(void *null)
+{
+	wbinvd();
+}
+
+/* The chipset_flush interface needs to get data that has already been
+ * flushed out of the CPU all the way out to main memory, because the GPU
+ * doesn't snoop those buffers.
+ *
+ * The 8xx series doesn't have the same lovely interface for flushing the
+ * chipset write buffers that the later chips do. According to the 865
+ * specs, it's 64 octwords, or 1KB.  So, to get those previous things in
+ * that buffer out, we just fill 1KB and clflush it out, on the assumption
+ * that it'll push whatever was in there out.  It appears to work.
+ */
 static void intel_i830_chipset_flush(struct agp_bridge_data *bridge)
 {
 	unsigned int *pg = intel_private.i8xx_flush_page;
-	int i;
 
-	for (i = 0; i < 256; i += 2)
-		*(pg + i) = i;
+	memset(pg, 0, 1024);
 
-	wmb();
+	if (cpu_has_clflush) {
+		clflush_cache_range(pg, 1024);
+	} else {
+		if (on_each_cpu(do_wbinvd, NULL, 1) != 0)
+			printk(KERN_ERR "Timed out waiting for cache flush.\n");
+	}
 }
 
 /* The intel i830 automatically initializes the agp aperture during POST.
@@ -1341,6 +1360,7 @@ static void intel_i965_get_gtt_range(int *gtt_offset, int *gtt_size)
 	case PCI_DEVICE_ID_INTEL_Q45_HB:
 	case PCI_DEVICE_ID_INTEL_G45_HB:
 	case PCI_DEVICE_ID_INTEL_G41_HB:
+	case PCI_DEVICE_ID_INTEL_B43_HB:
 	case PCI_DEVICE_ID_INTEL_IGDNG_D_HB:
 	case PCI_DEVICE_ID_INTEL_IGDNG_M_HB:
 	case PCI_DEVICE_ID_INTEL_IGDNG_MA_HB:
@@ -2335,6 +2355,8 @@ static const struct intel_driver_description {
 	    "Q45/Q43", NULL, &intel_i965_driver },
 	{ PCI_DEVICE_ID_INTEL_G45_HB, PCI_DEVICE_ID_INTEL_G45_IG, 0,
 	    "G45/G43", NULL, &intel_i965_driver },
+	{ PCI_DEVICE_ID_INTEL_B43_HB, PCI_DEVICE_ID_INTEL_B43_IG, 0,
+	    "B43", NULL, &intel_i965_driver },
 	{ PCI_DEVICE_ID_INTEL_G41_HB, PCI_DEVICE_ID_INTEL_G41_IG, 0,
 	    "G41", NULL, &intel_i965_driver },
 	{ PCI_DEVICE_ID_INTEL_IGDNG_D_HB, PCI_DEVICE_ID_INTEL_IGDNG_D_IG, 0,
@@ -2535,6 +2557,7 @@ static struct pci_device_id agp_intel_pci_table[] = {
 	ID(PCI_DEVICE_ID_INTEL_Q45_HB),
 	ID(PCI_DEVICE_ID_INTEL_G45_HB),
 	ID(PCI_DEVICE_ID_INTEL_G41_HB),
+	ID(PCI_DEVICE_ID_INTEL_B43_HB),
 	ID(PCI_DEVICE_ID_INTEL_IGDNG_D_HB),
 	ID(PCI_DEVICE_ID_INTEL_IGDNG_M_HB),
 	ID(PCI_DEVICE_ID_INTEL_IGDNG_MA_HB),
diff --git a/drivers/char/bfin-otp.c b/drivers/char/bfin-otp.c
index 0a01329451e..e3dd24bff51 100644
--- a/drivers/char/bfin-otp.c
+++ b/drivers/char/bfin-otp.c
@@ -1,8 +1,7 @@
 /*
  * Blackfin On-Chip OTP Memory Interface
- *  Supports BF52x/BF54x
  *
- * Copyright 2007-2008 Analog Devices Inc.
+ * Copyright 2007-2009 Analog Devices Inc.
  *
  * Enter bugs at http://blackfin.uclinux.org/
  *
@@ -17,8 +16,10 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/types.h>
+#include <mtd/mtd-abi.h>
 
 #include <asm/blackfin.h>
+#include <asm/bfrom.h>
 #include <asm/uaccess.h>
 
 #define stamp(fmt, args...) pr_debug("%s:%i: " fmt "\n", __func__, __LINE__, ## args)
@@ -30,39 +31,6 @@
 
 static DEFINE_MUTEX(bfin_otp_lock);
 
-/* OTP Boot ROM functions */
-#define _BOOTROM_OTP_COMMAND           0xEF000018
-#define _BOOTROM_OTP_READ              0xEF00001A
-#define _BOOTROM_OTP_WRITE             0xEF00001C
-
-static u32 (* const otp_command)(u32 command, u32 value) = (void *)_BOOTROM_OTP_COMMAND;
-static u32 (* const otp_read)(u32 page, u32 flags, u64 *page_content) = (void *)_BOOTROM_OTP_READ;
-static u32 (* const otp_write)(u32 page, u32 flags, u64 *page_content) = (void *)_BOOTROM_OTP_WRITE;
-
-/* otp_command(): defines for "command" */
-#define OTP_INIT             0x00000001
-#define OTP_CLOSE            0x00000002
-
-/* otp_{read,write}(): defines for "flags" */
-#define OTP_LOWER_HALF       0x00000000 /* select upper/lower 64-bit half (bit 0) */
-#define OTP_UPPER_HALF       0x00000001
-#define OTP_NO_ECC           0x00000010 /* do not use ECC */
-#define OTP_LOCK             0x00000020 /* sets page protection bit for page */
-#define OTP_ACCESS_READ      0x00001000
-#define OTP_ACCESS_READWRITE 0x00002000
-
-/* Return values for all functions */
-#define OTP_SUCCESS          0x00000000
-#define OTP_MASTER_ERROR     0x001
-#define OTP_WRITE_ERROR      0x003
-#define OTP_READ_ERROR       0x005
-#define OTP_ACC_VIO_ERROR    0x009
-#define OTP_DATA_MULT_ERROR  0x011
-#define OTP_ECC_MULT_ERROR   0x021
-#define OTP_PREV_WR_ERROR    0x041
-#define OTP_DATA_SB_WARN     0x100
-#define OTP_ECC_SB_WARN      0x200
-
 /**
  *	bfin_otp_read - Read OTP pages
  *
@@ -86,9 +54,11 @@ static ssize_t bfin_otp_read(struct file *file, char __user *buff, size_t count,
 	page = *pos / (sizeof(u64) * 2);
 	while (bytes_done < count) {
 		flags = (*pos % (sizeof(u64) * 2) ? OTP_UPPER_HALF : OTP_LOWER_HALF);
-		stamp("processing page %i (%s)", page, (flags == OTP_UPPER_HALF ? "upper" : "lower"));
-		ret = otp_read(page, flags, &content);
+		stamp("processing page %i (0x%x:%s)", page, flags,
+			(flags & OTP_UPPER_HALF ? "upper" : "lower"));
+		ret = bfrom_OtpRead(page, flags, &content);
 		if (ret & OTP_MASTER_ERROR) {
+			stamp("error from otp: 0x%x", ret);
 			bytes_done = -EIO;
 			break;
 		}
@@ -96,7 +66,7 @@ static ssize_t bfin_otp_read(struct file *file, char __user *buff, size_t count,
 			bytes_done = -EFAULT;
 			break;
 		}
-		if (flags == OTP_UPPER_HALF)
+		if (flags & OTP_UPPER_HALF)
 			++page;
 		bytes_done += sizeof(content);
 		*pos += sizeof(content);
@@ -108,14 +78,53 @@ static ssize_t bfin_otp_read(struct file *file, char __user *buff, size_t count,
 }
 
 #ifdef CONFIG_BFIN_OTP_WRITE_ENABLE
+static bool allow_writes;
+
+/**
+ *	bfin_otp_init_timing - setup OTP timing parameters
+ *
+ *	Required before doing any write operation.  Algorithms from HRM.
+ */
+static u32 bfin_otp_init_timing(void)
+{
+	u32 tp1, tp2, tp3, timing;
+
+	tp1 = get_sclk() / 1000000;
+	tp2 = (2 * get_sclk() / 10000000) << 8;
+	tp3 = (0x1401) << 15;
+	timing = tp1 | tp2 | tp3;
+	if (bfrom_OtpCommand(OTP_INIT, timing))
+		return 0;
+
+	return timing;
+}
+
+/**
+ *	bfin_otp_deinit_timing - set timings to only allow reads
+ *
+ *	Should be called after all writes are done.
+ */
+static void bfin_otp_deinit_timing(u32 timing)
+{
+	/* mask bits [31:15] so that any attempts to write fail */
+	bfrom_OtpCommand(OTP_CLOSE, 0);
+	bfrom_OtpCommand(OTP_INIT, timing & ~(-1 << 15));
+	bfrom_OtpCommand(OTP_CLOSE, 0);
+}
+
 /**
- *	bfin_otp_write - Write OTP pages
+ *	bfin_otp_write - write OTP pages
  *
  *	All writes must be in half page chunks (half page == 64 bits).
  */
 static ssize_t bfin_otp_write(struct file *filp, const char __user *buff, size_t count, loff_t *pos)
 {
-	stampit();
+	ssize_t bytes_done;
+	u32 timing, page, base_flags, flags, ret;
+	u64 content;
+
+	if (!allow_writes)
+		return -EACCES;
 
 	if (count % sizeof(u64))
 		return -EMSGSIZE;
@@ -123,20 +132,96 @@ static ssize_t bfin_otp_write(struct file *filp, const char __user *buff, size_t
 	if (mutex_lock_interruptible(&bfin_otp_lock))
 		return -ERESTARTSYS;
 
-	/* need otp_init() documentation before this can be implemented */
+	stampit();
+
+	timing = bfin_otp_init_timing();
+	if (timing == 0) {
+		mutex_unlock(&bfin_otp_lock);
+		return -EIO;
+	}
+
+	base_flags = OTP_CHECK_FOR_PREV_WRITE;
+
+	bytes_done = 0;
+	page = *pos / (sizeof(u64) * 2);
+	while (bytes_done < count) {
+		flags = base_flags | (*pos % (sizeof(u64) * 2) ? OTP_UPPER_HALF : OTP_LOWER_HALF);
+		stamp("processing page %i (0x%x:%s) from %p", page, flags,
+			(flags & OTP_UPPER_HALF ? "upper" : "lower"), buff + bytes_done);
+		if (copy_from_user(&content, buff + bytes_done, sizeof(content))) {
+			bytes_done = -EFAULT;
+			break;
+		}
+		ret = bfrom_OtpWrite(page, flags, &content);
+		if (ret & OTP_MASTER_ERROR) {
+			stamp("error from otp: 0x%x", ret);
+			bytes_done = -EIO;
+			break;
+		}
+		if (flags & OTP_UPPER_HALF)
+			++page;
+		bytes_done += sizeof(content);
+		*pos += sizeof(content);
+	}
+
+	bfin_otp_deinit_timing(timing);
 
 	mutex_unlock(&bfin_otp_lock);
 
+	return bytes_done;
+}
+
+static long bfin_otp_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
+{
+	stampit();
+
+	switch (cmd) {
+	case OTPLOCK: {
+		u32 timing;
+		int ret = -EIO;
+
+		if (!allow_writes)
+			return -EACCES;
+
+		if (mutex_lock_interruptible(&bfin_otp_lock))
+			return -ERESTARTSYS;
+
+		timing = bfin_otp_init_timing();
+		if (timing) {
+			u32 otp_result = bfrom_OtpWrite(arg, OTP_LOCK, NULL);
+			stamp("locking page %lu resulted in 0x%x", arg, otp_result);
+			if (!(otp_result & OTP_MASTER_ERROR))
+				ret = 0;
+
+			bfin_otp_deinit_timing(timing);
+		}
+
+		mutex_unlock(&bfin_otp_lock);
+
+		return ret;
+	}
+
+	case MEMLOCK:
+		allow_writes = false;
+		return 0;
+
+	case MEMUNLOCK:
+		allow_writes = true;
+		return 0;
+	}
+
 	return -EINVAL;
 }
 #else
 # define bfin_otp_write NULL
+# define bfin_otp_ioctl NULL
 #endif
 
 static struct file_operations bfin_otp_fops = {
-	.owner    = THIS_MODULE,
-	.read     = bfin_otp_read,
-	.write    = bfin_otp_write,
+	.owner          = THIS_MODULE,
+	.unlocked_ioctl = bfin_otp_ioctl,
+	.read           = bfin_otp_read,
+	.write          = bfin_otp_write,
 };
 
 static struct miscdevice bfin_otp_misc_device = {
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 4a9f3492b92..70a770ac013 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -166,9 +166,8 @@ static irqreturn_t hpet_interrupt(int irq, void *data)
 		unsigned long m, t;
 
 		t = devp->hd_ireqfreq;
-		m = read_counter(&devp->hd_hpet->hpet_mc);
-		write_counter(t + m + devp->hd_hpets->hp_delta,
-			      &devp->hd_timer->hpet_compare);
+		m = read_counter(&devp->hd_timer->hpet_compare);
+		write_counter(t + m, &devp->hd_timer->hpet_compare);
 	}
 
 	if (devp->hd_flags & HPET_SHARED_IRQ)
@@ -504,21 +503,25 @@ static int hpet_ioctl_ieon(struct hpet_dev *devp)
 	g = v | Tn_32MODE_CNF_MASK | Tn_INT_ENB_CNF_MASK;
 
 	if (devp->hd_flags & HPET_PERIODIC) {
-		write_counter(t, &timer->hpet_compare);
 		g |= Tn_TYPE_CNF_MASK;
-		v |= Tn_TYPE_CNF_MASK;
-		writeq(v, &timer->hpet_config);
-		v |= Tn_VAL_SET_CNF_MASK;
+		v |= Tn_TYPE_CNF_MASK | Tn_VAL_SET_CNF_MASK;
 		writeq(v, &timer->hpet_config);
 		local_irq_save(flags);
 
-		/* NOTE:  what we modify here is a hidden accumulator
+		/*
+		 * NOTE: First we modify the hidden accumulator
 		 * register supported by periodic-capable comparators.
 		 * We never want to modify the (single) counter; that
-		 * would affect all the comparators.
+		 * would affect all the comparators. The value written
+		 * is the counter value when the first interrupt is due.
 		 */
 		m = read_counter(&hpet->hpet_mc);
 		write_counter(t + m + hpetp->hp_delta, &timer->hpet_compare);
+		/*
+		 * Then we modify the comparator, indicating the period
+		 * for subsequent interrupt.
+		 */
+		write_counter(t, &timer->hpet_compare);
 	} else {
 		local_irq_save(flags);
 		m = read_counter(&hpet->hpet_mc);
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 0aede1d6a9e..6c8b65d069e 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -690,7 +690,7 @@ static ssize_t read_zero(struct file * file, char __user * buf,
 
 		if (chunk > PAGE_SIZE)
 			chunk = PAGE_SIZE;	/* Just for latency reasons */
-		unwritten = clear_user(buf, chunk);
+		unwritten = __clear_user(buf, chunk);
 		written += chunk - unwritten;
 		if (unwritten)
 			break;
diff --git a/drivers/char/mwave/mwavedd.c b/drivers/char/mwave/mwavedd.c
index 94ad2c3bfc4..a4ec50c9507 100644
--- a/drivers/char/mwave/mwavedd.c
+++ b/drivers/char/mwave/mwavedd.c
@@ -281,12 +281,6 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd,
 		case IOCTL_MW_REGISTER_IPC: {
 			unsigned int ipcnum = (unsigned int) ioarg;
 	
-			PRINTK_3(TRACE_MWAVE,
-				"mwavedd::mwave_ioctl IOCTL_MW_REGISTER_IPC"
-				" ipcnum %x entry usIntCount %x\n",
-				ipcnum,
-				pDrvData->IPCs[ipcnum].usIntCount);
-	
 			if (ipcnum >= ARRAY_SIZE(pDrvData->IPCs)) {
 				PRINTK_ERROR(KERN_ERR_MWAVE
 						"mwavedd::mwave_ioctl:"
@@ -295,6 +289,12 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd,
 						ipcnum);
 				return -EINVAL;
 			}
+			PRINTK_3(TRACE_MWAVE,
+				"mwavedd::mwave_ioctl IOCTL_MW_REGISTER_IPC"
+				" ipcnum %x entry usIntCount %x\n",
+				ipcnum,
+				pDrvData->IPCs[ipcnum].usIntCount);
+
 			lock_kernel();
 			pDrvData->IPCs[ipcnum].bIsHere = FALSE;
 			pDrvData->IPCs[ipcnum].bIsEnabled = TRUE;
@@ -310,11 +310,6 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd,
 		case IOCTL_MW_GET_IPC: {
 			unsigned int ipcnum = (unsigned int) ioarg;
 	
-			PRINTK_3(TRACE_MWAVE,
-				"mwavedd::mwave_ioctl IOCTL_MW_GET_IPC"
-				" ipcnum %x, usIntCount %x\n",
-				ipcnum,
-				pDrvData->IPCs[ipcnum].usIntCount);
 			if (ipcnum >= ARRAY_SIZE(pDrvData->IPCs)) {
 				PRINTK_ERROR(KERN_ERR_MWAVE
 						"mwavedd::mwave_ioctl:"
@@ -322,6 +317,11 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd,
 						" Invalid ipcnum %x\n", ipcnum);
 				return -EINVAL;
 			}
+			PRINTK_3(TRACE_MWAVE,
+				"mwavedd::mwave_ioctl IOCTL_MW_GET_IPC"
+				" ipcnum %x, usIntCount %x\n",
+				ipcnum,
+				pDrvData->IPCs[ipcnum].usIntCount);
 	
 			lock_kernel();
 			if (pDrvData->IPCs[ipcnum].bIsEnabled == TRUE) {
diff --git a/drivers/char/random.c b/drivers/char/random.c
index d8a9255e1a3..04b505e5a5e 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1231,7 +1231,7 @@ static char sysctl_bootid[16];
  * as an ASCII string in the standard UUID format.  If accesses via the
  * sysctl system call, it is returned as 16 bytes of binary data.
  */
-static int proc_do_uuid(ctl_table *table, int write, struct file *filp,
+static int proc_do_uuid(ctl_table *table, int write,
 			void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	ctl_table fake_table;
@@ -1254,7 +1254,7 @@ static int proc_do_uuid(ctl_table *table, int write, struct file *filp,
 	fake_table.data = buf;
 	fake_table.maxlen = sizeof(buf);
 
-	return proc_dostring(&fake_table, write, filp, buffer, lenp, ppos);
+	return proc_dostring(&fake_table, write, buffer, lenp, ppos);
 }
 
 static int uuid_strategy(ctl_table *table,
diff --git a/drivers/char/rio/rioctrl.c b/drivers/char/rio/rioctrl.c
index eecee0f576d..74339559f0b 100644
--- a/drivers/char/rio/rioctrl.c
+++ b/drivers/char/rio/rioctrl.c
@@ -873,7 +873,7 @@ int riocontrol(struct rio_info *p, dev_t dev, int cmd, unsigned long arg, int su
 		/*
 		 ** It is important that the product code is an unsigned object!
 		 */
-		if (DownLoad.ProductCode > MAX_PRODUCT) {
+		if (DownLoad.ProductCode >= MAX_PRODUCT) {
 			rio_dprintk(RIO_DEBUG_CTRL, "RIO_DOWNLOAD: Bad product code %d passed\n", DownLoad.ProductCode);
 			p->RIOError.Error = NO_SUCH_PRODUCT;
 			return -ENXIO;
diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c
index 32b957efa42..45d58002b06 100644
--- a/drivers/char/tpm/tpm.c
+++ b/drivers/char/tpm/tpm.c
@@ -742,7 +742,7 @@ EXPORT_SYMBOL_GPL(tpm_pcr_read);
  * the module usage count.
  */
 #define TPM_ORD_PCR_EXTEND cpu_to_be32(20)
-#define EXTEND_PCR_SIZE 34
+#define EXTEND_PCR_RESULT_SIZE 34
 static struct tpm_input_header pcrextend_header = {
 	.tag = TPM_TAG_RQU_COMMAND,
 	.length = cpu_to_be32(34),
@@ -760,10 +760,9 @@ int tpm_pcr_extend(u32 chip_num, int pcr_idx, const u8 *hash)
 		return -ENODEV;
 
 	cmd.header.in = pcrextend_header;
-	BUG_ON(be32_to_cpu(cmd.header.in.length) > EXTEND_PCR_SIZE);
 	cmd.params.pcrextend_in.pcr_idx = cpu_to_be32(pcr_idx);
 	memcpy(cmd.params.pcrextend_in.hash, hash, TPM_DIGEST_SIZE);
-	rc = transmit_cmd(chip, &cmd, cmd.header.in.length,
+	rc = transmit_cmd(chip, &cmd, EXTEND_PCR_RESULT_SIZE,
 			  "attempting extend a PCR value");
 
 	module_put(chip->dev->driver->owner);
diff --git a/drivers/char/uv_mmtimer.c b/drivers/char/uv_mmtimer.c
new file mode 100644
index 00000000000..867b67be9f0
--- /dev/null
+++ b/drivers/char/uv_mmtimer.c
@@ -0,0 +1,216 @@
+/*
+ * Timer device implementation for SGI UV platform.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2009 Silicon Graphics, Inc.  All rights reserved.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/ioctl.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mmtimer.h>
+#include <linux/miscdevice.h>
+#include <linux/posix-timers.h>
+#include <linux/interrupt.h>
+#include <linux/time.h>
+#include <linux/math64.h>
+#include <linux/smp_lock.h>
+
+#include <asm/genapic.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+
+MODULE_AUTHOR("Dimitri Sivanich <sivanich@sgi.com>");
+MODULE_DESCRIPTION("SGI UV Memory Mapped RTC Timer");
+MODULE_LICENSE("GPL");
+
+/* name of the device, usually in /dev */
+#define UV_MMTIMER_NAME "mmtimer"
+#define UV_MMTIMER_DESC "SGI UV Memory Mapped RTC Timer"
+#define UV_MMTIMER_VERSION "1.0"
+
+static long uv_mmtimer_ioctl(struct file *file, unsigned int cmd,
+						unsigned long arg);
+static int uv_mmtimer_mmap(struct file *file, struct vm_area_struct *vma);
+
+/*
+ * Period in femtoseconds (10^-15 s)
+ */
+static unsigned long uv_mmtimer_femtoperiod;
+
+static const struct file_operations uv_mmtimer_fops = {
+	.owner = THIS_MODULE,
+	.mmap =	uv_mmtimer_mmap,
+	.unlocked_ioctl = uv_mmtimer_ioctl,
+};
+
+/**
+ * uv_mmtimer_ioctl - ioctl interface for /dev/uv_mmtimer
+ * @file: file structure for the device
+ * @cmd: command to execute
+ * @arg: optional argument to command
+ *
+ * Executes the command specified by @cmd.  Returns 0 for success, < 0 for
+ * failure.
+ *
+ * Valid commands:
+ *
+ * %MMTIMER_GETOFFSET - Should return the offset (relative to the start
+ * of the page where the registers are mapped) for the counter in question.
+ *
+ * %MMTIMER_GETRES - Returns the resolution of the clock in femto (10^-15)
+ * seconds
+ *
+ * %MMTIMER_GETFREQ - Copies the frequency of the clock in Hz to the address
+ * specified by @arg
+ *
+ * %MMTIMER_GETBITS - Returns the number of bits in the clock's counter
+ *
+ * %MMTIMER_MMAPAVAIL - Returns 1 if registers can be mmap'd into userspace
+ *
+ * %MMTIMER_GETCOUNTER - Gets the current value in the counter and places it
+ * in the address specified by @arg.
+ */
+static long uv_mmtimer_ioctl(struct file *file, unsigned int cmd,
+						unsigned long arg)
+{
+	int ret = 0;
+
+	switch (cmd) {
+	case MMTIMER_GETOFFSET:	/* offset of the counter */
+		/*
+		 * UV RTC register is on its own page
+		 */
+		if (PAGE_SIZE <= (1 << 16))
+			ret = ((UV_LOCAL_MMR_BASE | UVH_RTC) & (PAGE_SIZE-1))
+				/ 8;
+		else
+			ret = -ENOSYS;
+		break;
+
+	case MMTIMER_GETRES: /* resolution of the clock in 10^-15 s */
+		if (copy_to_user((unsigned long __user *)arg,
+				&uv_mmtimer_femtoperiod, sizeof(unsigned long)))
+			ret = -EFAULT;
+		break;
+
+	case MMTIMER_GETFREQ: /* frequency in Hz */
+		if (copy_to_user((unsigned long __user *)arg,
+				&sn_rtc_cycles_per_second,
+				sizeof(unsigned long)))
+			ret = -EFAULT;
+		break;
+
+	case MMTIMER_GETBITS: /* number of bits in the clock */
+		ret = hweight64(UVH_RTC_REAL_TIME_CLOCK_MASK);
+		break;
+
+	case MMTIMER_MMAPAVAIL: /* can we mmap the clock into userspace? */
+		ret = (PAGE_SIZE <= (1 << 16)) ? 1 : 0;
+		break;
+
+	case MMTIMER_GETCOUNTER:
+		if (copy_to_user((unsigned long __user *)arg,
+				(unsigned long *)uv_local_mmr_address(UVH_RTC),
+				sizeof(unsigned long)))
+			ret = -EFAULT;
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * uv_mmtimer_mmap - maps the clock's registers into userspace
+ * @file: file structure for the device
+ * @vma: VMA to map the registers into
+ *
+ * Calls remap_pfn_range() to map the clock's registers into
+ * the calling process' address space.
+ */
+static int uv_mmtimer_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	unsigned long uv_mmtimer_addr;
+
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	if (PAGE_SIZE > (1 << 16))
+		return -ENOSYS;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	uv_mmtimer_addr = UV_LOCAL_MMR_BASE | UVH_RTC;
+	uv_mmtimer_addr &= ~(PAGE_SIZE - 1);
+	uv_mmtimer_addr &= 0xfffffffffffffffUL;
+
+	if (remap_pfn_range(vma, vma->vm_start, uv_mmtimer_addr >> PAGE_SHIFT,
+					PAGE_SIZE, vma->vm_page_prot)) {
+		printk(KERN_ERR "remap_pfn_range failed in uv_mmtimer_mmap\n");
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
+static struct miscdevice uv_mmtimer_miscdev = {
+	MISC_DYNAMIC_MINOR,
+	UV_MMTIMER_NAME,
+	&uv_mmtimer_fops
+};
+
+
+/**
+ * uv_mmtimer_init - device initialization routine
+ *
+ * Does initial setup for the uv_mmtimer device.
+ */
+static int __init uv_mmtimer_init(void)
+{
+	if (!is_uv_system()) {
+		printk(KERN_ERR "%s: Hardware unsupported\n", UV_MMTIMER_NAME);
+		return -1;
+	}
+
+	/*
+	 * Sanity check the cycles/sec variable
+	 */
+	if (sn_rtc_cycles_per_second < 100000) {
+		printk(KERN_ERR "%s: unable to determine clock frequency\n",
+		       UV_MMTIMER_NAME);
+		return -1;
+	}
+
+	uv_mmtimer_femtoperiod = ((unsigned long)1E15 +
+				sn_rtc_cycles_per_second / 2) /
+				sn_rtc_cycles_per_second;
+
+	if (misc_register(&uv_mmtimer_miscdev)) {
+		printk(KERN_ERR "%s: failed to register device\n",
+		       UV_MMTIMER_NAME);
+		return -1;
+	}
+
+	printk(KERN_INFO "%s: v%s, %ld MHz\n", UV_MMTIMER_DESC,
+		UV_MMTIMER_VERSION,
+		sn_rtc_cycles_per_second/(unsigned long)1E6);
+
+	return 0;
+}
+
+module_init(uv_mmtimer_init);
diff --git a/drivers/dca/dca-core.c b/drivers/dca/dca-core.c
index 25b743abfb5..52e6bb70a49 100644
--- a/drivers/dca/dca-core.c
+++ b/drivers/dca/dca-core.c
@@ -28,7 +28,7 @@
 #include <linux/device.h>
 #include <linux/dca.h>
 
-#define DCA_VERSION "1.8"
+#define DCA_VERSION "1.12.1"
 
 MODULE_VERSION(DCA_VERSION);
 MODULE_LICENSE("GPL");
@@ -36,20 +36,92 @@ MODULE_AUTHOR("Intel Corporation");
 
 static DEFINE_SPINLOCK(dca_lock);
 
-static LIST_HEAD(dca_providers);
+static LIST_HEAD(dca_domains);
 
-static struct dca_provider *dca_find_provider_by_dev(struct device *dev)
+static struct pci_bus *dca_pci_rc_from_dev(struct device *dev)
 {
-	struct dca_provider *dca, *ret = NULL;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_bus *bus = pdev->bus;
 
-	list_for_each_entry(dca, &dca_providers, node) {
-		if ((!dev) || (dca->ops->dev_managed(dca, dev))) {
-			ret = dca;
-			break;
-		}
+	while (bus->parent)
+		bus = bus->parent;
+
+	return bus;
+}
+
+static struct dca_domain *dca_allocate_domain(struct pci_bus *rc)
+{
+	struct dca_domain *domain;
+
+	domain = kzalloc(sizeof(*domain), GFP_NOWAIT);
+	if (!domain)
+		return NULL;
+
+	INIT_LIST_HEAD(&domain->dca_providers);
+	domain->pci_rc = rc;
+
+	return domain;
+}
+
+static void dca_free_domain(struct dca_domain *domain)
+{
+	list_del(&domain->node);
+	kfree(domain);
+}
+
+static struct dca_domain *dca_find_domain(struct pci_bus *rc)
+{
+	struct dca_domain *domain;
+
+	list_for_each_entry(domain, &dca_domains, node)
+		if (domain->pci_rc == rc)
+			return domain;
+
+	return NULL;
+}
+
+static struct dca_domain *dca_get_domain(struct device *dev)
+{
+	struct pci_bus *rc;
+	struct dca_domain *domain;
+
+	rc = dca_pci_rc_from_dev(dev);
+	domain = dca_find_domain(rc);
+
+	if (!domain) {
+		domain = dca_allocate_domain(rc);
+		if (domain)
+			list_add(&domain->node, &dca_domains);
+	}
+
+	return domain;
+}
+
+static struct dca_provider *dca_find_provider_by_dev(struct device *dev)
+{
+	struct dca_provider *dca;
+	struct pci_bus *rc;
+	struct dca_domain *domain;
+
+	if (dev) {
+		rc = dca_pci_rc_from_dev(dev);
+		domain = dca_find_domain(rc);
+		if (!domain)
+			return NULL;
+	} else {
+		if (!list_empty(&dca_domains))
+			domain = list_first_entry(&dca_domains,
+						  struct dca_domain,
+						  node);
+		else
+			return NULL;
 	}
 
-	return ret;
+	list_for_each_entry(dca, &domain->dca_providers, node)
+		if ((!dev) || (dca->ops->dev_managed(dca, dev)))
+			return dca;
+
+	return NULL;
 }
 
 /**
@@ -61,6 +133,8 @@ int dca_add_requester(struct device *dev)
 	struct dca_provider *dca;
 	int err, slot = -ENODEV;
 	unsigned long flags;
+	struct pci_bus *pci_rc;
+	struct dca_domain *domain;
 
 	if (!dev)
 		return -EFAULT;
@@ -74,7 +148,14 @@ int dca_add_requester(struct device *dev)
 		return -EEXIST;
 	}
 
-	list_for_each_entry(dca, &dca_providers, node) {
+	pci_rc = dca_pci_rc_from_dev(dev);
+	domain = dca_find_domain(pci_rc);
+	if (!domain) {
+		spin_unlock_irqrestore(&dca_lock, flags);
+		return -ENODEV;
+	}
+
+	list_for_each_entry(dca, &domain->dca_providers, node) {
 		slot = dca->ops->add_requester(dca, dev);
 		if (slot >= 0)
 			break;
@@ -222,13 +303,19 @@ int register_dca_provider(struct dca_provider *dca, struct device *dev)
 {
 	int err;
 	unsigned long flags;
+	struct dca_domain *domain;
 
 	err = dca_sysfs_add_provider(dca, dev);
 	if (err)
 		return err;
 
 	spin_lock_irqsave(&dca_lock, flags);
-	list_add(&dca->node, &dca_providers);
+	domain = dca_get_domain(dev);
+	if (!domain) {
+		spin_unlock_irqrestore(&dca_lock, flags);
+		return -ENODEV;
+	}
+	list_add(&dca->node, &domain->dca_providers);
 	spin_unlock_irqrestore(&dca_lock, flags);
 
 	blocking_notifier_call_chain(&dca_provider_chain,
@@ -241,15 +328,24 @@ EXPORT_SYMBOL_GPL(register_dca_provider);
  * unregister_dca_provider - remove a dca provider
  * @dca - struct created by alloc_dca_provider()
  */
-void unregister_dca_provider(struct dca_provider *dca)
+void unregister_dca_provider(struct dca_provider *dca, struct device *dev)
 {
 	unsigned long flags;
+	struct pci_bus *pci_rc;
+	struct dca_domain *domain;
 
 	blocking_notifier_call_chain(&dca_provider_chain,
 				     DCA_PROVIDER_REMOVE, NULL);
 
 	spin_lock_irqsave(&dca_lock, flags);
+
 	list_del(&dca->node);
+
+	pci_rc = dca_pci_rc_from_dev(dev);
+	domain = dca_find_domain(pci_rc);
+	if (list_empty(&domain->dca_providers))
+		dca_free_domain(domain);
+
 	spin_unlock_irqrestore(&dca_lock, flags);
 
 	dca_sysfs_remove_provider(dca);
@@ -276,7 +372,7 @@ EXPORT_SYMBOL_GPL(dca_unregister_notify);
 
 static int __init dca_init(void)
 {
-	printk(KERN_ERR "dca service started, version %s\n", DCA_VERSION);
+	pr_info("dca service started, version %s\n", DCA_VERSION);
 	return dca_sysfs_init();
 }
 
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 81e1020fb51..5903a88351b 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -17,11 +17,15 @@ if DMADEVICES
 
 comment "DMA Devices"
 
+config ASYNC_TX_DISABLE_CHANNEL_SWITCH
+	bool
+
 config INTEL_IOATDMA
 	tristate "Intel I/OAT DMA support"
 	depends on PCI && X86
 	select DMA_ENGINE
 	select DCA
+	select ASYNC_TX_DISABLE_CHANNEL_SWITCH
 	help
 	  Enable support for the Intel(R) I/OAT DMA engine present
 	  in recent Intel Xeon chipsets.
@@ -97,6 +101,14 @@ config TXX9_DMAC
 	  Support the TXx9 SoC internal DMA controller.  This can be
 	  integrated in chips such as the Toshiba TX4927/38/39.
 
+config SH_DMAE
+	tristate "Renesas SuperH DMAC support"
+	depends on SUPERH && SH_DMA
+	depends on !SH_DMA_API
+	select DMA_ENGINE
+	help
+	  Enable support for the Renesas SuperH DMA controllers.
+
 config DMA_ENGINE
 	bool
 
@@ -116,7 +128,7 @@ config NET_DMA
 
 config ASYNC_TX_DMA
 	bool "Async_tx: Offload support for the async_tx api"
-	depends on DMA_ENGINE && !HIGHMEM64G
+	depends on DMA_ENGINE
 	help
 	  This allows the async_tx api to take advantage of offload engines for
 	  memcpy, memset, xor, and raid6 p+q operations.  If your platform has
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 40e1e008357..eca71ba78ae 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,8 +1,7 @@
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_DMATEST) += dmatest.o
-obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
-ioatdma-objs := ioat.o ioat_dma.o ioat_dca.o
+obj-$(CONFIG_INTEL_IOATDMA) += ioat/
 obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o
 obj-$(CONFIG_FSL_DMA) += fsldma.o
 obj-$(CONFIG_MV_XOR) += mv_xor.o
@@ -10,3 +9,4 @@ obj-$(CONFIG_DW_DMAC) += dw_dmac.o
 obj-$(CONFIG_AT_HDMAC) += at_hdmac.o
 obj-$(CONFIG_MX3_IPU) += ipu/
 obj-$(CONFIG_TXX9_DMAC) += txx9dmac.o
+obj-$(CONFIG_SH_DMAE) += shdma.o
diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index c8522e6f1ad..7585c4164bd 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -87,6 +87,7 @@ static struct at_desc *atc_alloc_descriptor(struct dma_chan *chan,
 	desc = dma_pool_alloc(atdma->dma_desc_pool, gfp_flags, &phys);
 	if (desc) {
 		memset(desc, 0, sizeof(struct at_desc));
+		INIT_LIST_HEAD(&desc->tx_list);
 		dma_async_tx_descriptor_init(&desc->txd, chan);
 		/* txd.flags will be overwritten in prep functions */
 		desc->txd.flags = DMA_CTRL_ACK;
@@ -150,11 +151,11 @@ static void atc_desc_put(struct at_dma_chan *atchan, struct at_desc *desc)
 		struct at_desc *child;
 
 		spin_lock_bh(&atchan->lock);
-		list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+		list_for_each_entry(child, &desc->tx_list, desc_node)
 			dev_vdbg(chan2dev(&atchan->chan_common),
 					"moving child desc %p to freelist\n",
 					child);
-		list_splice_init(&desc->txd.tx_list, &atchan->free_list);
+		list_splice_init(&desc->tx_list, &atchan->free_list);
 		dev_vdbg(chan2dev(&atchan->chan_common),
 			 "moving desc %p to freelist\n", desc);
 		list_add(&desc->desc_node, &atchan->free_list);
@@ -247,30 +248,33 @@ atc_chain_complete(struct at_dma_chan *atchan, struct at_desc *desc)
 	param = txd->callback_param;
 
 	/* move children to free_list */
-	list_splice_init(&txd->tx_list, &atchan->free_list);
+	list_splice_init(&desc->tx_list, &atchan->free_list);
 	/* move myself to free_list */
 	list_move(&desc->desc_node, &atchan->free_list);
 
 	/* unmap dma addresses */
-	if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-		if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
-			dma_unmap_single(chan2parent(&atchan->chan_common),
-					desc->lli.daddr,
-					desc->len, DMA_FROM_DEVICE);
-		else
-			dma_unmap_page(chan2parent(&atchan->chan_common),
-					desc->lli.daddr,
-					desc->len, DMA_FROM_DEVICE);
-	}
-	if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-		if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
-			dma_unmap_single(chan2parent(&atchan->chan_common),
-					desc->lli.saddr,
-					desc->len, DMA_TO_DEVICE);
-		else
-			dma_unmap_page(chan2parent(&atchan->chan_common),
-					desc->lli.saddr,
-					desc->len, DMA_TO_DEVICE);
+	if (!atchan->chan_common.private) {
+		struct device *parent = chan2parent(&atchan->chan_common);
+		if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
+			if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
+				dma_unmap_single(parent,
+						desc->lli.daddr,
+						desc->len, DMA_FROM_DEVICE);
+			else
+				dma_unmap_page(parent,
+						desc->lli.daddr,
+						desc->len, DMA_FROM_DEVICE);
+		}
+		if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+			if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
+				dma_unmap_single(parent,
+						desc->lli.saddr,
+						desc->len, DMA_TO_DEVICE);
+			else
+				dma_unmap_page(parent,
+						desc->lli.saddr,
+						desc->len, DMA_TO_DEVICE);
+		}
 	}
 
 	/*
@@ -334,7 +338,7 @@ static void atc_cleanup_descriptors(struct at_dma_chan *atchan)
 			/* This one is currently in progress */
 			return;
 
-		list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+		list_for_each_entry(child, &desc->tx_list, desc_node)
 			if (!(child->lli.ctrla & ATC_DONE))
 				/* Currently in progress */
 				return;
@@ -407,7 +411,7 @@ static void atc_handle_error(struct at_dma_chan *atchan)
 	dev_crit(chan2dev(&atchan->chan_common),
 			"  cookie: %d\n", bad_desc->txd.cookie);
 	atc_dump_lli(atchan, &bad_desc->lli);
-	list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node)
+	list_for_each_entry(child, &bad_desc->tx_list, desc_node)
 		atc_dump_lli(atchan, &child->lli);
 
 	/* Pretend the descriptor completed successfully */
@@ -587,7 +591,7 @@ atc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 			prev->lli.dscr = desc->txd.phys;
 			/* insert the link descriptor to the LD ring */
 			list_add_tail(&desc->desc_node,
-					&first->txd.tx_list);
+					&first->tx_list);
 		}
 		prev = desc;
 	}
@@ -646,8 +650,6 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 
 	reg_width = atslave->reg_width;
 
-	sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction);
-
 	ctrla = ATC_DEFAULT_CTRLA | atslave->ctrla;
 	ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN;
 
@@ -687,7 +689,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 				prev->lli.dscr = desc->txd.phys;
 				/* insert the link descriptor to the LD ring */
 				list_add_tail(&desc->desc_node,
-						&first->txd.tx_list);
+						&first->tx_list);
 			}
 			prev = desc;
 			total_len += len;
@@ -729,7 +731,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 				prev->lli.dscr = desc->txd.phys;
 				/* insert the link descriptor to the LD ring */
 				list_add_tail(&desc->desc_node,
-						&first->txd.tx_list);
+						&first->tx_list);
 			}
 			prev = desc;
 			total_len += len;
diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h
index 4c972afc49e..495457e3dc4 100644
--- a/drivers/dma/at_hdmac_regs.h
+++ b/drivers/dma/at_hdmac_regs.h
@@ -165,6 +165,7 @@ struct at_desc {
 	struct at_lli			lli;
 
 	/* THEN values for driver housekeeping */
+	struct list_head		tx_list;
 	struct dma_async_tx_descriptor	txd;
 	struct list_head		desc_node;
 	size_t				len;
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 5a87384ea4f..bd0b248de2c 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -608,6 +608,40 @@ void dmaengine_put(void)
 }
 EXPORT_SYMBOL(dmaengine_put);
 
+static bool device_has_all_tx_types(struct dma_device *device)
+{
+	/* A device that satisfies this test has channels that will never cause
+	 * an async_tx channel switch event as all possible operation types can
+	 * be handled.
+	 */
+	#ifdef CONFIG_ASYNC_TX_DMA
+	if (!dma_has_cap(DMA_INTERRUPT, device->cap_mask))
+		return false;
+	#endif
+
+	#if defined(CONFIG_ASYNC_MEMCPY) || defined(CONFIG_ASYNC_MEMCPY_MODULE)
+	if (!dma_has_cap(DMA_MEMCPY, device->cap_mask))
+		return false;
+	#endif
+
+	#if defined(CONFIG_ASYNC_MEMSET) || defined(CONFIG_ASYNC_MEMSET_MODULE)
+	if (!dma_has_cap(DMA_MEMSET, device->cap_mask))
+		return false;
+	#endif
+
+	#if defined(CONFIG_ASYNC_XOR) || defined(CONFIG_ASYNC_XOR_MODULE)
+	if (!dma_has_cap(DMA_XOR, device->cap_mask))
+		return false;
+	#endif
+
+	#if defined(CONFIG_ASYNC_PQ) || defined(CONFIG_ASYNC_PQ_MODULE)
+	if (!dma_has_cap(DMA_PQ, device->cap_mask))
+		return false;
+	#endif
+
+	return true;
+}
+
 static int get_dma_id(struct dma_device *device)
 {
 	int rc;
@@ -644,8 +678,12 @@ int dma_async_device_register(struct dma_device *device)
 		!device->device_prep_dma_memcpy);
 	BUG_ON(dma_has_cap(DMA_XOR, device->cap_mask) &&
 		!device->device_prep_dma_xor);
-	BUG_ON(dma_has_cap(DMA_ZERO_SUM, device->cap_mask) &&
-		!device->device_prep_dma_zero_sum);
+	BUG_ON(dma_has_cap(DMA_XOR_VAL, device->cap_mask) &&
+		!device->device_prep_dma_xor_val);
+	BUG_ON(dma_has_cap(DMA_PQ, device->cap_mask) &&
+		!device->device_prep_dma_pq);
+	BUG_ON(dma_has_cap(DMA_PQ_VAL, device->cap_mask) &&
+		!device->device_prep_dma_pq_val);
 	BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) &&
 		!device->device_prep_dma_memset);
 	BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) &&
@@ -661,6 +699,12 @@ int dma_async_device_register(struct dma_device *device)
 	BUG_ON(!device->device_issue_pending);
 	BUG_ON(!device->dev);
 
+	/* note: this only matters in the
+	 * CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH=y case
+	 */
+	if (device_has_all_tx_types(device))
+		dma_cap_set(DMA_ASYNC_TX, device->cap_mask);
+
 	idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL);
 	if (!idr_ref)
 		return -ENOMEM;
@@ -933,55 +977,29 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
 {
 	tx->chan = chan;
 	spin_lock_init(&tx->lock);
-	INIT_LIST_HEAD(&tx->tx_list);
 }
 EXPORT_SYMBOL(dma_async_tx_descriptor_init);
 
 /* dma_wait_for_async_tx - spin wait for a transaction to complete
  * @tx: in-flight transaction to wait on
- *
- * This routine assumes that tx was obtained from a call to async_memcpy,
- * async_xor, async_memset, etc which ensures that tx is "in-flight" (prepped
- * and submitted).  Walking the parent chain is only meant to cover for DMA
- * drivers that do not implement the DMA_INTERRUPT capability and may race with
- * the driver's descriptor cleanup routine.
  */
 enum dma_status
 dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
 {
-	enum dma_status status;
-	struct dma_async_tx_descriptor *iter;
-	struct dma_async_tx_descriptor *parent;
+	unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000);
 
 	if (!tx)
 		return DMA_SUCCESS;
 
-	WARN_ONCE(tx->parent, "%s: speculatively walking dependency chain for"
-		  " %s\n", __func__, dma_chan_name(tx->chan));
-
-	/* poll through the dependency chain, return when tx is complete */
-	do {
-		iter = tx;
-
-		/* find the root of the unsubmitted dependency chain */
-		do {
-			parent = iter->parent;
-			if (!parent)
-				break;
-			else
-				iter = parent;
-		} while (parent);
-
-		/* there is a small window for ->parent == NULL and
-		 * ->cookie == -EBUSY
-		 */
-		while (iter->cookie == -EBUSY)
-			cpu_relax();
-
-		status = dma_sync_wait(iter->chan, iter->cookie);
-	} while (status == DMA_IN_PROGRESS || (iter != tx));
-
-	return status;
+	while (tx->cookie == -EBUSY) {
+		if (time_after_eq(jiffies, dma_sync_wait_timeout)) {
+			pr_err("%s timeout waiting for descriptor submission\n",
+				__func__);
+			return DMA_ERROR;
+		}
+		cpu_relax();
+	}
+	return dma_sync_wait(tx->chan, tx->cookie);
 }
 EXPORT_SYMBOL_GPL(dma_wait_for_async_tx);
 
diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c
index d93017fc787..a32a4cf7b1e 100644
--- a/drivers/dma/dmatest.c
+++ b/drivers/dma/dmatest.c
@@ -48,6 +48,11 @@ module_param(xor_sources, uint, S_IRUGO);
 MODULE_PARM_DESC(xor_sources,
 		"Number of xor source buffers (default: 3)");
 
+static unsigned int pq_sources = 3;
+module_param(pq_sources, uint, S_IRUGO);
+MODULE_PARM_DESC(pq_sources,
+		"Number of p+q source buffers (default: 3)");
+
 /*
  * Initialization patterns. All bytes in the source buffer has bit 7
  * set, all bytes in the destination buffer has bit 7 cleared.
@@ -232,6 +237,7 @@ static int dmatest_func(void *data)
 	dma_cookie_t		cookie;
 	enum dma_status		status;
 	enum dma_ctrl_flags 	flags;
+	u8			pq_coefs[pq_sources];
 	int			ret;
 	int			src_cnt;
 	int			dst_cnt;
@@ -248,6 +254,11 @@ static int dmatest_func(void *data)
 	else if (thread->type == DMA_XOR) {
 		src_cnt = xor_sources | 1; /* force odd to ensure dst = src */
 		dst_cnt = 1;
+	} else if (thread->type == DMA_PQ) {
+		src_cnt = pq_sources | 1; /* force odd to ensure dst = src */
+		dst_cnt = 2;
+		for (i = 0; i < pq_sources; i++)
+			pq_coefs[i] = 1;
 	} else
 		goto err_srcs;
 
@@ -283,6 +294,7 @@ static int dmatest_func(void *data)
 		dma_addr_t dma_dsts[dst_cnt];
 		struct completion cmp;
 		unsigned long tmo = msecs_to_jiffies(3000);
+		u8 align = 0;
 
 		total_tests++;
 
@@ -290,6 +302,18 @@ static int dmatest_func(void *data)
 		src_off = dmatest_random() % (test_buf_size - len + 1);
 		dst_off = dmatest_random() % (test_buf_size - len + 1);
 
+		/* honor alignment restrictions */
+		if (thread->type == DMA_MEMCPY)
+			align = dev->copy_align;
+		else if (thread->type == DMA_XOR)
+			align = dev->xor_align;
+		else if (thread->type == DMA_PQ)
+			align = dev->pq_align;
+
+		len = (len >> align) << align;
+		src_off = (src_off >> align) << align;
+		dst_off = (dst_off >> align) << align;
+
 		dmatest_init_srcs(thread->srcs, src_off, len);
 		dmatest_init_dsts(thread->dsts, dst_off, len);
 
@@ -306,6 +330,7 @@ static int dmatest_func(void *data)
 						     DMA_BIDIRECTIONAL);
 		}
 
+
 		if (thread->type == DMA_MEMCPY)
 			tx = dev->device_prep_dma_memcpy(chan,
 							 dma_dsts[0] + dst_off,
@@ -316,6 +341,15 @@ static int dmatest_func(void *data)
 						      dma_dsts[0] + dst_off,
 						      dma_srcs, xor_sources,
 						      len, flags);
+		else if (thread->type == DMA_PQ) {
+			dma_addr_t dma_pq[dst_cnt];
+
+			for (i = 0; i < dst_cnt; i++)
+				dma_pq[i] = dma_dsts[i] + dst_off;
+			tx = dev->device_prep_dma_pq(chan, dma_pq, dma_srcs,
+						     pq_sources, pq_coefs,
+						     len, flags);
+		}
 
 		if (!tx) {
 			for (i = 0; i < src_cnt; i++)
@@ -459,6 +493,8 @@ static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_ty
 		op = "copy";
 	else if (type == DMA_XOR)
 		op = "xor";
+	else if (type == DMA_PQ)
+		op = "pq";
 	else
 		return -EINVAL;
 
@@ -514,6 +550,10 @@ static int dmatest_add_channel(struct dma_chan *chan)
 		cnt = dmatest_add_threads(dtc, DMA_XOR);
 		thread_count += cnt > 0 ? cnt : 0;
 	}
+	if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) {
+		cnt = dmatest_add_threads(dtc, DMA_PQ);
+		thread_count += cnt > 0 ?: 0;
+	}
 
 	pr_info("dmatest: Started %u threads using %s\n",
 		thread_count, dma_chan_name(chan));
diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c
index 933c143b6a7..2eea823516a 100644
--- a/drivers/dma/dw_dmac.c
+++ b/drivers/dma/dw_dmac.c
@@ -116,7 +116,7 @@ static void dwc_sync_desc_for_cpu(struct dw_dma_chan *dwc, struct dw_desc *desc)
 {
 	struct dw_desc	*child;
 
-	list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+	list_for_each_entry(child, &desc->tx_list, desc_node)
 		dma_sync_single_for_cpu(chan2parent(&dwc->chan),
 				child->txd.phys, sizeof(child->lli),
 				DMA_TO_DEVICE);
@@ -137,11 +137,11 @@ static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc)
 		dwc_sync_desc_for_cpu(dwc, desc);
 
 		spin_lock_bh(&dwc->lock);
-		list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+		list_for_each_entry(child, &desc->tx_list, desc_node)
 			dev_vdbg(chan2dev(&dwc->chan),
 					"moving child desc %p to freelist\n",
 					child);
-		list_splice_init(&desc->txd.tx_list, &dwc->free_list);
+		list_splice_init(&desc->tx_list, &dwc->free_list);
 		dev_vdbg(chan2dev(&dwc->chan), "moving desc %p to freelist\n", desc);
 		list_add(&desc->desc_node, &dwc->free_list);
 		spin_unlock_bh(&dwc->lock);
@@ -209,19 +209,28 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc)
 	param = txd->callback_param;
 
 	dwc_sync_desc_for_cpu(dwc, desc);
-	list_splice_init(&txd->tx_list, &dwc->free_list);
+	list_splice_init(&desc->tx_list, &dwc->free_list);
 	list_move(&desc->desc_node, &dwc->free_list);
 
-	/*
-	 * We use dma_unmap_page() regardless of how the buffers were
-	 * mapped before they were submitted...
-	 */
-	if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP))
-		dma_unmap_page(chan2parent(&dwc->chan), desc->lli.dar,
-			       desc->len, DMA_FROM_DEVICE);
-	if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP))
-		dma_unmap_page(chan2parent(&dwc->chan), desc->lli.sar,
-			       desc->len, DMA_TO_DEVICE);
+	if (!dwc->chan.private) {
+		struct device *parent = chan2parent(&dwc->chan);
+		if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
+			if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
+				dma_unmap_single(parent, desc->lli.dar,
+						desc->len, DMA_FROM_DEVICE);
+			else
+				dma_unmap_page(parent, desc->lli.dar,
+						desc->len, DMA_FROM_DEVICE);
+		}
+		if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+			if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
+				dma_unmap_single(parent, desc->lli.sar,
+						desc->len, DMA_TO_DEVICE);
+			else
+				dma_unmap_page(parent, desc->lli.sar,
+						desc->len, DMA_TO_DEVICE);
+		}
+	}
 
 	/*
 	 * The API requires that no submissions are done from a
@@ -289,7 +298,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
 			/* This one is currently in progress */
 			return;
 
-		list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+		list_for_each_entry(child, &desc->tx_list, desc_node)
 			if (child->lli.llp == llp)
 				/* Currently in progress */
 				return;
@@ -356,7 +365,7 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc)
 	dev_printk(KERN_CRIT, chan2dev(&dwc->chan),
 			"  cookie: %d\n", bad_desc->txd.cookie);
 	dwc_dump_lli(dwc, &bad_desc->lli);
-	list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node)
+	list_for_each_entry(child, &bad_desc->tx_list, desc_node)
 		dwc_dump_lli(dwc, &child->lli);
 
 	/* Pretend the descriptor completed successfully */
@@ -608,7 +617,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 					prev->txd.phys, sizeof(prev->lli),
 					DMA_TO_DEVICE);
 			list_add_tail(&desc->desc_node,
-					&first->txd.tx_list);
+					&first->tx_list);
 		}
 		prev = desc;
 	}
@@ -658,8 +667,6 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 	reg_width = dws->reg_width;
 	prev = first = NULL;
 
-	sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction);
-
 	switch (direction) {
 	case DMA_TO_DEVICE:
 		ctllo = (DWC_DEFAULT_CTLLO
@@ -700,7 +707,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 						sizeof(prev->lli),
 						DMA_TO_DEVICE);
 				list_add_tail(&desc->desc_node,
-						&first->txd.tx_list);
+						&first->tx_list);
 			}
 			prev = desc;
 			total_len += len;
@@ -746,7 +753,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 						sizeof(prev->lli),
 						DMA_TO_DEVICE);
 				list_add_tail(&desc->desc_node,
-						&first->txd.tx_list);
+						&first->tx_list);
 			}
 			prev = desc;
 			total_len += len;
@@ -902,6 +909,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 			break;
 		}
 
+		INIT_LIST_HEAD(&desc->tx_list);
 		dma_async_tx_descriptor_init(&desc->txd, chan);
 		desc->txd.tx_submit = dwc_tx_submit;
 		desc->txd.flags = DMA_CTRL_ACK;
diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h
index 13a58076703..d9a939f67f4 100644
--- a/drivers/dma/dw_dmac_regs.h
+++ b/drivers/dma/dw_dmac_regs.h
@@ -217,6 +217,7 @@ struct dw_desc {
 
 	/* THEN values for driver housekeeping */
 	struct list_head		desc_node;
+	struct list_head		tx_list;
 	struct dma_async_tx_descriptor	txd;
 	size_t				len;
 };
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index ef87a898414..296f9e747fa 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -34,6 +34,7 @@
 #include <linux/dmapool.h>
 #include <linux/of_platform.h>
 
+#include <asm/fsldma.h>
 #include "fsldma.h"
 
 static void dma_init(struct fsl_dma_chan *fsl_chan)
@@ -280,28 +281,40 @@ static void fsl_chan_set_dest_loop_size(struct fsl_dma_chan *fsl_chan, int size)
 }
 
 /**
- * fsl_chan_toggle_ext_pause - Toggle channel external pause status
+ * fsl_chan_set_request_count - Set DMA Request Count for external control
  * @fsl_chan : Freescale DMA channel
- * @size     : Pause control size, 0 for disable external pause control.
- *             The maximum is 1024.
+ * @size     : Number of bytes to transfer in a single request
+ *
+ * The Freescale DMA channel can be controlled by the external signal DREQ#.
+ * The DMA request count is how many bytes are allowed to transfer before
+ * pausing the channel, after which a new assertion of DREQ# resumes channel
+ * operation.
  *
- * The Freescale DMA channel can be controlled by the external
- * signal DREQ#. The pause control size is how many bytes are allowed
- * to transfer before pausing the channel, after which a new assertion
- * of DREQ# resumes channel operation.
+ * A size of 0 disables external pause control. The maximum size is 1024.
  */
-static void fsl_chan_toggle_ext_pause(struct fsl_dma_chan *fsl_chan, int size)
+static void fsl_chan_set_request_count(struct fsl_dma_chan *fsl_chan, int size)
 {
-	if (size > 1024)
-		return;
+	BUG_ON(size > 1024);
+	DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr,
+		DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32)
+			| ((__ilog2(size) << 24) & 0x0f000000),
+		32);
+}
 
-	if (size) {
-		DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr,
-			DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32)
-				| ((__ilog2(size) << 24) & 0x0f000000),
-			32);
+/**
+ * fsl_chan_toggle_ext_pause - Toggle channel external pause status
+ * @fsl_chan : Freescale DMA channel
+ * @enable   : 0 is disabled, 1 is enabled.
+ *
+ * The Freescale DMA channel can be controlled by the external signal DREQ#.
+ * The DMA Request Count feature should be used in addition to this feature
+ * to set the number of bytes to transfer before pausing the channel.
+ */
+static void fsl_chan_toggle_ext_pause(struct fsl_dma_chan *fsl_chan, int enable)
+{
+	if (enable)
 		fsl_chan->feature |= FSL_DMA_CHAN_PAUSE_EXT;
-	} else
+	else
 		fsl_chan->feature &= ~FSL_DMA_CHAN_PAUSE_EXT;
 }
 
@@ -326,7 +339,8 @@ static void fsl_chan_toggle_ext_start(struct fsl_dma_chan *fsl_chan, int enable)
 static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 {
 	struct fsl_dma_chan *fsl_chan = to_fsl_chan(tx->chan);
-	struct fsl_desc_sw *desc;
+	struct fsl_desc_sw *desc = tx_to_fsl_desc(tx);
+	struct fsl_desc_sw *child;
 	unsigned long flags;
 	dma_cookie_t cookie;
 
@@ -334,7 +348,7 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 	spin_lock_irqsave(&fsl_chan->desc_lock, flags);
 
 	cookie = fsl_chan->common.cookie;
-	list_for_each_entry(desc, &tx->tx_list, node) {
+	list_for_each_entry(child, &desc->tx_list, node) {
 		cookie++;
 		if (cookie < 0)
 			cookie = 1;
@@ -343,8 +357,8 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 	}
 
 	fsl_chan->common.cookie = cookie;
-	append_ld_queue(fsl_chan, tx_to_fsl_desc(tx));
-	list_splice_init(&tx->tx_list, fsl_chan->ld_queue.prev);
+	append_ld_queue(fsl_chan, desc);
+	list_splice_init(&desc->tx_list, fsl_chan->ld_queue.prev);
 
 	spin_unlock_irqrestore(&fsl_chan->desc_lock, flags);
 
@@ -366,6 +380,7 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor(
 	desc_sw = dma_pool_alloc(fsl_chan->desc_pool, GFP_ATOMIC, &pdesc);
 	if (desc_sw) {
 		memset(desc_sw, 0, sizeof(struct fsl_desc_sw));
+		INIT_LIST_HEAD(&desc_sw->tx_list);
 		dma_async_tx_descriptor_init(&desc_sw->async_tx,
 						&fsl_chan->common);
 		desc_sw->async_tx.tx_submit = fsl_dma_tx_submit;
@@ -455,7 +470,7 @@ fsl_dma_prep_interrupt(struct dma_chan *chan, unsigned long flags)
 	new->async_tx.flags = flags;
 
 	/* Insert the link descriptor to the LD ring */
-	list_add_tail(&new->node, &new->async_tx.tx_list);
+	list_add_tail(&new->node, &new->tx_list);
 
 	/* Set End-of-link to the last link descriptor of new list*/
 	set_ld_eol(fsl_chan, new);
@@ -513,7 +528,7 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy(
 		dma_dest += copy;
 
 		/* Insert the link descriptor to the LD ring */
-		list_add_tail(&new->node, &first->async_tx.tx_list);
+		list_add_tail(&new->node, &first->tx_list);
 	} while (len);
 
 	new->async_tx.flags = flags; /* client is in control of this ack */
@@ -528,7 +543,7 @@ fail:
 	if (!first)
 		return NULL;
 
-	list = &first->async_tx.tx_list;
+	list = &first->tx_list;
 	list_for_each_entry_safe_reverse(new, prev, list, node) {
 		list_del(&new->node);
 		dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys);
@@ -538,6 +553,229 @@ fail:
 }
 
 /**
+ * fsl_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction
+ * @chan: DMA channel
+ * @sgl: scatterlist to transfer to/from
+ * @sg_len: number of entries in @scatterlist
+ * @direction: DMA direction
+ * @flags: DMAEngine flags
+ *
+ * Prepare a set of descriptors for a DMA_SLAVE transaction. Following the
+ * DMA_SLAVE API, this gets the device-specific information from the
+ * chan->private variable.
+ */
+static struct dma_async_tx_descriptor *fsl_dma_prep_slave_sg(
+	struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len,
+	enum dma_data_direction direction, unsigned long flags)
+{
+	struct fsl_dma_chan *fsl_chan;
+	struct fsl_desc_sw *first = NULL, *prev = NULL, *new = NULL;
+	struct fsl_dma_slave *slave;
+	struct list_head *tx_list;
+	size_t copy;
+
+	int i;
+	struct scatterlist *sg;
+	size_t sg_used;
+	size_t hw_used;
+	struct fsl_dma_hw_addr *hw;
+	dma_addr_t dma_dst, dma_src;
+
+	if (!chan)
+		return NULL;
+
+	if (!chan->private)
+		return NULL;
+
+	fsl_chan = to_fsl_chan(chan);
+	slave = chan->private;
+
+	if (list_empty(&slave->addresses))
+		return NULL;
+
+	hw = list_first_entry(&slave->addresses, struct fsl_dma_hw_addr, entry);
+	hw_used = 0;
+
+	/*
+	 * Build the hardware transaction to copy from the scatterlist to
+	 * the hardware, or from the hardware to the scatterlist
+	 *
+	 * If you are copying from the hardware to the scatterlist and it
+	 * takes two hardware entries to fill an entire page, then both
+	 * hardware entries will be coalesced into the same page
+	 *
+	 * If you are copying from the scatterlist to the hardware and a
+	 * single page can fill two hardware entries, then the data will
+	 * be read out of the page into the first hardware entry, and so on
+	 */
+	for_each_sg(sgl, sg, sg_len, i) {
+		sg_used = 0;
+
+		/* Loop until the entire scatterlist entry is used */
+		while (sg_used < sg_dma_len(sg)) {
+
+			/*
+			 * If we've used up the current hardware address/length
+			 * pair, we need to load a new one
+			 *
+			 * This is done in a while loop so that descriptors with
+			 * length == 0 will be skipped
+			 */
+			while (hw_used >= hw->length) {
+
+				/*
+				 * If the current hardware entry is the last
+				 * entry in the list, we're finished
+				 */
+				if (list_is_last(&hw->entry, &slave->addresses))
+					goto finished;
+
+				/* Get the next hardware address/length pair */
+				hw = list_entry(hw->entry.next,
+						struct fsl_dma_hw_addr, entry);
+				hw_used = 0;
+			}
+
+			/* Allocate the link descriptor from DMA pool */
+			new = fsl_dma_alloc_descriptor(fsl_chan);
+			if (!new) {
+				dev_err(fsl_chan->dev, "No free memory for "
+						       "link descriptor\n");
+				goto fail;
+			}
+#ifdef FSL_DMA_LD_DEBUG
+			dev_dbg(fsl_chan->dev, "new link desc alloc %p\n", new);
+#endif
+
+			/*
+			 * Calculate the maximum number of bytes to transfer,
+			 * making sure it is less than the DMA controller limit
+			 */
+			copy = min_t(size_t, sg_dma_len(sg) - sg_used,
+					     hw->length - hw_used);
+			copy = min_t(size_t, copy, FSL_DMA_BCR_MAX_CNT);
+
+			/*
+			 * DMA_FROM_DEVICE
+			 * from the hardware to the scatterlist
+			 *
+			 * DMA_TO_DEVICE
+			 * from the scatterlist to the hardware
+			 */
+			if (direction == DMA_FROM_DEVICE) {
+				dma_src = hw->address + hw_used;
+				dma_dst = sg_dma_address(sg) + sg_used;
+			} else {
+				dma_src = sg_dma_address(sg) + sg_used;
+				dma_dst = hw->address + hw_used;
+			}
+
+			/* Fill in the descriptor */
+			set_desc_cnt(fsl_chan, &new->hw, copy);
+			set_desc_src(fsl_chan, &new->hw, dma_src);
+			set_desc_dest(fsl_chan, &new->hw, dma_dst);
+
+			/*
+			 * If this is not the first descriptor, chain the
+			 * current descriptor after the previous descriptor
+			 */
+			if (!first) {
+				first = new;
+			} else {
+				set_desc_next(fsl_chan, &prev->hw,
+					      new->async_tx.phys);
+			}
+
+			new->async_tx.cookie = 0;
+			async_tx_ack(&new->async_tx);
+
+			prev = new;
+			sg_used += copy;
+			hw_used += copy;
+
+			/* Insert the link descriptor into the LD ring */
+			list_add_tail(&new->node, &first->tx_list);
+		}
+	}
+
+finished:
+
+	/* All of the hardware address/length pairs had length == 0 */
+	if (!first || !new)
+		return NULL;
+
+	new->async_tx.flags = flags;
+	new->async_tx.cookie = -EBUSY;
+
+	/* Set End-of-link to the last link descriptor of new list */
+	set_ld_eol(fsl_chan, new);
+
+	/* Enable extra controller features */
+	if (fsl_chan->set_src_loop_size)
+		fsl_chan->set_src_loop_size(fsl_chan, slave->src_loop_size);
+
+	if (fsl_chan->set_dest_loop_size)
+		fsl_chan->set_dest_loop_size(fsl_chan, slave->dst_loop_size);
+
+	if (fsl_chan->toggle_ext_start)
+		fsl_chan->toggle_ext_start(fsl_chan, slave->external_start);
+
+	if (fsl_chan->toggle_ext_pause)
+		fsl_chan->toggle_ext_pause(fsl_chan, slave->external_pause);
+
+	if (fsl_chan->set_request_count)
+		fsl_chan->set_request_count(fsl_chan, slave->request_count);
+
+	return &first->async_tx;
+
+fail:
+	/* If first was not set, then we failed to allocate the very first
+	 * descriptor, and we're done */
+	if (!first)
+		return NULL;
+
+	/*
+	 * First is set, so all of the descriptors we allocated have been added
+	 * to first->tx_list, INCLUDING "first" itself. Therefore we
+	 * must traverse the list backwards freeing each descriptor in turn
+	 *
+	 * We're re-using variables for the loop, oh well
+	 */
+	tx_list = &first->tx_list;
+	list_for_each_entry_safe_reverse(new, prev, tx_list, node) {
+		list_del_init(&new->node);
+		dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys);
+	}
+
+	return NULL;
+}
+
+static void fsl_dma_device_terminate_all(struct dma_chan *chan)
+{
+	struct fsl_dma_chan *fsl_chan;
+	struct fsl_desc_sw *desc, *tmp;
+	unsigned long flags;
+
+	if (!chan)
+		return;
+
+	fsl_chan = to_fsl_chan(chan);
+
+	/* Halt the DMA engine */
+	dma_halt(fsl_chan);
+
+	spin_lock_irqsave(&fsl_chan->desc_lock, flags);
+
+	/* Remove and free all of the descriptors in the LD queue */
+	list_for_each_entry_safe(desc, tmp, &fsl_chan->ld_queue, node) {
+		list_del(&desc->node);
+		dma_pool_free(fsl_chan->desc_pool, desc, desc->async_tx.phys);
+	}
+
+	spin_unlock_irqrestore(&fsl_chan->desc_lock, flags);
+}
+
+/**
  * fsl_dma_update_completed_cookie - Update the completed cookie.
  * @fsl_chan : Freescale DMA channel
  */
@@ -883,6 +1121,7 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev,
 		new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start;
 		new_fsl_chan->set_src_loop_size = fsl_chan_set_src_loop_size;
 		new_fsl_chan->set_dest_loop_size = fsl_chan_set_dest_loop_size;
+		new_fsl_chan->set_request_count = fsl_chan_set_request_count;
 	}
 
 	spin_lock_init(&new_fsl_chan->desc_lock);
@@ -962,12 +1201,15 @@ static int __devinit of_fsl_dma_probe(struct of_device *dev,
 
 	dma_cap_set(DMA_MEMCPY, fdev->common.cap_mask);
 	dma_cap_set(DMA_INTERRUPT, fdev->common.cap_mask);
+	dma_cap_set(DMA_SLAVE, fdev->common.cap_mask);
 	fdev->common.device_alloc_chan_resources = fsl_dma_alloc_chan_resources;
 	fdev->common.device_free_chan_resources = fsl_dma_free_chan_resources;
 	fdev->common.device_prep_dma_interrupt = fsl_dma_prep_interrupt;
 	fdev->common.device_prep_dma_memcpy = fsl_dma_prep_memcpy;
 	fdev->common.device_is_tx_complete = fsl_dma_is_complete;
 	fdev->common.device_issue_pending = fsl_dma_memcpy_issue_pending;
+	fdev->common.device_prep_slave_sg = fsl_dma_prep_slave_sg;
+	fdev->common.device_terminate_all = fsl_dma_device_terminate_all;
 	fdev->common.dev = &dev->dev;
 
 	fdev->irq = irq_of_parse_and_map(dev->node, 0);
diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
index dc7f2686579..0df14cbb8ca 100644
--- a/drivers/dma/fsldma.h
+++ b/drivers/dma/fsldma.h
@@ -90,6 +90,7 @@ struct fsl_dma_ld_hw {
 struct fsl_desc_sw {
 	struct fsl_dma_ld_hw hw;
 	struct list_head node;
+	struct list_head tx_list;
 	struct dma_async_tx_descriptor async_tx;
 	struct list_head *ld;
 	void *priv;
@@ -143,10 +144,11 @@ struct fsl_dma_chan {
 	struct tasklet_struct tasklet;
 	u32 feature;
 
-	void (*toggle_ext_pause)(struct fsl_dma_chan *fsl_chan, int size);
+	void (*toggle_ext_pause)(struct fsl_dma_chan *fsl_chan, int enable);
 	void (*toggle_ext_start)(struct fsl_dma_chan *fsl_chan, int enable);
 	void (*set_src_loop_size)(struct fsl_dma_chan *fsl_chan, int size);
 	void (*set_dest_loop_size)(struct fsl_dma_chan *fsl_chan, int size);
+	void (*set_request_count)(struct fsl_dma_chan *fsl_chan, int size);
 };
 
 #define to_fsl_chan(chan) container_of(chan, struct fsl_dma_chan, common)
diff --git a/drivers/dma/ioat.c b/drivers/dma/ioat.c
deleted file mode 100644
index 2225bb6ba3d..00000000000
--- a/drivers/dma/ioat.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * Intel I/OAT DMA Linux driver
- * Copyright(c) 2007 - 2009 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- */
-
-/*
- * This driver supports an Intel I/OAT DMA engine, which does asynchronous
- * copy operations.
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include <linux/dca.h>
-#include "ioatdma.h"
-#include "ioatdma_registers.h"
-#include "ioatdma_hw.h"
-
-MODULE_VERSION(IOAT_DMA_VERSION);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Intel Corporation");
-
-static struct pci_device_id ioat_pci_tbl[] = {
-	/* I/OAT v1 platforms */
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB)  },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SCNB) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_UNISYS, PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) },
-
-	/* I/OAT v2 platforms */
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB) },
-
-	/* I/OAT v3 platforms */
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG0) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG1) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG2) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG3) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG4) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG5) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG6) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG7) },
-	{ 0, }
-};
-
-struct ioat_device {
-	struct pci_dev		*pdev;
-	void __iomem		*iobase;
-	struct ioatdma_device	*dma;
-	struct dca_provider	*dca;
-};
-
-static int __devinit ioat_probe(struct pci_dev *pdev,
-				const struct pci_device_id *id);
-static void __devexit ioat_remove(struct pci_dev *pdev);
-
-static int ioat_dca_enabled = 1;
-module_param(ioat_dca_enabled, int, 0644);
-MODULE_PARM_DESC(ioat_dca_enabled, "control support of dca service (default: 1)");
-
-static struct pci_driver ioat_pci_driver = {
-	.name		= "ioatdma",
-	.id_table	= ioat_pci_tbl,
-	.probe		= ioat_probe,
-	.remove		= __devexit_p(ioat_remove),
-};
-
-static int __devinit ioat_probe(struct pci_dev *pdev,
-				const struct pci_device_id *id)
-{
-	void __iomem *iobase;
-	struct ioat_device *device;
-	unsigned long mmio_start, mmio_len;
-	int err;
-
-	err = pci_enable_device(pdev);
-	if (err)
-		goto err_enable_device;
-
-	err = pci_request_regions(pdev, ioat_pci_driver.name);
-	if (err)
-		goto err_request_regions;
-
-	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (err)
-		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
-	if (err)
-		goto err_set_dma_mask;
-
-	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (err)
-		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
-	if (err)
-		goto err_set_dma_mask;
-
-	mmio_start = pci_resource_start(pdev, 0);
-	mmio_len = pci_resource_len(pdev, 0);
-	iobase = ioremap(mmio_start, mmio_len);
-	if (!iobase) {
-		err = -ENOMEM;
-		goto err_ioremap;
-	}
-
-	device = kzalloc(sizeof(*device), GFP_KERNEL);
-	if (!device) {
-		err = -ENOMEM;
-		goto err_kzalloc;
-	}
-	device->pdev = pdev;
-	pci_set_drvdata(pdev, device);
-	device->iobase = iobase;
-
-	pci_set_master(pdev);
-
-	switch (readb(iobase + IOAT_VER_OFFSET)) {
-	case IOAT_VER_1_2:
-		device->dma = ioat_dma_probe(pdev, iobase);
-		if (device->dma && ioat_dca_enabled)
-			device->dca = ioat_dca_init(pdev, iobase);
-		break;
-	case IOAT_VER_2_0:
-		device->dma = ioat_dma_probe(pdev, iobase);
-		if (device->dma && ioat_dca_enabled)
-			device->dca = ioat2_dca_init(pdev, iobase);
-		break;
-	case IOAT_VER_3_0:
-		device->dma = ioat_dma_probe(pdev, iobase);
-		if (device->dma && ioat_dca_enabled)
-			device->dca = ioat3_dca_init(pdev, iobase);
-		break;
-	default:
-		err = -ENODEV;
-		break;
-	}
-	if (!device->dma)
-		err = -ENODEV;
-
-	if (err)
-		goto err_version;
-
-	return 0;
-
-err_version:
-	kfree(device);
-err_kzalloc:
-	iounmap(iobase);
-err_ioremap:
-err_set_dma_mask:
-	pci_release_regions(pdev);
-	pci_disable_device(pdev);
-err_request_regions:
-err_enable_device:
-	return err;
-}
-
-static void __devexit ioat_remove(struct pci_dev *pdev)
-{
-	struct ioat_device *device = pci_get_drvdata(pdev);
-
-	dev_err(&pdev->dev, "Removing dma and dca services\n");
-	if (device->dca) {
-		unregister_dca_provider(device->dca);
-		free_dca_provider(device->dca);
-		device->dca = NULL;
-	}
-
-	if (device->dma) {
-		ioat_dma_remove(device->dma);
-		device->dma = NULL;
-	}
-
-	kfree(device);
-}
-
-static int __init ioat_init_module(void)
-{
-	return pci_register_driver(&ioat_pci_driver);
-}
-module_init(ioat_init_module);
-
-static void __exit ioat_exit_module(void)
-{
-	pci_unregister_driver(&ioat_pci_driver);
-}
-module_exit(ioat_exit_module);
diff --git a/drivers/dma/ioat/Makefile b/drivers/dma/ioat/Makefile
new file mode 100644
index 00000000000..8997d3fb905
--- /dev/null
+++ b/drivers/dma/ioat/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
+ioatdma-objs := pci.o dma.o dma_v2.o dma_v3.o dca.o
diff --git a/drivers/dma/ioat_dca.c b/drivers/dma/ioat/dca.c
index c012a1e1504..69d02615c4d 100644
--- a/drivers/dma/ioat_dca.c
+++ b/drivers/dma/ioat/dca.c
@@ -33,8 +33,8 @@
 #define cpu_physical_id(cpu) (cpuid_ebx(1) >> 24)
 #endif
 
-#include "ioatdma.h"
-#include "ioatdma_registers.h"
+#include "dma.h"
+#include "registers.h"
 
 /*
  * Bit 7 of a tag map entry is the "valid" bit, if it is set then bits 0:6
@@ -242,7 +242,8 @@ static struct dca_ops ioat_dca_ops = {
 };
 
 
-struct dca_provider *ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase)
+struct dca_provider * __devinit
+ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase)
 {
 	struct dca_provider *dca;
 	struct ioat_dca_priv *ioatdca;
@@ -407,7 +408,8 @@ static int ioat2_dca_count_dca_slots(void __iomem *iobase, u16 dca_offset)
 	return slots;
 }
 
-struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase)
+struct dca_provider * __devinit
+ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase)
 {
 	struct dca_provider *dca;
 	struct ioat_dca_priv *ioatdca;
@@ -602,7 +604,8 @@ static int ioat3_dca_count_dca_slots(void *iobase, u16 dca_offset)
 	return slots;
 }
 
-struct dca_provider *ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase)
+struct dca_provider * __devinit
+ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase)
 {
 	struct dca_provider *dca;
 	struct ioat_dca_priv *ioatdca;
diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c
new file mode 100644
index 00000000000..c524d36d3c2
--- /dev/null
+++ b/drivers/dma/ioat/dma.c
@@ -0,0 +1,1238 @@
+/*
+ * Intel I/OAT DMA Linux driver
+ * Copyright(c) 2004 - 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ */
+
+/*
+ * This driver supports an Intel I/OAT DMA engine, which does asynchronous
+ * copy operations.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/workqueue.h>
+#include <linux/i7300_idle.h>
+#include "dma.h"
+#include "registers.h"
+#include "hw.h"
+
+int ioat_pending_level = 4;
+module_param(ioat_pending_level, int, 0644);
+MODULE_PARM_DESC(ioat_pending_level,
+		 "high-water mark for pushing ioat descriptors (default: 4)");
+
+/* internal functions */
+static void ioat1_cleanup(struct ioat_dma_chan *ioat);
+static void ioat1_dma_start_null_desc(struct ioat_dma_chan *ioat);
+
+/**
+ * ioat_dma_do_interrupt - handler used for single vector interrupt mode
+ * @irq: interrupt id
+ * @data: interrupt data
+ */
+static irqreturn_t ioat_dma_do_interrupt(int irq, void *data)
+{
+	struct ioatdma_device *instance = data;
+	struct ioat_chan_common *chan;
+	unsigned long attnstatus;
+	int bit;
+	u8 intrctrl;
+
+	intrctrl = readb(instance->reg_base + IOAT_INTRCTRL_OFFSET);
+
+	if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN))
+		return IRQ_NONE;
+
+	if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) {
+		writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
+		return IRQ_NONE;
+	}
+
+	attnstatus = readl(instance->reg_base + IOAT_ATTNSTATUS_OFFSET);
+	for_each_bit(bit, &attnstatus, BITS_PER_LONG) {
+		chan = ioat_chan_by_index(instance, bit);
+		tasklet_schedule(&chan->cleanup_task);
+	}
+
+	writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
+	return IRQ_HANDLED;
+}
+
+/**
+ * ioat_dma_do_interrupt_msix - handler used for vector-per-channel interrupt mode
+ * @irq: interrupt id
+ * @data: interrupt data
+ */
+static irqreturn_t ioat_dma_do_interrupt_msix(int irq, void *data)
+{
+	struct ioat_chan_common *chan = data;
+
+	tasklet_schedule(&chan->cleanup_task);
+
+	return IRQ_HANDLED;
+}
+
+static void ioat1_cleanup_tasklet(unsigned long data);
+
+/* common channel initialization */
+void ioat_init_channel(struct ioatdma_device *device,
+		       struct ioat_chan_common *chan, int idx,
+		       void (*timer_fn)(unsigned long),
+		       void (*tasklet)(unsigned long),
+		       unsigned long ioat)
+{
+	struct dma_device *dma = &device->common;
+
+	chan->device = device;
+	chan->reg_base = device->reg_base + (0x80 * (idx + 1));
+	spin_lock_init(&chan->cleanup_lock);
+	chan->common.device = dma;
+	list_add_tail(&chan->common.device_node, &dma->channels);
+	device->idx[idx] = chan;
+	init_timer(&chan->timer);
+	chan->timer.function = timer_fn;
+	chan->timer.data = ioat;
+	tasklet_init(&chan->cleanup_task, tasklet, ioat);
+	tasklet_disable(&chan->cleanup_task);
+}
+
+static void ioat1_timer_event(unsigned long data);
+
+/**
+ * ioat1_dma_enumerate_channels - find and initialize the device's channels
+ * @device: the device to be enumerated
+ */
+static int ioat1_enumerate_channels(struct ioatdma_device *device)
+{
+	u8 xfercap_scale;
+	u32 xfercap;
+	int i;
+	struct ioat_dma_chan *ioat;
+	struct device *dev = &device->pdev->dev;
+	struct dma_device *dma = &device->common;
+
+	INIT_LIST_HEAD(&dma->channels);
+	dma->chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET);
+	dma->chancnt &= 0x1f; /* bits [4:0] valid */
+	if (dma->chancnt > ARRAY_SIZE(device->idx)) {
+		dev_warn(dev, "(%d) exceeds max supported channels (%zu)\n",
+			 dma->chancnt, ARRAY_SIZE(device->idx));
+		dma->chancnt = ARRAY_SIZE(device->idx);
+	}
+	xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET);
+	xfercap_scale &= 0x1f; /* bits [4:0] valid */
+	xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));
+	dev_dbg(dev, "%s: xfercap = %d\n", __func__, xfercap);
+
+#ifdef  CONFIG_I7300_IDLE_IOAT_CHANNEL
+	if (i7300_idle_platform_probe(NULL, NULL, 1) == 0)
+		dma->chancnt--;
+#endif
+	for (i = 0; i < dma->chancnt; i++) {
+		ioat = devm_kzalloc(dev, sizeof(*ioat), GFP_KERNEL);
+		if (!ioat)
+			break;
+
+		ioat_init_channel(device, &ioat->base, i,
+				  ioat1_timer_event,
+				  ioat1_cleanup_tasklet,
+				  (unsigned long) ioat);
+		ioat->xfercap = xfercap;
+		spin_lock_init(&ioat->desc_lock);
+		INIT_LIST_HEAD(&ioat->free_desc);
+		INIT_LIST_HEAD(&ioat->used_desc);
+	}
+	dma->chancnt = i;
+	return i;
+}
+
+/**
+ * ioat_dma_memcpy_issue_pending - push potentially unrecognized appended
+ *                                 descriptors to hw
+ * @chan: DMA channel handle
+ */
+static inline void
+__ioat1_dma_memcpy_issue_pending(struct ioat_dma_chan *ioat)
+{
+	void __iomem *reg_base = ioat->base.reg_base;
+
+	dev_dbg(to_dev(&ioat->base), "%s: pending: %d\n",
+		__func__, ioat->pending);
+	ioat->pending = 0;
+	writeb(IOAT_CHANCMD_APPEND, reg_base + IOAT1_CHANCMD_OFFSET);
+}
+
+static void ioat1_dma_memcpy_issue_pending(struct dma_chan *chan)
+{
+	struct ioat_dma_chan *ioat = to_ioat_chan(chan);
+
+	if (ioat->pending > 0) {
+		spin_lock_bh(&ioat->desc_lock);
+		__ioat1_dma_memcpy_issue_pending(ioat);
+		spin_unlock_bh(&ioat->desc_lock);
+	}
+}
+
+/**
+ * ioat1_reset_channel - restart a channel
+ * @ioat: IOAT DMA channel handle
+ */
+static void ioat1_reset_channel(struct ioat_dma_chan *ioat)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+	void __iomem *reg_base = chan->reg_base;
+	u32 chansts, chanerr;
+
+	dev_warn(to_dev(chan), "reset\n");
+	chanerr = readl(reg_base + IOAT_CHANERR_OFFSET);
+	chansts = *chan->completion & IOAT_CHANSTS_STATUS;
+	if (chanerr) {
+		dev_err(to_dev(chan),
+			"chan%d, CHANSTS = 0x%08x CHANERR = 0x%04x, clearing\n",
+			chan_num(chan), chansts, chanerr);
+		writel(chanerr, reg_base + IOAT_CHANERR_OFFSET);
+	}
+
+	/*
+	 * whack it upside the head with a reset
+	 * and wait for things to settle out.
+	 * force the pending count to a really big negative
+	 * to make sure no one forces an issue_pending
+	 * while we're waiting.
+	 */
+
+	ioat->pending = INT_MIN;
+	writeb(IOAT_CHANCMD_RESET,
+	       reg_base + IOAT_CHANCMD_OFFSET(chan->device->version));
+	set_bit(IOAT_RESET_PENDING, &chan->state);
+	mod_timer(&chan->timer, jiffies + RESET_DELAY);
+}
+
+static dma_cookie_t ioat1_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	struct dma_chan *c = tx->chan;
+	struct ioat_dma_chan *ioat = to_ioat_chan(c);
+	struct ioat_desc_sw *desc = tx_to_ioat_desc(tx);
+	struct ioat_chan_common *chan = &ioat->base;
+	struct ioat_desc_sw *first;
+	struct ioat_desc_sw *chain_tail;
+	dma_cookie_t cookie;
+
+	spin_lock_bh(&ioat->desc_lock);
+	/* cookie incr and addition to used_list must be atomic */
+	cookie = c->cookie;
+	cookie++;
+	if (cookie < 0)
+		cookie = 1;
+	c->cookie = cookie;
+	tx->cookie = cookie;
+	dev_dbg(to_dev(&ioat->base), "%s: cookie: %d\n", __func__, cookie);
+
+	/* write address into NextDescriptor field of last desc in chain */
+	first = to_ioat_desc(desc->tx_list.next);
+	chain_tail = to_ioat_desc(ioat->used_desc.prev);
+	/* make descriptor updates globally visible before chaining */
+	wmb();
+	chain_tail->hw->next = first->txd.phys;
+	list_splice_tail_init(&desc->tx_list, &ioat->used_desc);
+	dump_desc_dbg(ioat, chain_tail);
+	dump_desc_dbg(ioat, first);
+
+	if (!test_and_set_bit(IOAT_COMPLETION_PENDING, &chan->state))
+		mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+
+	ioat->active += desc->hw->tx_cnt;
+	ioat->pending += desc->hw->tx_cnt;
+	if (ioat->pending >= ioat_pending_level)
+		__ioat1_dma_memcpy_issue_pending(ioat);
+	spin_unlock_bh(&ioat->desc_lock);
+
+	return cookie;
+}
+
+/**
+ * ioat_dma_alloc_descriptor - allocate and return a sw and hw descriptor pair
+ * @ioat: the channel supplying the memory pool for the descriptors
+ * @flags: allocation flags
+ */
+static struct ioat_desc_sw *
+ioat_dma_alloc_descriptor(struct ioat_dma_chan *ioat, gfp_t flags)
+{
+	struct ioat_dma_descriptor *desc;
+	struct ioat_desc_sw *desc_sw;
+	struct ioatdma_device *ioatdma_device;
+	dma_addr_t phys;
+
+	ioatdma_device = ioat->base.device;
+	desc = pci_pool_alloc(ioatdma_device->dma_pool, flags, &phys);
+	if (unlikely(!desc))
+		return NULL;
+
+	desc_sw = kzalloc(sizeof(*desc_sw), flags);
+	if (unlikely(!desc_sw)) {
+		pci_pool_free(ioatdma_device->dma_pool, desc, phys);
+		return NULL;
+	}
+
+	memset(desc, 0, sizeof(*desc));
+
+	INIT_LIST_HEAD(&desc_sw->tx_list);
+	dma_async_tx_descriptor_init(&desc_sw->txd, &ioat->base.common);
+	desc_sw->txd.tx_submit = ioat1_tx_submit;
+	desc_sw->hw = desc;
+	desc_sw->txd.phys = phys;
+	set_desc_id(desc_sw, -1);
+
+	return desc_sw;
+}
+
+static int ioat_initial_desc_count = 256;
+module_param(ioat_initial_desc_count, int, 0644);
+MODULE_PARM_DESC(ioat_initial_desc_count,
+		 "ioat1: initial descriptors per channel (default: 256)");
+/**
+ * ioat1_dma_alloc_chan_resources - returns the number of allocated descriptors
+ * @chan: the channel to be filled out
+ */
+static int ioat1_dma_alloc_chan_resources(struct dma_chan *c)
+{
+	struct ioat_dma_chan *ioat = to_ioat_chan(c);
+	struct ioat_chan_common *chan = &ioat->base;
+	struct ioat_desc_sw *desc;
+	u32 chanerr;
+	int i;
+	LIST_HEAD(tmp_list);
+
+	/* have we already been set up? */
+	if (!list_empty(&ioat->free_desc))
+		return ioat->desccount;
+
+	/* Setup register to interrupt and write completion status on error */
+	writew(IOAT_CHANCTRL_RUN, chan->reg_base + IOAT_CHANCTRL_OFFSET);
+
+	chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+	if (chanerr) {
+		dev_err(to_dev(chan), "CHANERR = %x, clearing\n", chanerr);
+		writel(chanerr, chan->reg_base + IOAT_CHANERR_OFFSET);
+	}
+
+	/* Allocate descriptors */
+	for (i = 0; i < ioat_initial_desc_count; i++) {
+		desc = ioat_dma_alloc_descriptor(ioat, GFP_KERNEL);
+		if (!desc) {
+			dev_err(to_dev(chan), "Only %d initial descriptors\n", i);
+			break;
+		}
+		set_desc_id(desc, i);
+		list_add_tail(&desc->node, &tmp_list);
+	}
+	spin_lock_bh(&ioat->desc_lock);
+	ioat->desccount = i;
+	list_splice(&tmp_list, &ioat->free_desc);
+	spin_unlock_bh(&ioat->desc_lock);
+
+	/* allocate a completion writeback area */
+	/* doing 2 32bit writes to mmio since 1 64b write doesn't work */
+	chan->completion = pci_pool_alloc(chan->device->completion_pool,
+					  GFP_KERNEL, &chan->completion_dma);
+	memset(chan->completion, 0, sizeof(*chan->completion));
+	writel(((u64) chan->completion_dma) & 0x00000000FFFFFFFF,
+	       chan->reg_base + IOAT_CHANCMP_OFFSET_LOW);
+	writel(((u64) chan->completion_dma) >> 32,
+	       chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH);
+
+	tasklet_enable(&chan->cleanup_task);
+	ioat1_dma_start_null_desc(ioat);  /* give chain to dma device */
+	dev_dbg(to_dev(chan), "%s: allocated %d descriptors\n",
+		__func__, ioat->desccount);
+	return ioat->desccount;
+}
+
+/**
+ * ioat1_dma_free_chan_resources - release all the descriptors
+ * @chan: the channel to be cleaned
+ */
+static void ioat1_dma_free_chan_resources(struct dma_chan *c)
+{
+	struct ioat_dma_chan *ioat = to_ioat_chan(c);
+	struct ioat_chan_common *chan = &ioat->base;
+	struct ioatdma_device *ioatdma_device = chan->device;
+	struct ioat_desc_sw *desc, *_desc;
+	int in_use_descs = 0;
+
+	/* Before freeing channel resources first check
+	 * if they have been previously allocated for this channel.
+	 */
+	if (ioat->desccount == 0)
+		return;
+
+	tasklet_disable(&chan->cleanup_task);
+	del_timer_sync(&chan->timer);
+	ioat1_cleanup(ioat);
+
+	/* Delay 100ms after reset to allow internal DMA logic to quiesce
+	 * before removing DMA descriptor resources.
+	 */
+	writeb(IOAT_CHANCMD_RESET,
+	       chan->reg_base + IOAT_CHANCMD_OFFSET(chan->device->version));
+	mdelay(100);
+
+	spin_lock_bh(&ioat->desc_lock);
+	list_for_each_entry_safe(desc, _desc, &ioat->used_desc, node) {
+		dev_dbg(to_dev(chan), "%s: freeing %d from used list\n",
+			__func__, desc_id(desc));
+		dump_desc_dbg(ioat, desc);
+		in_use_descs++;
+		list_del(&desc->node);
+		pci_pool_free(ioatdma_device->dma_pool, desc->hw,
+			      desc->txd.phys);
+		kfree(desc);
+	}
+	list_for_each_entry_safe(desc, _desc,
+				 &ioat->free_desc, node) {
+		list_del(&desc->node);
+		pci_pool_free(ioatdma_device->dma_pool, desc->hw,
+			      desc->txd.phys);
+		kfree(desc);
+	}
+	spin_unlock_bh(&ioat->desc_lock);
+
+	pci_pool_free(ioatdma_device->completion_pool,
+		      chan->completion,
+		      chan->completion_dma);
+
+	/* one is ok since we left it on there on purpose */
+	if (in_use_descs > 1)
+		dev_err(to_dev(chan), "Freeing %d in use descriptors!\n",
+			in_use_descs - 1);
+
+	chan->last_completion = 0;
+	chan->completion_dma = 0;
+	ioat->pending = 0;
+	ioat->desccount = 0;
+}
+
+/**
+ * ioat1_dma_get_next_descriptor - return the next available descriptor
+ * @ioat: IOAT DMA channel handle
+ *
+ * Gets the next descriptor from the chain, and must be called with the
+ * channel's desc_lock held.  Allocates more descriptors if the channel
+ * has run out.
+ */
+static struct ioat_desc_sw *
+ioat1_dma_get_next_descriptor(struct ioat_dma_chan *ioat)
+{
+	struct ioat_desc_sw *new;
+
+	if (!list_empty(&ioat->free_desc)) {
+		new = to_ioat_desc(ioat->free_desc.next);
+		list_del(&new->node);
+	} else {
+		/* try to get another desc */
+		new = ioat_dma_alloc_descriptor(ioat, GFP_ATOMIC);
+		if (!new) {
+			dev_err(to_dev(&ioat->base), "alloc failed\n");
+			return NULL;
+		}
+	}
+	dev_dbg(to_dev(&ioat->base), "%s: allocated: %d\n",
+		__func__, desc_id(new));
+	prefetch(new->hw);
+	return new;
+}
+
+static struct dma_async_tx_descriptor *
+ioat1_dma_prep_memcpy(struct dma_chan *c, dma_addr_t dma_dest,
+		      dma_addr_t dma_src, size_t len, unsigned long flags)
+{
+	struct ioat_dma_chan *ioat = to_ioat_chan(c);
+	struct ioat_desc_sw *desc;
+	size_t copy;
+	LIST_HEAD(chain);
+	dma_addr_t src = dma_src;
+	dma_addr_t dest = dma_dest;
+	size_t total_len = len;
+	struct ioat_dma_descriptor *hw = NULL;
+	int tx_cnt = 0;
+
+	spin_lock_bh(&ioat->desc_lock);
+	desc = ioat1_dma_get_next_descriptor(ioat);
+	do {
+		if (!desc)
+			break;
+
+		tx_cnt++;
+		copy = min_t(size_t, len, ioat->xfercap);
+
+		hw = desc->hw;
+		hw->size = copy;
+		hw->ctl = 0;
+		hw->src_addr = src;
+		hw->dst_addr = dest;
+
+		list_add_tail(&desc->node, &chain);
+
+		len -= copy;
+		dest += copy;
+		src += copy;
+		if (len) {
+			struct ioat_desc_sw *next;
+
+			async_tx_ack(&desc->txd);
+			next = ioat1_dma_get_next_descriptor(ioat);
+			hw->next = next ? next->txd.phys : 0;
+			dump_desc_dbg(ioat, desc);
+			desc = next;
+		} else
+			hw->next = 0;
+	} while (len);
+
+	if (!desc) {
+		struct ioat_chan_common *chan = &ioat->base;
+
+		dev_err(to_dev(chan),
+			"chan%d - get_next_desc failed\n", chan_num(chan));
+		list_splice(&chain, &ioat->free_desc);
+		spin_unlock_bh(&ioat->desc_lock);
+		return NULL;
+	}
+	spin_unlock_bh(&ioat->desc_lock);
+
+	desc->txd.flags = flags;
+	desc->len = total_len;
+	list_splice(&chain, &desc->tx_list);
+	hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+	hw->ctl_f.compl_write = 1;
+	hw->tx_cnt = tx_cnt;
+	dump_desc_dbg(ioat, desc);
+
+	return &desc->txd;
+}
+
+static void ioat1_cleanup_tasklet(unsigned long data)
+{
+	struct ioat_dma_chan *chan = (void *)data;
+
+	ioat1_cleanup(chan);
+	writew(IOAT_CHANCTRL_RUN, chan->base.reg_base + IOAT_CHANCTRL_OFFSET);
+}
+
+void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags,
+		    size_t len, struct ioat_dma_descriptor *hw)
+{
+	struct pci_dev *pdev = chan->device->pdev;
+	size_t offset = len - hw->size;
+
+	if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP))
+		ioat_unmap(pdev, hw->dst_addr - offset, len,
+			   PCI_DMA_FROMDEVICE, flags, 1);
+
+	if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP))
+		ioat_unmap(pdev, hw->src_addr - offset, len,
+			   PCI_DMA_TODEVICE, flags, 0);
+}
+
+unsigned long ioat_get_current_completion(struct ioat_chan_common *chan)
+{
+	unsigned long phys_complete;
+	u64 completion;
+
+	completion = *chan->completion;
+	phys_complete = ioat_chansts_to_addr(completion);
+
+	dev_dbg(to_dev(chan), "%s: phys_complete: %#llx\n", __func__,
+		(unsigned long long) phys_complete);
+
+	if (is_ioat_halted(completion)) {
+		u32 chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+		dev_err(to_dev(chan), "Channel halted, chanerr = %x\n",
+			chanerr);
+
+		/* TODO do something to salvage the situation */
+	}
+
+	return phys_complete;
+}
+
+bool ioat_cleanup_preamble(struct ioat_chan_common *chan,
+			   unsigned long *phys_complete)
+{
+	*phys_complete = ioat_get_current_completion(chan);
+	if (*phys_complete == chan->last_completion)
+		return false;
+	clear_bit(IOAT_COMPLETION_ACK, &chan->state);
+	mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+
+	return true;
+}
+
+static void __cleanup(struct ioat_dma_chan *ioat, unsigned long phys_complete)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+	struct list_head *_desc, *n;
+	struct dma_async_tx_descriptor *tx;
+
+	dev_dbg(to_dev(chan), "%s: phys_complete: %lx\n",
+		 __func__, phys_complete);
+	list_for_each_safe(_desc, n, &ioat->used_desc) {
+		struct ioat_desc_sw *desc;
+
+		prefetch(n);
+		desc = list_entry(_desc, typeof(*desc), node);
+		tx = &desc->txd;
+		/*
+		 * Incoming DMA requests may use multiple descriptors,
+		 * due to exceeding xfercap, perhaps. If so, only the
+		 * last one will have a cookie, and require unmapping.
+		 */
+		dump_desc_dbg(ioat, desc);
+		if (tx->cookie) {
+			chan->completed_cookie = tx->cookie;
+			tx->cookie = 0;
+			ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw);
+			ioat->active -= desc->hw->tx_cnt;
+			if (tx->callback) {
+				tx->callback(tx->callback_param);
+				tx->callback = NULL;
+			}
+		}
+
+		if (tx->phys != phys_complete) {
+			/*
+			 * a completed entry, but not the last, so clean
+			 * up if the client is done with the descriptor
+			 */
+			if (async_tx_test_ack(tx))
+				list_move_tail(&desc->node, &ioat->free_desc);
+		} else {
+			/*
+			 * last used desc. Do not remove, so we can
+			 * append from it.
+			 */
+
+			/* if nothing else is pending, cancel the
+			 * completion timeout
+			 */
+			if (n == &ioat->used_desc) {
+				dev_dbg(to_dev(chan),
+					"%s cancel completion timeout\n",
+					__func__);
+				clear_bit(IOAT_COMPLETION_PENDING, &chan->state);
+			}
+
+			/* TODO check status bits? */
+			break;
+		}
+	}
+
+	chan->last_completion = phys_complete;
+}
+
+/**
+ * ioat1_cleanup - cleanup up finished descriptors
+ * @chan: ioat channel to be cleaned up
+ *
+ * To prevent lock contention we defer cleanup when the locks are
+ * contended with a terminal timeout that forces cleanup and catches
+ * completion notification errors.
+ */
+static void ioat1_cleanup(struct ioat_dma_chan *ioat)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+	unsigned long phys_complete;
+
+	prefetch(chan->completion);
+
+	if (!spin_trylock_bh(&chan->cleanup_lock))
+		return;
+
+	if (!ioat_cleanup_preamble(chan, &phys_complete)) {
+		spin_unlock_bh(&chan->cleanup_lock);
+		return;
+	}
+
+	if (!spin_trylock_bh(&ioat->desc_lock)) {
+		spin_unlock_bh(&chan->cleanup_lock);
+		return;
+	}
+
+	__cleanup(ioat, phys_complete);
+
+	spin_unlock_bh(&ioat->desc_lock);
+	spin_unlock_bh(&chan->cleanup_lock);
+}
+
+static void ioat1_timer_event(unsigned long data)
+{
+	struct ioat_dma_chan *ioat = (void *) data;
+	struct ioat_chan_common *chan = &ioat->base;
+
+	dev_dbg(to_dev(chan), "%s: state: %lx\n", __func__, chan->state);
+
+	spin_lock_bh(&chan->cleanup_lock);
+	if (test_and_clear_bit(IOAT_RESET_PENDING, &chan->state)) {
+		struct ioat_desc_sw *desc;
+
+		spin_lock_bh(&ioat->desc_lock);
+
+		/* restart active descriptors */
+		desc = to_ioat_desc(ioat->used_desc.prev);
+		ioat_set_chainaddr(ioat, desc->txd.phys);
+		ioat_start(chan);
+
+		ioat->pending = 0;
+		set_bit(IOAT_COMPLETION_PENDING, &chan->state);
+		mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+		spin_unlock_bh(&ioat->desc_lock);
+	} else if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) {
+		unsigned long phys_complete;
+
+		spin_lock_bh(&ioat->desc_lock);
+		/* if we haven't made progress and we have already
+		 * acknowledged a pending completion once, then be more
+		 * forceful with a restart
+		 */
+		if (ioat_cleanup_preamble(chan, &phys_complete))
+			__cleanup(ioat, phys_complete);
+		else if (test_bit(IOAT_COMPLETION_ACK, &chan->state))
+			ioat1_reset_channel(ioat);
+		else {
+			u64 status = ioat_chansts(chan);
+
+			/* manually update the last completion address */
+			if (ioat_chansts_to_addr(status) != 0)
+				*chan->completion = status;
+
+			set_bit(IOAT_COMPLETION_ACK, &chan->state);
+			mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+		}
+		spin_unlock_bh(&ioat->desc_lock);
+	}
+	spin_unlock_bh(&chan->cleanup_lock);
+}
+
+static enum dma_status
+ioat1_dma_is_complete(struct dma_chan *c, dma_cookie_t cookie,
+		      dma_cookie_t *done, dma_cookie_t *used)
+{
+	struct ioat_dma_chan *ioat = to_ioat_chan(c);
+
+	if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS)
+		return DMA_SUCCESS;
+
+	ioat1_cleanup(ioat);
+
+	return ioat_is_complete(c, cookie, done, used);
+}
+
+static void ioat1_dma_start_null_desc(struct ioat_dma_chan *ioat)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+	struct ioat_desc_sw *desc;
+	struct ioat_dma_descriptor *hw;
+
+	spin_lock_bh(&ioat->desc_lock);
+
+	desc = ioat1_dma_get_next_descriptor(ioat);
+
+	if (!desc) {
+		dev_err(to_dev(chan),
+			"Unable to start null desc - get next desc failed\n");
+		spin_unlock_bh(&ioat->desc_lock);
+		return;
+	}
+
+	hw = desc->hw;
+	hw->ctl = 0;
+	hw->ctl_f.null = 1;
+	hw->ctl_f.int_en = 1;
+	hw->ctl_f.compl_write = 1;
+	/* set size to non-zero value (channel returns error when size is 0) */
+	hw->size = NULL_DESC_BUFFER_SIZE;
+	hw->src_addr = 0;
+	hw->dst_addr = 0;
+	async_tx_ack(&desc->txd);
+	hw->next = 0;
+	list_add_tail(&desc->node, &ioat->used_desc);
+	dump_desc_dbg(ioat, desc);
+
+	ioat_set_chainaddr(ioat, desc->txd.phys);
+	ioat_start(chan);
+	spin_unlock_bh(&ioat->desc_lock);
+}
+
+/*
+ * Perform a IOAT transaction to verify the HW works.
+ */
+#define IOAT_TEST_SIZE 2000
+
+static void __devinit ioat_dma_test_callback(void *dma_async_param)
+{
+	struct completion *cmp = dma_async_param;
+
+	complete(cmp);
+}
+
+/**
+ * ioat_dma_self_test - Perform a IOAT transaction to verify the HW works.
+ * @device: device to be tested
+ */
+int __devinit ioat_dma_self_test(struct ioatdma_device *device)
+{
+	int i;
+	u8 *src;
+	u8 *dest;
+	struct dma_device *dma = &device->common;
+	struct device *dev = &device->pdev->dev;
+	struct dma_chan *dma_chan;
+	struct dma_async_tx_descriptor *tx;
+	dma_addr_t dma_dest, dma_src;
+	dma_cookie_t cookie;
+	int err = 0;
+	struct completion cmp;
+	unsigned long tmo;
+	unsigned long flags;
+
+	src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
+	if (!src)
+		return -ENOMEM;
+	dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
+	if (!dest) {
+		kfree(src);
+		return -ENOMEM;
+	}
+
+	/* Fill in src buffer */
+	for (i = 0; i < IOAT_TEST_SIZE; i++)
+		src[i] = (u8)i;
+
+	/* Start copy, using first DMA channel */
+	dma_chan = container_of(dma->channels.next, struct dma_chan,
+				device_node);
+	if (dma->device_alloc_chan_resources(dma_chan) < 1) {
+		dev_err(dev, "selftest cannot allocate chan resource\n");
+		err = -ENODEV;
+		goto out;
+	}
+
+	dma_src = dma_map_single(dev, src, IOAT_TEST_SIZE, DMA_TO_DEVICE);
+	dma_dest = dma_map_single(dev, dest, IOAT_TEST_SIZE, DMA_FROM_DEVICE);
+	flags = DMA_COMPL_SRC_UNMAP_SINGLE | DMA_COMPL_DEST_UNMAP_SINGLE |
+		DMA_PREP_INTERRUPT;
+	tx = device->common.device_prep_dma_memcpy(dma_chan, dma_dest, dma_src,
+						   IOAT_TEST_SIZE, flags);
+	if (!tx) {
+		dev_err(dev, "Self-test prep failed, disabling\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	async_tx_ack(tx);
+	init_completion(&cmp);
+	tx->callback = ioat_dma_test_callback;
+	tx->callback_param = &cmp;
+	cookie = tx->tx_submit(tx);
+	if (cookie < 0) {
+		dev_err(dev, "Self-test setup failed, disabling\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+	dma->device_issue_pending(dma_chan);
+
+	tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
+
+	if (tmo == 0 ||
+	    dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL)
+					!= DMA_SUCCESS) {
+		dev_err(dev, "Self-test copy timed out, disabling\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+	if (memcmp(src, dest, IOAT_TEST_SIZE)) {
+		dev_err(dev, "Self-test copy failed compare, disabling\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+free_resources:
+	dma->device_free_chan_resources(dma_chan);
+out:
+	kfree(src);
+	kfree(dest);
+	return err;
+}
+
+static char ioat_interrupt_style[32] = "msix";
+module_param_string(ioat_interrupt_style, ioat_interrupt_style,
+		    sizeof(ioat_interrupt_style), 0644);
+MODULE_PARM_DESC(ioat_interrupt_style,
+		 "set ioat interrupt style: msix (default), "
+		 "msix-single-vector, msi, intx)");
+
+/**
+ * ioat_dma_setup_interrupts - setup interrupt handler
+ * @device: ioat device
+ */
+static int ioat_dma_setup_interrupts(struct ioatdma_device *device)
+{
+	struct ioat_chan_common *chan;
+	struct pci_dev *pdev = device->pdev;
+	struct device *dev = &pdev->dev;
+	struct msix_entry *msix;
+	int i, j, msixcnt;
+	int err = -EINVAL;
+	u8 intrctrl = 0;
+
+	if (!strcmp(ioat_interrupt_style, "msix"))
+		goto msix;
+	if (!strcmp(ioat_interrupt_style, "msix-single-vector"))
+		goto msix_single_vector;
+	if (!strcmp(ioat_interrupt_style, "msi"))
+		goto msi;
+	if (!strcmp(ioat_interrupt_style, "intx"))
+		goto intx;
+	dev_err(dev, "invalid ioat_interrupt_style %s\n", ioat_interrupt_style);
+	goto err_no_irq;
+
+msix:
+	/* The number of MSI-X vectors should equal the number of channels */
+	msixcnt = device->common.chancnt;
+	for (i = 0; i < msixcnt; i++)
+		device->msix_entries[i].entry = i;
+
+	err = pci_enable_msix(pdev, device->msix_entries, msixcnt);
+	if (err < 0)
+		goto msi;
+	if (err > 0)
+		goto msix_single_vector;
+
+	for (i = 0; i < msixcnt; i++) {
+		msix = &device->msix_entries[i];
+		chan = ioat_chan_by_index(device, i);
+		err = devm_request_irq(dev, msix->vector,
+				       ioat_dma_do_interrupt_msix, 0,
+				       "ioat-msix", chan);
+		if (err) {
+			for (j = 0; j < i; j++) {
+				msix = &device->msix_entries[j];
+				chan = ioat_chan_by_index(device, j);
+				devm_free_irq(dev, msix->vector, chan);
+			}
+			goto msix_single_vector;
+		}
+	}
+	intrctrl |= IOAT_INTRCTRL_MSIX_VECTOR_CONTROL;
+	goto done;
+
+msix_single_vector:
+	msix = &device->msix_entries[0];
+	msix->entry = 0;
+	err = pci_enable_msix(pdev, device->msix_entries, 1);
+	if (err)
+		goto msi;
+
+	err = devm_request_irq(dev, msix->vector, ioat_dma_do_interrupt, 0,
+			       "ioat-msix", device);
+	if (err) {
+		pci_disable_msix(pdev);
+		goto msi;
+	}
+	goto done;
+
+msi:
+	err = pci_enable_msi(pdev);
+	if (err)
+		goto intx;
+
+	err = devm_request_irq(dev, pdev->irq, ioat_dma_do_interrupt, 0,
+			       "ioat-msi", device);
+	if (err) {
+		pci_disable_msi(pdev);
+		goto intx;
+	}
+	goto done;
+
+intx:
+	err = devm_request_irq(dev, pdev->irq, ioat_dma_do_interrupt,
+			       IRQF_SHARED, "ioat-intx", device);
+	if (err)
+		goto err_no_irq;
+
+done:
+	if (device->intr_quirk)
+		device->intr_quirk(device);
+	intrctrl |= IOAT_INTRCTRL_MASTER_INT_EN;
+	writeb(intrctrl, device->reg_base + IOAT_INTRCTRL_OFFSET);
+	return 0;
+
+err_no_irq:
+	/* Disable all interrupt generation */
+	writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
+	dev_err(dev, "no usable interrupts\n");
+	return err;
+}
+
+static void ioat_disable_interrupts(struct ioatdma_device *device)
+{
+	/* Disable all interrupt generation */
+	writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
+}
+
+int __devinit ioat_probe(struct ioatdma_device *device)
+{
+	int err = -ENODEV;
+	struct dma_device *dma = &device->common;
+	struct pci_dev *pdev = device->pdev;
+	struct device *dev = &pdev->dev;
+
+	/* DMA coherent memory pool for DMA descriptor allocations */
+	device->dma_pool = pci_pool_create("dma_desc_pool", pdev,
+					   sizeof(struct ioat_dma_descriptor),
+					   64, 0);
+	if (!device->dma_pool) {
+		err = -ENOMEM;
+		goto err_dma_pool;
+	}
+
+	device->completion_pool = pci_pool_create("completion_pool", pdev,
+						  sizeof(u64), SMP_CACHE_BYTES,
+						  SMP_CACHE_BYTES);
+
+	if (!device->completion_pool) {
+		err = -ENOMEM;
+		goto err_completion_pool;
+	}
+
+	device->enumerate_channels(device);
+
+	dma_cap_set(DMA_MEMCPY, dma->cap_mask);
+	dma->dev = &pdev->dev;
+
+	if (!dma->chancnt) {
+		dev_err(dev, "zero channels detected\n");
+		goto err_setup_interrupts;
+	}
+
+	err = ioat_dma_setup_interrupts(device);
+	if (err)
+		goto err_setup_interrupts;
+
+	err = device->self_test(device);
+	if (err)
+		goto err_self_test;
+
+	return 0;
+
+err_self_test:
+	ioat_disable_interrupts(device);
+err_setup_interrupts:
+	pci_pool_destroy(device->completion_pool);
+err_completion_pool:
+	pci_pool_destroy(device->dma_pool);
+err_dma_pool:
+	return err;
+}
+
+int __devinit ioat_register(struct ioatdma_device *device)
+{
+	int err = dma_async_device_register(&device->common);
+
+	if (err) {
+		ioat_disable_interrupts(device);
+		pci_pool_destroy(device->completion_pool);
+		pci_pool_destroy(device->dma_pool);
+	}
+
+	return err;
+}
+
+/* ioat1_intr_quirk - fix up dma ctrl register to enable / disable msi */
+static void ioat1_intr_quirk(struct ioatdma_device *device)
+{
+	struct pci_dev *pdev = device->pdev;
+	u32 dmactrl;
+
+	pci_read_config_dword(pdev, IOAT_PCI_DMACTRL_OFFSET, &dmactrl);
+	if (pdev->msi_enabled)
+		dmactrl |= IOAT_PCI_DMACTRL_MSI_EN;
+	else
+		dmactrl &= ~IOAT_PCI_DMACTRL_MSI_EN;
+	pci_write_config_dword(pdev, IOAT_PCI_DMACTRL_OFFSET, dmactrl);
+}
+
+static ssize_t ring_size_show(struct dma_chan *c, char *page)
+{
+	struct ioat_dma_chan *ioat = to_ioat_chan(c);
+
+	return sprintf(page, "%d\n", ioat->desccount);
+}
+static struct ioat_sysfs_entry ring_size_attr = __ATTR_RO(ring_size);
+
+static ssize_t ring_active_show(struct dma_chan *c, char *page)
+{
+	struct ioat_dma_chan *ioat = to_ioat_chan(c);
+
+	return sprintf(page, "%d\n", ioat->active);
+}
+static struct ioat_sysfs_entry ring_active_attr = __ATTR_RO(ring_active);
+
+static ssize_t cap_show(struct dma_chan *c, char *page)
+{
+	struct dma_device *dma = c->device;
+
+	return sprintf(page, "copy%s%s%s%s%s%s\n",
+		       dma_has_cap(DMA_PQ, dma->cap_mask) ? " pq" : "",
+		       dma_has_cap(DMA_PQ_VAL, dma->cap_mask) ? " pq_val" : "",
+		       dma_has_cap(DMA_XOR, dma->cap_mask) ? " xor" : "",
+		       dma_has_cap(DMA_XOR_VAL, dma->cap_mask) ? " xor_val" : "",
+		       dma_has_cap(DMA_MEMSET, dma->cap_mask)  ? " fill" : "",
+		       dma_has_cap(DMA_INTERRUPT, dma->cap_mask) ? " intr" : "");
+
+}
+struct ioat_sysfs_entry ioat_cap_attr = __ATTR_RO(cap);
+
+static ssize_t version_show(struct dma_chan *c, char *page)
+{
+	struct dma_device *dma = c->device;
+	struct ioatdma_device *device = to_ioatdma_device(dma);
+
+	return sprintf(page, "%d.%d\n",
+		       device->version >> 4, device->version & 0xf);
+}
+struct ioat_sysfs_entry ioat_version_attr = __ATTR_RO(version);
+
+static struct attribute *ioat1_attrs[] = {
+	&ring_size_attr.attr,
+	&ring_active_attr.attr,
+	&ioat_cap_attr.attr,
+	&ioat_version_attr.attr,
+	NULL,
+};
+
+static ssize_t
+ioat_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct ioat_sysfs_entry *entry;
+	struct ioat_chan_common *chan;
+
+	entry = container_of(attr, struct ioat_sysfs_entry, attr);
+	chan = container_of(kobj, struct ioat_chan_common, kobj);
+
+	if (!entry->show)
+		return -EIO;
+	return entry->show(&chan->common, page);
+}
+
+struct sysfs_ops ioat_sysfs_ops = {
+	.show	= ioat_attr_show,
+};
+
+static struct kobj_type ioat1_ktype = {
+	.sysfs_ops = &ioat_sysfs_ops,
+	.default_attrs = ioat1_attrs,
+};
+
+void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type)
+{
+	struct dma_device *dma = &device->common;
+	struct dma_chan *c;
+
+	list_for_each_entry(c, &dma->channels, device_node) {
+		struct ioat_chan_common *chan = to_chan_common(c);
+		struct kobject *parent = &c->dev->device.kobj;
+		int err;
+
+		err = kobject_init_and_add(&chan->kobj, type, parent, "quickdata");
+		if (err) {
+			dev_warn(to_dev(chan),
+				 "sysfs init error (%d), continuing...\n", err);
+			kobject_put(&chan->kobj);
+			set_bit(IOAT_KOBJ_INIT_FAIL, &chan->state);
+		}
+	}
+}
+
+void ioat_kobject_del(struct ioatdma_device *device)
+{
+	struct dma_device *dma = &device->common;
+	struct dma_chan *c;
+
+	list_for_each_entry(c, &dma->channels, device_node) {
+		struct ioat_chan_common *chan = to_chan_common(c);
+
+		if (!test_bit(IOAT_KOBJ_INIT_FAIL, &chan->state)) {
+			kobject_del(&chan->kobj);
+			kobject_put(&chan->kobj);
+		}
+	}
+}
+
+int __devinit ioat1_dma_probe(struct ioatdma_device *device, int dca)
+{
+	struct pci_dev *pdev = device->pdev;
+	struct dma_device *dma;
+	int err;
+
+	device->intr_quirk = ioat1_intr_quirk;
+	device->enumerate_channels = ioat1_enumerate_channels;
+	device->self_test = ioat_dma_self_test;
+	dma = &device->common;
+	dma->device_prep_dma_memcpy = ioat1_dma_prep_memcpy;
+	dma->device_issue_pending = ioat1_dma_memcpy_issue_pending;
+	dma->device_alloc_chan_resources = ioat1_dma_alloc_chan_resources;
+	dma->device_free_chan_resources = ioat1_dma_free_chan_resources;
+	dma->device_is_tx_complete = ioat1_dma_is_complete;
+
+	err = ioat_probe(device);
+	if (err)
+		return err;
+	ioat_set_tcp_copy_break(4096);
+	err = ioat_register(device);
+	if (err)
+		return err;
+	ioat_kobject_add(device, &ioat1_ktype);
+
+	if (dca)
+		device->dca = ioat_dca_init(pdev, device->reg_base);
+
+	return err;
+}
+
+void __devexit ioat_dma_remove(struct ioatdma_device *device)
+{
+	struct dma_device *dma = &device->common;
+
+	ioat_disable_interrupts(device);
+
+	ioat_kobject_del(device);
+
+	dma_async_device_unregister(dma);
+
+	pci_pool_destroy(device->dma_pool);
+	pci_pool_destroy(device->completion_pool);
+
+	INIT_LIST_HEAD(&dma->channels);
+}
diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h
new file mode 100644
index 00000000000..c14fdfeb7f3
--- /dev/null
+++ b/drivers/dma/ioat/dma.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef IOATDMA_H
+#define IOATDMA_H
+
+#include <linux/dmaengine.h>
+#include "hw.h"
+#include "registers.h"
+#include <linux/init.h>
+#include <linux/dmapool.h>
+#include <linux/cache.h>
+#include <linux/pci_ids.h>
+#include <net/tcp.h>
+
+#define IOAT_DMA_VERSION  "4.00"
+
+#define IOAT_LOW_COMPLETION_MASK	0xffffffc0
+#define IOAT_DMA_DCA_ANY_CPU		~0
+
+#define to_ioatdma_device(dev) container_of(dev, struct ioatdma_device, common)
+#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node)
+#define tx_to_ioat_desc(tx) container_of(tx, struct ioat_desc_sw, txd)
+#define to_dev(ioat_chan) (&(ioat_chan)->device->pdev->dev)
+
+#define chan_num(ch) ((int)((ch)->reg_base - (ch)->device->reg_base) / 0x80)
+
+/*
+ * workaround for IOAT ver.3.0 null descriptor issue
+ * (channel returns error when size is 0)
+ */
+#define NULL_DESC_BUFFER_SIZE 1
+
+/**
+ * struct ioatdma_device - internal representation of a IOAT device
+ * @pdev: PCI-Express device
+ * @reg_base: MMIO register space base address
+ * @dma_pool: for allocating DMA descriptors
+ * @common: embedded struct dma_device
+ * @version: version of ioatdma device
+ * @msix_entries: irq handlers
+ * @idx: per channel data
+ * @dca: direct cache access context
+ * @intr_quirk: interrupt setup quirk (for ioat_v1 devices)
+ * @enumerate_channels: hw version specific channel enumeration
+ * @cleanup_tasklet: select between the v2 and v3 cleanup routines
+ * @timer_fn: select between the v2 and v3 timer watchdog routines
+ * @self_test: hardware version specific self test for each supported op type
+ *
+ * Note: the v3 cleanup routine supports raid operations
+ */
+struct ioatdma_device {
+	struct pci_dev *pdev;
+	void __iomem *reg_base;
+	struct pci_pool *dma_pool;
+	struct pci_pool *completion_pool;
+	struct dma_device common;
+	u8 version;
+	struct msix_entry msix_entries[4];
+	struct ioat_chan_common *idx[4];
+	struct dca_provider *dca;
+	void (*intr_quirk)(struct ioatdma_device *device);
+	int (*enumerate_channels)(struct ioatdma_device *device);
+	void (*cleanup_tasklet)(unsigned long data);
+	void (*timer_fn)(unsigned long data);
+	int (*self_test)(struct ioatdma_device *device);
+};
+
+struct ioat_chan_common {
+	struct dma_chan common;
+	void __iomem *reg_base;
+	unsigned long last_completion;
+	spinlock_t cleanup_lock;
+	dma_cookie_t completed_cookie;
+	unsigned long state;
+	#define IOAT_COMPLETION_PENDING 0
+	#define IOAT_COMPLETION_ACK 1
+	#define IOAT_RESET_PENDING 2
+	#define IOAT_KOBJ_INIT_FAIL 3
+	struct timer_list timer;
+	#define COMPLETION_TIMEOUT msecs_to_jiffies(100)
+	#define IDLE_TIMEOUT msecs_to_jiffies(2000)
+	#define RESET_DELAY msecs_to_jiffies(100)
+	struct ioatdma_device *device;
+	dma_addr_t completion_dma;
+	u64 *completion;
+	struct tasklet_struct cleanup_task;
+	struct kobject kobj;
+};
+
+struct ioat_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(struct dma_chan *, char *);
+};
+
+/**
+ * struct ioat_dma_chan - internal representation of a DMA channel
+ */
+struct ioat_dma_chan {
+	struct ioat_chan_common base;
+
+	size_t xfercap;	/* XFERCAP register value expanded out */
+
+	spinlock_t desc_lock;
+	struct list_head free_desc;
+	struct list_head used_desc;
+
+	int pending;
+	u16 desccount;
+	u16 active;
+};
+
+static inline struct ioat_chan_common *to_chan_common(struct dma_chan *c)
+{
+	return container_of(c, struct ioat_chan_common, common);
+}
+
+static inline struct ioat_dma_chan *to_ioat_chan(struct dma_chan *c)
+{
+	struct ioat_chan_common *chan = to_chan_common(c);
+
+	return container_of(chan, struct ioat_dma_chan, base);
+}
+
+/**
+ * ioat_is_complete - poll the status of an ioat transaction
+ * @c: channel handle
+ * @cookie: transaction identifier
+ * @done: if set, updated with last completed transaction
+ * @used: if set, updated with last used transaction
+ */
+static inline enum dma_status
+ioat_is_complete(struct dma_chan *c, dma_cookie_t cookie,
+		 dma_cookie_t *done, dma_cookie_t *used)
+{
+	struct ioat_chan_common *chan = to_chan_common(c);
+	dma_cookie_t last_used;
+	dma_cookie_t last_complete;
+
+	last_used = c->cookie;
+	last_complete = chan->completed_cookie;
+
+	if (done)
+		*done = last_complete;
+	if (used)
+		*used = last_used;
+
+	return dma_async_is_complete(cookie, last_complete, last_used);
+}
+
+/* wrapper around hardware descriptor format + additional software fields */
+
+/**
+ * struct ioat_desc_sw - wrapper around hardware descriptor
+ * @hw: hardware DMA descriptor (for memcpy)
+ * @node: this descriptor will either be on the free list,
+ *     or attached to a transaction list (tx_list)
+ * @txd: the generic software descriptor for all engines
+ * @id: identifier for debug
+ */
+struct ioat_desc_sw {
+	struct ioat_dma_descriptor *hw;
+	struct list_head node;
+	size_t len;
+	struct list_head tx_list;
+	struct dma_async_tx_descriptor txd;
+	#ifdef DEBUG
+	int id;
+	#endif
+};
+
+#ifdef DEBUG
+#define set_desc_id(desc, i) ((desc)->id = (i))
+#define desc_id(desc) ((desc)->id)
+#else
+#define set_desc_id(desc, i)
+#define desc_id(desc) (0)
+#endif
+
+static inline void
+__dump_desc_dbg(struct ioat_chan_common *chan, struct ioat_dma_descriptor *hw,
+		struct dma_async_tx_descriptor *tx, int id)
+{
+	struct device *dev = to_dev(chan);
+
+	dev_dbg(dev, "desc[%d]: (%#llx->%#llx) cookie: %d flags: %#x"
+		" ctl: %#x (op: %d int_en: %d compl: %d)\n", id,
+		(unsigned long long) tx->phys,
+		(unsigned long long) hw->next, tx->cookie, tx->flags,
+		hw->ctl, hw->ctl_f.op, hw->ctl_f.int_en, hw->ctl_f.compl_write);
+}
+
+#define dump_desc_dbg(c, d) \
+	({ if (d) __dump_desc_dbg(&c->base, d->hw, &d->txd, desc_id(d)); 0; })
+
+static inline void ioat_set_tcp_copy_break(unsigned long copybreak)
+{
+	#ifdef CONFIG_NET_DMA
+	sysctl_tcp_dma_copybreak = copybreak;
+	#endif
+}
+
+static inline struct ioat_chan_common *
+ioat_chan_by_index(struct ioatdma_device *device, int index)
+{
+	return device->idx[index];
+}
+
+static inline u64 ioat_chansts(struct ioat_chan_common *chan)
+{
+	u8 ver = chan->device->version;
+	u64 status;
+	u32 status_lo;
+
+	/* We need to read the low address first as this causes the
+	 * chipset to latch the upper bits for the subsequent read
+	 */
+	status_lo = readl(chan->reg_base + IOAT_CHANSTS_OFFSET_LOW(ver));
+	status = readl(chan->reg_base + IOAT_CHANSTS_OFFSET_HIGH(ver));
+	status <<= 32;
+	status |= status_lo;
+
+	return status;
+}
+
+static inline void ioat_start(struct ioat_chan_common *chan)
+{
+	u8 ver = chan->device->version;
+
+	writeb(IOAT_CHANCMD_START, chan->reg_base + IOAT_CHANCMD_OFFSET(ver));
+}
+
+static inline u64 ioat_chansts_to_addr(u64 status)
+{
+	return status & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
+}
+
+static inline u32 ioat_chanerr(struct ioat_chan_common *chan)
+{
+	return readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+}
+
+static inline void ioat_suspend(struct ioat_chan_common *chan)
+{
+	u8 ver = chan->device->version;
+
+	writeb(IOAT_CHANCMD_SUSPEND, chan->reg_base + IOAT_CHANCMD_OFFSET(ver));
+}
+
+static inline void ioat_set_chainaddr(struct ioat_dma_chan *ioat, u64 addr)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+
+	writel(addr & 0x00000000FFFFFFFF,
+	       chan->reg_base + IOAT1_CHAINADDR_OFFSET_LOW);
+	writel(addr >> 32,
+	       chan->reg_base + IOAT1_CHAINADDR_OFFSET_HIGH);
+}
+
+static inline bool is_ioat_active(unsigned long status)
+{
+	return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_ACTIVE);
+}
+
+static inline bool is_ioat_idle(unsigned long status)
+{
+	return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_DONE);
+}
+
+static inline bool is_ioat_halted(unsigned long status)
+{
+	return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_HALTED);
+}
+
+static inline bool is_ioat_suspended(unsigned long status)
+{
+	return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_SUSPENDED);
+}
+
+/* channel was fatally programmed */
+static inline bool is_ioat_bug(unsigned long err)
+{
+	return !!(err & (IOAT_CHANERR_SRC_ADDR_ERR|IOAT_CHANERR_DEST_ADDR_ERR|
+			 IOAT_CHANERR_NEXT_ADDR_ERR|IOAT_CHANERR_CONTROL_ERR|
+			 IOAT_CHANERR_LENGTH_ERR));
+}
+
+static inline void ioat_unmap(struct pci_dev *pdev, dma_addr_t addr, size_t len,
+			      int direction, enum dma_ctrl_flags flags, bool dst)
+{
+	if ((dst && (flags & DMA_COMPL_DEST_UNMAP_SINGLE)) ||
+	    (!dst && (flags & DMA_COMPL_SRC_UNMAP_SINGLE)))
+		pci_unmap_single(pdev, addr, len, direction);
+	else
+		pci_unmap_page(pdev, addr, len, direction);
+}
+
+int __devinit ioat_probe(struct ioatdma_device *device);
+int __devinit ioat_register(struct ioatdma_device *device);
+int __devinit ioat1_dma_probe(struct ioatdma_device *dev, int dca);
+int __devinit ioat_dma_self_test(struct ioatdma_device *device);
+void __devexit ioat_dma_remove(struct ioatdma_device *device);
+struct dca_provider * __devinit ioat_dca_init(struct pci_dev *pdev,
+					      void __iomem *iobase);
+unsigned long ioat_get_current_completion(struct ioat_chan_common *chan);
+void ioat_init_channel(struct ioatdma_device *device,
+		       struct ioat_chan_common *chan, int idx,
+		       void (*timer_fn)(unsigned long),
+		       void (*tasklet)(unsigned long),
+		       unsigned long ioat);
+void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags,
+		    size_t len, struct ioat_dma_descriptor *hw);
+bool ioat_cleanup_preamble(struct ioat_chan_common *chan,
+			   unsigned long *phys_complete);
+void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type);
+void ioat_kobject_del(struct ioatdma_device *device);
+extern struct sysfs_ops ioat_sysfs_ops;
+extern struct ioat_sysfs_entry ioat_version_attr;
+extern struct ioat_sysfs_entry ioat_cap_attr;
+#endif /* IOATDMA_H */
diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c
new file mode 100644
index 00000000000..96ffab7d37a
--- /dev/null
+++ b/drivers/dma/ioat/dma_v2.c
@@ -0,0 +1,871 @@
+/*
+ * Intel I/OAT DMA Linux driver
+ * Copyright(c) 2004 - 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ */
+
+/*
+ * This driver supports an Intel I/OAT DMA engine (versions >= 2), which
+ * does asynchronous data movement and checksumming operations.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/workqueue.h>
+#include <linux/i7300_idle.h>
+#include "dma.h"
+#include "dma_v2.h"
+#include "registers.h"
+#include "hw.h"
+
+int ioat_ring_alloc_order = 8;
+module_param(ioat_ring_alloc_order, int, 0644);
+MODULE_PARM_DESC(ioat_ring_alloc_order,
+		 "ioat2+: allocate 2^n descriptors per channel"
+		 " (default: 8 max: 16)");
+static int ioat_ring_max_alloc_order = IOAT_MAX_ORDER;
+module_param(ioat_ring_max_alloc_order, int, 0644);
+MODULE_PARM_DESC(ioat_ring_max_alloc_order,
+		 "ioat2+: upper limit for ring size (default: 16)");
+
+void __ioat2_issue_pending(struct ioat2_dma_chan *ioat)
+{
+	void * __iomem reg_base = ioat->base.reg_base;
+
+	ioat->pending = 0;
+	ioat->dmacount += ioat2_ring_pending(ioat);
+	ioat->issued = ioat->head;
+	/* make descriptor updates globally visible before notifying channel */
+	wmb();
+	writew(ioat->dmacount, reg_base + IOAT_CHAN_DMACOUNT_OFFSET);
+	dev_dbg(to_dev(&ioat->base),
+		"%s: head: %#x tail: %#x issued: %#x count: %#x\n",
+		__func__, ioat->head, ioat->tail, ioat->issued, ioat->dmacount);
+}
+
+void ioat2_issue_pending(struct dma_chan *chan)
+{
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(chan);
+
+	spin_lock_bh(&ioat->ring_lock);
+	if (ioat->pending == 1)
+		__ioat2_issue_pending(ioat);
+	spin_unlock_bh(&ioat->ring_lock);
+}
+
+/**
+ * ioat2_update_pending - log pending descriptors
+ * @ioat: ioat2+ channel
+ *
+ * set pending to '1' unless pending is already set to '2', pending == 2
+ * indicates that submission is temporarily blocked due to an in-flight
+ * reset.  If we are already above the ioat_pending_level threshold then
+ * just issue pending.
+ *
+ * called with ring_lock held
+ */
+static void ioat2_update_pending(struct ioat2_dma_chan *ioat)
+{
+	if (unlikely(ioat->pending == 2))
+		return;
+	else if (ioat2_ring_pending(ioat) > ioat_pending_level)
+		__ioat2_issue_pending(ioat);
+	else
+		ioat->pending = 1;
+}
+
+static void __ioat2_start_null_desc(struct ioat2_dma_chan *ioat)
+{
+	struct ioat_ring_ent *desc;
+	struct ioat_dma_descriptor *hw;
+	int idx;
+
+	if (ioat2_ring_space(ioat) < 1) {
+		dev_err(to_dev(&ioat->base),
+			"Unable to start null desc - ring full\n");
+		return;
+	}
+
+	dev_dbg(to_dev(&ioat->base), "%s: head: %#x tail: %#x issued: %#x\n",
+		__func__, ioat->head, ioat->tail, ioat->issued);
+	idx = ioat2_desc_alloc(ioat, 1);
+	desc = ioat2_get_ring_ent(ioat, idx);
+
+	hw = desc->hw;
+	hw->ctl = 0;
+	hw->ctl_f.null = 1;
+	hw->ctl_f.int_en = 1;
+	hw->ctl_f.compl_write = 1;
+	/* set size to non-zero value (channel returns error when size is 0) */
+	hw->size = NULL_DESC_BUFFER_SIZE;
+	hw->src_addr = 0;
+	hw->dst_addr = 0;
+	async_tx_ack(&desc->txd);
+	ioat2_set_chainaddr(ioat, desc->txd.phys);
+	dump_desc_dbg(ioat, desc);
+	__ioat2_issue_pending(ioat);
+}
+
+static void ioat2_start_null_desc(struct ioat2_dma_chan *ioat)
+{
+	spin_lock_bh(&ioat->ring_lock);
+	__ioat2_start_null_desc(ioat);
+	spin_unlock_bh(&ioat->ring_lock);
+}
+
+static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+	struct dma_async_tx_descriptor *tx;
+	struct ioat_ring_ent *desc;
+	bool seen_current = false;
+	u16 active;
+	int i;
+
+	dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n",
+		__func__, ioat->head, ioat->tail, ioat->issued);
+
+	active = ioat2_ring_active(ioat);
+	for (i = 0; i < active && !seen_current; i++) {
+		prefetch(ioat2_get_ring_ent(ioat, ioat->tail + i + 1));
+		desc = ioat2_get_ring_ent(ioat, ioat->tail + i);
+		tx = &desc->txd;
+		dump_desc_dbg(ioat, desc);
+		if (tx->cookie) {
+			ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw);
+			chan->completed_cookie = tx->cookie;
+			tx->cookie = 0;
+			if (tx->callback) {
+				tx->callback(tx->callback_param);
+				tx->callback = NULL;
+			}
+		}
+
+		if (tx->phys == phys_complete)
+			seen_current = true;
+	}
+	ioat->tail += i;
+	BUG_ON(!seen_current); /* no active descs have written a completion? */
+
+	chan->last_completion = phys_complete;
+	if (ioat->head == ioat->tail) {
+		dev_dbg(to_dev(chan), "%s: cancel completion timeout\n",
+			__func__);
+		clear_bit(IOAT_COMPLETION_PENDING, &chan->state);
+		mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
+	}
+}
+
+/**
+ * ioat2_cleanup - clean finished descriptors (advance tail pointer)
+ * @chan: ioat channel to be cleaned up
+ */
+static void ioat2_cleanup(struct ioat2_dma_chan *ioat)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+	unsigned long phys_complete;
+
+	prefetch(chan->completion);
+
+	if (!spin_trylock_bh(&chan->cleanup_lock))
+		return;
+
+	if (!ioat_cleanup_preamble(chan, &phys_complete)) {
+		spin_unlock_bh(&chan->cleanup_lock);
+		return;
+	}
+
+	if (!spin_trylock_bh(&ioat->ring_lock)) {
+		spin_unlock_bh(&chan->cleanup_lock);
+		return;
+	}
+
+	__cleanup(ioat, phys_complete);
+
+	spin_unlock_bh(&ioat->ring_lock);
+	spin_unlock_bh(&chan->cleanup_lock);
+}
+
+void ioat2_cleanup_tasklet(unsigned long data)
+{
+	struct ioat2_dma_chan *ioat = (void *) data;
+
+	ioat2_cleanup(ioat);
+	writew(IOAT_CHANCTRL_RUN, ioat->base.reg_base + IOAT_CHANCTRL_OFFSET);
+}
+
+void __ioat2_restart_chan(struct ioat2_dma_chan *ioat)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+
+	/* set the tail to be re-issued */
+	ioat->issued = ioat->tail;
+	ioat->dmacount = 0;
+	set_bit(IOAT_COMPLETION_PENDING, &chan->state);
+	mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+
+	dev_dbg(to_dev(chan),
+		"%s: head: %#x tail: %#x issued: %#x count: %#x\n",
+		__func__, ioat->head, ioat->tail, ioat->issued, ioat->dmacount);
+
+	if (ioat2_ring_pending(ioat)) {
+		struct ioat_ring_ent *desc;
+
+		desc = ioat2_get_ring_ent(ioat, ioat->tail);
+		ioat2_set_chainaddr(ioat, desc->txd.phys);
+		__ioat2_issue_pending(ioat);
+	} else
+		__ioat2_start_null_desc(ioat);
+}
+
+static void ioat2_restart_channel(struct ioat2_dma_chan *ioat)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+	unsigned long phys_complete;
+	u32 status;
+
+	status = ioat_chansts(chan);
+	if (is_ioat_active(status) || is_ioat_idle(status))
+		ioat_suspend(chan);
+	while (is_ioat_active(status) || is_ioat_idle(status)) {
+		status = ioat_chansts(chan);
+		cpu_relax();
+	}
+
+	if (ioat_cleanup_preamble(chan, &phys_complete))
+		__cleanup(ioat, phys_complete);
+
+	__ioat2_restart_chan(ioat);
+}
+
+void ioat2_timer_event(unsigned long data)
+{
+	struct ioat2_dma_chan *ioat = (void *) data;
+	struct ioat_chan_common *chan = &ioat->base;
+
+	spin_lock_bh(&chan->cleanup_lock);
+	if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) {
+		unsigned long phys_complete;
+		u64 status;
+
+		spin_lock_bh(&ioat->ring_lock);
+		status = ioat_chansts(chan);
+
+		/* when halted due to errors check for channel
+		 * programming errors before advancing the completion state
+		 */
+		if (is_ioat_halted(status)) {
+			u32 chanerr;
+
+			chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+			BUG_ON(is_ioat_bug(chanerr));
+		}
+
+		/* if we haven't made progress and we have already
+		 * acknowledged a pending completion once, then be more
+		 * forceful with a restart
+		 */
+		if (ioat_cleanup_preamble(chan, &phys_complete))
+			__cleanup(ioat, phys_complete);
+		else if (test_bit(IOAT_COMPLETION_ACK, &chan->state))
+			ioat2_restart_channel(ioat);
+		else {
+			set_bit(IOAT_COMPLETION_ACK, &chan->state);
+			mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+		}
+		spin_unlock_bh(&ioat->ring_lock);
+	} else {
+		u16 active;
+
+		/* if the ring is idle, empty, and oversized try to step
+		 * down the size
+		 */
+		spin_lock_bh(&ioat->ring_lock);
+		active = ioat2_ring_active(ioat);
+		if (active == 0 && ioat->alloc_order > ioat_get_alloc_order())
+			reshape_ring(ioat, ioat->alloc_order-1);
+		spin_unlock_bh(&ioat->ring_lock);
+
+		/* keep shrinking until we get back to our minimum
+		 * default size
+		 */
+		if (ioat->alloc_order > ioat_get_alloc_order())
+			mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
+	}
+	spin_unlock_bh(&chan->cleanup_lock);
+}
+
+/**
+ * ioat2_enumerate_channels - find and initialize the device's channels
+ * @device: the device to be enumerated
+ */
+int ioat2_enumerate_channels(struct ioatdma_device *device)
+{
+	struct ioat2_dma_chan *ioat;
+	struct device *dev = &device->pdev->dev;
+	struct dma_device *dma = &device->common;
+	u8 xfercap_log;
+	int i;
+
+	INIT_LIST_HEAD(&dma->channels);
+	dma->chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET);
+	dma->chancnt &= 0x1f; /* bits [4:0] valid */
+	if (dma->chancnt > ARRAY_SIZE(device->idx)) {
+		dev_warn(dev, "(%d) exceeds max supported channels (%zu)\n",
+			 dma->chancnt, ARRAY_SIZE(device->idx));
+		dma->chancnt = ARRAY_SIZE(device->idx);
+	}
+	xfercap_log = readb(device->reg_base + IOAT_XFERCAP_OFFSET);
+	xfercap_log &= 0x1f; /* bits [4:0] valid */
+	if (xfercap_log == 0)
+		return 0;
+	dev_dbg(dev, "%s: xfercap = %d\n", __func__, 1 << xfercap_log);
+
+	/* FIXME which i/oat version is i7300? */
+#ifdef CONFIG_I7300_IDLE_IOAT_CHANNEL
+	if (i7300_idle_platform_probe(NULL, NULL, 1) == 0)
+		dma->chancnt--;
+#endif
+	for (i = 0; i < dma->chancnt; i++) {
+		ioat = devm_kzalloc(dev, sizeof(*ioat), GFP_KERNEL);
+		if (!ioat)
+			break;
+
+		ioat_init_channel(device, &ioat->base, i,
+				  device->timer_fn,
+				  device->cleanup_tasklet,
+				  (unsigned long) ioat);
+		ioat->xfercap_log = xfercap_log;
+		spin_lock_init(&ioat->ring_lock);
+	}
+	dma->chancnt = i;
+	return i;
+}
+
+static dma_cookie_t ioat2_tx_submit_unlock(struct dma_async_tx_descriptor *tx)
+{
+	struct dma_chan *c = tx->chan;
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+	struct ioat_chan_common *chan = &ioat->base;
+	dma_cookie_t cookie = c->cookie;
+
+	cookie++;
+	if (cookie < 0)
+		cookie = 1;
+	tx->cookie = cookie;
+	c->cookie = cookie;
+	dev_dbg(to_dev(&ioat->base), "%s: cookie: %d\n", __func__, cookie);
+
+	if (!test_and_set_bit(IOAT_COMPLETION_PENDING, &chan->state))
+		mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+	ioat2_update_pending(ioat);
+	spin_unlock_bh(&ioat->ring_lock);
+
+	return cookie;
+}
+
+static struct ioat_ring_ent *ioat2_alloc_ring_ent(struct dma_chan *chan, gfp_t flags)
+{
+	struct ioat_dma_descriptor *hw;
+	struct ioat_ring_ent *desc;
+	struct ioatdma_device *dma;
+	dma_addr_t phys;
+
+	dma = to_ioatdma_device(chan->device);
+	hw = pci_pool_alloc(dma->dma_pool, flags, &phys);
+	if (!hw)
+		return NULL;
+	memset(hw, 0, sizeof(*hw));
+
+	desc = kmem_cache_alloc(ioat2_cache, flags);
+	if (!desc) {
+		pci_pool_free(dma->dma_pool, hw, phys);
+		return NULL;
+	}
+	memset(desc, 0, sizeof(*desc));
+
+	dma_async_tx_descriptor_init(&desc->txd, chan);
+	desc->txd.tx_submit = ioat2_tx_submit_unlock;
+	desc->hw = hw;
+	desc->txd.phys = phys;
+	return desc;
+}
+
+static void ioat2_free_ring_ent(struct ioat_ring_ent *desc, struct dma_chan *chan)
+{
+	struct ioatdma_device *dma;
+
+	dma = to_ioatdma_device(chan->device);
+	pci_pool_free(dma->dma_pool, desc->hw, desc->txd.phys);
+	kmem_cache_free(ioat2_cache, desc);
+}
+
+static struct ioat_ring_ent **ioat2_alloc_ring(struct dma_chan *c, int order, gfp_t flags)
+{
+	struct ioat_ring_ent **ring;
+	int descs = 1 << order;
+	int i;
+
+	if (order > ioat_get_max_alloc_order())
+		return NULL;
+
+	/* allocate the array to hold the software ring */
+	ring = kcalloc(descs, sizeof(*ring), flags);
+	if (!ring)
+		return NULL;
+	for (i = 0; i < descs; i++) {
+		ring[i] = ioat2_alloc_ring_ent(c, flags);
+		if (!ring[i]) {
+			while (i--)
+				ioat2_free_ring_ent(ring[i], c);
+			kfree(ring);
+			return NULL;
+		}
+		set_desc_id(ring[i], i);
+	}
+
+	/* link descs */
+	for (i = 0; i < descs-1; i++) {
+		struct ioat_ring_ent *next = ring[i+1];
+		struct ioat_dma_descriptor *hw = ring[i]->hw;
+
+		hw->next = next->txd.phys;
+	}
+	ring[i]->hw->next = ring[0]->txd.phys;
+
+	return ring;
+}
+
+/* ioat2_alloc_chan_resources - allocate/initialize ioat2 descriptor ring
+ * @chan: channel to be initialized
+ */
+int ioat2_alloc_chan_resources(struct dma_chan *c)
+{
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+	struct ioat_chan_common *chan = &ioat->base;
+	struct ioat_ring_ent **ring;
+	u32 chanerr;
+	int order;
+
+	/* have we already been set up? */
+	if (ioat->ring)
+		return 1 << ioat->alloc_order;
+
+	/* Setup register to interrupt and write completion status on error */
+	writew(IOAT_CHANCTRL_RUN, chan->reg_base + IOAT_CHANCTRL_OFFSET);
+
+	chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+	if (chanerr) {
+		dev_err(to_dev(chan), "CHANERR = %x, clearing\n", chanerr);
+		writel(chanerr, chan->reg_base + IOAT_CHANERR_OFFSET);
+	}
+
+	/* allocate a completion writeback area */
+	/* doing 2 32bit writes to mmio since 1 64b write doesn't work */
+	chan->completion = pci_pool_alloc(chan->device->completion_pool,
+					  GFP_KERNEL, &chan->completion_dma);
+	if (!chan->completion)
+		return -ENOMEM;
+
+	memset(chan->completion, 0, sizeof(*chan->completion));
+	writel(((u64) chan->completion_dma) & 0x00000000FFFFFFFF,
+	       chan->reg_base + IOAT_CHANCMP_OFFSET_LOW);
+	writel(((u64) chan->completion_dma) >> 32,
+	       chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH);
+
+	order = ioat_get_alloc_order();
+	ring = ioat2_alloc_ring(c, order, GFP_KERNEL);
+	if (!ring)
+		return -ENOMEM;
+
+	spin_lock_bh(&ioat->ring_lock);
+	ioat->ring = ring;
+	ioat->head = 0;
+	ioat->issued = 0;
+	ioat->tail = 0;
+	ioat->pending = 0;
+	ioat->alloc_order = order;
+	spin_unlock_bh(&ioat->ring_lock);
+
+	tasklet_enable(&chan->cleanup_task);
+	ioat2_start_null_desc(ioat);
+
+	return 1 << ioat->alloc_order;
+}
+
+bool reshape_ring(struct ioat2_dma_chan *ioat, int order)
+{
+	/* reshape differs from normal ring allocation in that we want
+	 * to allocate a new software ring while only
+	 * extending/truncating the hardware ring
+	 */
+	struct ioat_chan_common *chan = &ioat->base;
+	struct dma_chan *c = &chan->common;
+	const u16 curr_size = ioat2_ring_mask(ioat) + 1;
+	const u16 active = ioat2_ring_active(ioat);
+	const u16 new_size = 1 << order;
+	struct ioat_ring_ent **ring;
+	u16 i;
+
+	if (order > ioat_get_max_alloc_order())
+		return false;
+
+	/* double check that we have at least 1 free descriptor */
+	if (active == curr_size)
+		return false;
+
+	/* when shrinking, verify that we can hold the current active
+	 * set in the new ring
+	 */
+	if (active >= new_size)
+		return false;
+
+	/* allocate the array to hold the software ring */
+	ring = kcalloc(new_size, sizeof(*ring), GFP_NOWAIT);
+	if (!ring)
+		return false;
+
+	/* allocate/trim descriptors as needed */
+	if (new_size > curr_size) {
+		/* copy current descriptors to the new ring */
+		for (i = 0; i < curr_size; i++) {
+			u16 curr_idx = (ioat->tail+i) & (curr_size-1);
+			u16 new_idx = (ioat->tail+i) & (new_size-1);
+
+			ring[new_idx] = ioat->ring[curr_idx];
+			set_desc_id(ring[new_idx], new_idx);
+		}
+
+		/* add new descriptors to the ring */
+		for (i = curr_size; i < new_size; i++) {
+			u16 new_idx = (ioat->tail+i) & (new_size-1);
+
+			ring[new_idx] = ioat2_alloc_ring_ent(c, GFP_NOWAIT);
+			if (!ring[new_idx]) {
+				while (i--) {
+					u16 new_idx = (ioat->tail+i) & (new_size-1);
+
+					ioat2_free_ring_ent(ring[new_idx], c);
+				}
+				kfree(ring);
+				return false;
+			}
+			set_desc_id(ring[new_idx], new_idx);
+		}
+
+		/* hw link new descriptors */
+		for (i = curr_size-1; i < new_size; i++) {
+			u16 new_idx = (ioat->tail+i) & (new_size-1);
+			struct ioat_ring_ent *next = ring[(new_idx+1) & (new_size-1)];
+			struct ioat_dma_descriptor *hw = ring[new_idx]->hw;
+
+			hw->next = next->txd.phys;
+		}
+	} else {
+		struct ioat_dma_descriptor *hw;
+		struct ioat_ring_ent *next;
+
+		/* copy current descriptors to the new ring, dropping the
+		 * removed descriptors
+		 */
+		for (i = 0; i < new_size; i++) {
+			u16 curr_idx = (ioat->tail+i) & (curr_size-1);
+			u16 new_idx = (ioat->tail+i) & (new_size-1);
+
+			ring[new_idx] = ioat->ring[curr_idx];
+			set_desc_id(ring[new_idx], new_idx);
+		}
+
+		/* free deleted descriptors */
+		for (i = new_size; i < curr_size; i++) {
+			struct ioat_ring_ent *ent;
+
+			ent = ioat2_get_ring_ent(ioat, ioat->tail+i);
+			ioat2_free_ring_ent(ent, c);
+		}
+
+		/* fix up hardware ring */
+		hw = ring[(ioat->tail+new_size-1) & (new_size-1)]->hw;
+		next = ring[(ioat->tail+new_size) & (new_size-1)];
+		hw->next = next->txd.phys;
+	}
+
+	dev_dbg(to_dev(chan), "%s: allocated %d descriptors\n",
+		__func__, new_size);
+
+	kfree(ioat->ring);
+	ioat->ring = ring;
+	ioat->alloc_order = order;
+
+	return true;
+}
+
+/**
+ * ioat2_alloc_and_lock - common descriptor alloc boilerplate for ioat2,3 ops
+ * @idx: gets starting descriptor index on successful allocation
+ * @ioat: ioat2,3 channel (ring) to operate on
+ * @num_descs: allocation length
+ */
+int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+
+	spin_lock_bh(&ioat->ring_lock);
+	/* never allow the last descriptor to be consumed, we need at
+	 * least one free at all times to allow for on-the-fly ring
+	 * resizing.
+	 */
+	while (unlikely(ioat2_ring_space(ioat) <= num_descs)) {
+		if (reshape_ring(ioat, ioat->alloc_order + 1) &&
+		    ioat2_ring_space(ioat) > num_descs)
+				break;
+
+		if (printk_ratelimit())
+			dev_dbg(to_dev(chan),
+				"%s: ring full! num_descs: %d (%x:%x:%x)\n",
+				__func__, num_descs, ioat->head, ioat->tail,
+				ioat->issued);
+		spin_unlock_bh(&ioat->ring_lock);
+
+		/* progress reclaim in the allocation failure case we
+		 * may be called under bh_disabled so we need to trigger
+		 * the timer event directly
+		 */
+		spin_lock_bh(&chan->cleanup_lock);
+		if (jiffies > chan->timer.expires &&
+		    timer_pending(&chan->timer)) {
+			struct ioatdma_device *device = chan->device;
+
+			mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+			spin_unlock_bh(&chan->cleanup_lock);
+			device->timer_fn((unsigned long) ioat);
+		} else
+			spin_unlock_bh(&chan->cleanup_lock);
+		return -ENOMEM;
+	}
+
+	dev_dbg(to_dev(chan), "%s: num_descs: %d (%x:%x:%x)\n",
+		__func__, num_descs, ioat->head, ioat->tail, ioat->issued);
+
+	*idx = ioat2_desc_alloc(ioat, num_descs);
+	return 0;  /* with ioat->ring_lock held */
+}
+
+struct dma_async_tx_descriptor *
+ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest,
+			   dma_addr_t dma_src, size_t len, unsigned long flags)
+{
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+	struct ioat_dma_descriptor *hw;
+	struct ioat_ring_ent *desc;
+	dma_addr_t dst = dma_dest;
+	dma_addr_t src = dma_src;
+	size_t total_len = len;
+	int num_descs;
+	u16 idx;
+	int i;
+
+	num_descs = ioat2_xferlen_to_descs(ioat, len);
+	if (likely(num_descs) &&
+	    ioat2_alloc_and_lock(&idx, ioat, num_descs) == 0)
+		/* pass */;
+	else
+		return NULL;
+	i = 0;
+	do {
+		size_t copy = min_t(size_t, len, 1 << ioat->xfercap_log);
+
+		desc = ioat2_get_ring_ent(ioat, idx + i);
+		hw = desc->hw;
+
+		hw->size = copy;
+		hw->ctl = 0;
+		hw->src_addr = src;
+		hw->dst_addr = dst;
+
+		len -= copy;
+		dst += copy;
+		src += copy;
+		dump_desc_dbg(ioat, desc);
+	} while (++i < num_descs);
+
+	desc->txd.flags = flags;
+	desc->len = total_len;
+	hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+	hw->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
+	hw->ctl_f.compl_write = 1;
+	dump_desc_dbg(ioat, desc);
+	/* we leave the channel locked to ensure in order submission */
+
+	return &desc->txd;
+}
+
+/**
+ * ioat2_free_chan_resources - release all the descriptors
+ * @chan: the channel to be cleaned
+ */
+void ioat2_free_chan_resources(struct dma_chan *c)
+{
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+	struct ioat_chan_common *chan = &ioat->base;
+	struct ioatdma_device *device = chan->device;
+	struct ioat_ring_ent *desc;
+	const u16 total_descs = 1 << ioat->alloc_order;
+	int descs;
+	int i;
+
+	/* Before freeing channel resources first check
+	 * if they have been previously allocated for this channel.
+	 */
+	if (!ioat->ring)
+		return;
+
+	tasklet_disable(&chan->cleanup_task);
+	del_timer_sync(&chan->timer);
+	device->cleanup_tasklet((unsigned long) ioat);
+
+	/* Delay 100ms after reset to allow internal DMA logic to quiesce
+	 * before removing DMA descriptor resources.
+	 */
+	writeb(IOAT_CHANCMD_RESET,
+	       chan->reg_base + IOAT_CHANCMD_OFFSET(chan->device->version));
+	mdelay(100);
+
+	spin_lock_bh(&ioat->ring_lock);
+	descs = ioat2_ring_space(ioat);
+	dev_dbg(to_dev(chan), "freeing %d idle descriptors\n", descs);
+	for (i = 0; i < descs; i++) {
+		desc = ioat2_get_ring_ent(ioat, ioat->head + i);
+		ioat2_free_ring_ent(desc, c);
+	}
+
+	if (descs < total_descs)
+		dev_err(to_dev(chan), "Freeing %d in use descriptors!\n",
+			total_descs - descs);
+
+	for (i = 0; i < total_descs - descs; i++) {
+		desc = ioat2_get_ring_ent(ioat, ioat->tail + i);
+		dump_desc_dbg(ioat, desc);
+		ioat2_free_ring_ent(desc, c);
+	}
+
+	kfree(ioat->ring);
+	ioat->ring = NULL;
+	ioat->alloc_order = 0;
+	pci_pool_free(device->completion_pool, chan->completion,
+		      chan->completion_dma);
+	spin_unlock_bh(&ioat->ring_lock);
+
+	chan->last_completion = 0;
+	chan->completion_dma = 0;
+	ioat->pending = 0;
+	ioat->dmacount = 0;
+}
+
+enum dma_status
+ioat2_is_complete(struct dma_chan *c, dma_cookie_t cookie,
+		     dma_cookie_t *done, dma_cookie_t *used)
+{
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+	struct ioatdma_device *device = ioat->base.device;
+
+	if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS)
+		return DMA_SUCCESS;
+
+	device->cleanup_tasklet((unsigned long) ioat);
+
+	return ioat_is_complete(c, cookie, done, used);
+}
+
+static ssize_t ring_size_show(struct dma_chan *c, char *page)
+{
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+
+	return sprintf(page, "%d\n", (1 << ioat->alloc_order) & ~1);
+}
+static struct ioat_sysfs_entry ring_size_attr = __ATTR_RO(ring_size);
+
+static ssize_t ring_active_show(struct dma_chan *c, char *page)
+{
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+
+	/* ...taken outside the lock, no need to be precise */
+	return sprintf(page, "%d\n", ioat2_ring_active(ioat));
+}
+static struct ioat_sysfs_entry ring_active_attr = __ATTR_RO(ring_active);
+
+static struct attribute *ioat2_attrs[] = {
+	&ring_size_attr.attr,
+	&ring_active_attr.attr,
+	&ioat_cap_attr.attr,
+	&ioat_version_attr.attr,
+	NULL,
+};
+
+struct kobj_type ioat2_ktype = {
+	.sysfs_ops = &ioat_sysfs_ops,
+	.default_attrs = ioat2_attrs,
+};
+
+int __devinit ioat2_dma_probe(struct ioatdma_device *device, int dca)
+{
+	struct pci_dev *pdev = device->pdev;
+	struct dma_device *dma;
+	struct dma_chan *c;
+	struct ioat_chan_common *chan;
+	int err;
+
+	device->enumerate_channels = ioat2_enumerate_channels;
+	device->cleanup_tasklet = ioat2_cleanup_tasklet;
+	device->timer_fn = ioat2_timer_event;
+	device->self_test = ioat_dma_self_test;
+	dma = &device->common;
+	dma->device_prep_dma_memcpy = ioat2_dma_prep_memcpy_lock;
+	dma->device_issue_pending = ioat2_issue_pending;
+	dma->device_alloc_chan_resources = ioat2_alloc_chan_resources;
+	dma->device_free_chan_resources = ioat2_free_chan_resources;
+	dma->device_is_tx_complete = ioat2_is_complete;
+
+	err = ioat_probe(device);
+	if (err)
+		return err;
+	ioat_set_tcp_copy_break(2048);
+
+	list_for_each_entry(c, &dma->channels, device_node) {
+		chan = to_chan_common(c);
+		writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE | IOAT_DMA_DCA_ANY_CPU,
+		       chan->reg_base + IOAT_DCACTRL_OFFSET);
+	}
+
+	err = ioat_register(device);
+	if (err)
+		return err;
+
+	ioat_kobject_add(device, &ioat2_ktype);
+
+	if (dca)
+		device->dca = ioat2_dca_init(pdev, device->reg_base);
+
+	return err;
+}
diff --git a/drivers/dma/ioat/dma_v2.h b/drivers/dma/ioat/dma_v2.h
new file mode 100644
index 00000000000..1d849ef74d5
--- /dev/null
+++ b/drivers/dma/ioat/dma_v2.h
@@ -0,0 +1,190 @@
+/*
+ * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef IOATDMA_V2_H
+#define IOATDMA_V2_H
+
+#include <linux/dmaengine.h>
+#include "dma.h"
+#include "hw.h"
+
+
+extern int ioat_pending_level;
+extern int ioat_ring_alloc_order;
+
+/*
+ * workaround for IOAT ver.3.0 null descriptor issue
+ * (channel returns error when size is 0)
+ */
+#define NULL_DESC_BUFFER_SIZE 1
+
+#define IOAT_MAX_ORDER 16
+#define ioat_get_alloc_order() \
+	(min(ioat_ring_alloc_order, IOAT_MAX_ORDER))
+#define ioat_get_max_alloc_order() \
+	(min(ioat_ring_max_alloc_order, IOAT_MAX_ORDER))
+
+/* struct ioat2_dma_chan - ioat v2 / v3 channel attributes
+ * @base: common ioat channel parameters
+ * @xfercap_log; log2 of channel max transfer length (for fast division)
+ * @head: allocated index
+ * @issued: hardware notification point
+ * @tail: cleanup index
+ * @pending: lock free indicator for issued != head
+ * @dmacount: identical to 'head' except for occasionally resetting to zero
+ * @alloc_order: log2 of the number of allocated descriptors
+ * @ring: software ring buffer implementation of hardware ring
+ * @ring_lock: protects ring attributes
+ */
+struct ioat2_dma_chan {
+	struct ioat_chan_common base;
+	size_t xfercap_log;
+	u16 head;
+	u16 issued;
+	u16 tail;
+	u16 dmacount;
+	u16 alloc_order;
+	int pending;
+	struct ioat_ring_ent **ring;
+	spinlock_t ring_lock;
+};
+
+static inline struct ioat2_dma_chan *to_ioat2_chan(struct dma_chan *c)
+{
+	struct ioat_chan_common *chan = to_chan_common(c);
+
+	return container_of(chan, struct ioat2_dma_chan, base);
+}
+
+static inline u16 ioat2_ring_mask(struct ioat2_dma_chan *ioat)
+{
+	return (1 << ioat->alloc_order) - 1;
+}
+
+/* count of descriptors in flight with the engine */
+static inline u16 ioat2_ring_active(struct ioat2_dma_chan *ioat)
+{
+	return (ioat->head - ioat->tail) & ioat2_ring_mask(ioat);
+}
+
+/* count of descriptors pending submission to hardware */
+static inline u16 ioat2_ring_pending(struct ioat2_dma_chan *ioat)
+{
+	return (ioat->head - ioat->issued) & ioat2_ring_mask(ioat);
+}
+
+static inline u16 ioat2_ring_space(struct ioat2_dma_chan *ioat)
+{
+	u16 num_descs = ioat2_ring_mask(ioat) + 1;
+	u16 active = ioat2_ring_active(ioat);
+
+	BUG_ON(active > num_descs);
+
+	return num_descs - active;
+}
+
+/* assumes caller already checked space */
+static inline u16 ioat2_desc_alloc(struct ioat2_dma_chan *ioat, u16 len)
+{
+	ioat->head += len;
+	return ioat->head - len;
+}
+
+static inline u16 ioat2_xferlen_to_descs(struct ioat2_dma_chan *ioat, size_t len)
+{
+	u16 num_descs = len >> ioat->xfercap_log;
+
+	num_descs += !!(len & ((1 << ioat->xfercap_log) - 1));
+	return num_descs;
+}
+
+/**
+ * struct ioat_ring_ent - wrapper around hardware descriptor
+ * @hw: hardware DMA descriptor (for memcpy)
+ * @fill: hardware fill descriptor
+ * @xor: hardware xor descriptor
+ * @xor_ex: hardware xor extension descriptor
+ * @pq: hardware pq descriptor
+ * @pq_ex: hardware pq extension descriptor
+ * @pqu: hardware pq update descriptor
+ * @raw: hardware raw (un-typed) descriptor
+ * @txd: the generic software descriptor for all engines
+ * @len: total transaction length for unmap
+ * @result: asynchronous result of validate operations
+ * @id: identifier for debug
+ */
+
+struct ioat_ring_ent {
+	union {
+		struct ioat_dma_descriptor *hw;
+		struct ioat_fill_descriptor *fill;
+		struct ioat_xor_descriptor *xor;
+		struct ioat_xor_ext_descriptor *xor_ex;
+		struct ioat_pq_descriptor *pq;
+		struct ioat_pq_ext_descriptor *pq_ex;
+		struct ioat_pq_update_descriptor *pqu;
+		struct ioat_raw_descriptor *raw;
+	};
+	size_t len;
+	struct dma_async_tx_descriptor txd;
+	enum sum_check_flags *result;
+	#ifdef DEBUG
+	int id;
+	#endif
+};
+
+static inline struct ioat_ring_ent *
+ioat2_get_ring_ent(struct ioat2_dma_chan *ioat, u16 idx)
+{
+	return ioat->ring[idx & ioat2_ring_mask(ioat)];
+}
+
+static inline void ioat2_set_chainaddr(struct ioat2_dma_chan *ioat, u64 addr)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+
+	writel(addr & 0x00000000FFFFFFFF,
+	       chan->reg_base + IOAT2_CHAINADDR_OFFSET_LOW);
+	writel(addr >> 32,
+	       chan->reg_base + IOAT2_CHAINADDR_OFFSET_HIGH);
+}
+
+int __devinit ioat2_dma_probe(struct ioatdma_device *dev, int dca);
+int __devinit ioat3_dma_probe(struct ioatdma_device *dev, int dca);
+struct dca_provider * __devinit ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase);
+struct dca_provider * __devinit ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase);
+int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs);
+int ioat2_enumerate_channels(struct ioatdma_device *device);
+struct dma_async_tx_descriptor *
+ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest,
+			   dma_addr_t dma_src, size_t len, unsigned long flags);
+void ioat2_issue_pending(struct dma_chan *chan);
+int ioat2_alloc_chan_resources(struct dma_chan *c);
+void ioat2_free_chan_resources(struct dma_chan *c);
+enum dma_status ioat2_is_complete(struct dma_chan *c, dma_cookie_t cookie,
+				  dma_cookie_t *done, dma_cookie_t *used);
+void __ioat2_restart_chan(struct ioat2_dma_chan *ioat);
+bool reshape_ring(struct ioat2_dma_chan *ioat, int order);
+void __ioat2_issue_pending(struct ioat2_dma_chan *ioat);
+void ioat2_cleanup_tasklet(unsigned long data);
+void ioat2_timer_event(unsigned long data);
+extern struct kobj_type ioat2_ktype;
+extern struct kmem_cache *ioat2_cache;
+#endif /* IOATDMA_V2_H */
diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c
new file mode 100644
index 00000000000..35d1e33afd5
--- /dev/null
+++ b/drivers/dma/ioat/dma_v3.c
@@ -0,0 +1,1223 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2004-2009 Intel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Support routines for v3+ hardware
+ */
+
+#include <linux/pci.h>
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
+#include "registers.h"
+#include "hw.h"
+#include "dma.h"
+#include "dma_v2.h"
+
+/* ioat hardware assumes at least two sources for raid operations */
+#define src_cnt_to_sw(x) ((x) + 2)
+#define src_cnt_to_hw(x) ((x) - 2)
+
+/* provide a lookup table for setting the source address in the base or
+ * extended descriptor of an xor or pq descriptor
+ */
+static const u8 xor_idx_to_desc __read_mostly = 0xd0;
+static const u8 xor_idx_to_field[] __read_mostly = { 1, 4, 5, 6, 7, 0, 1, 2 };
+static const u8 pq_idx_to_desc __read_mostly = 0xf8;
+static const u8 pq_idx_to_field[] __read_mostly = { 1, 4, 5, 0, 1, 2, 4, 5 };
+
+static dma_addr_t xor_get_src(struct ioat_raw_descriptor *descs[2], int idx)
+{
+	struct ioat_raw_descriptor *raw = descs[xor_idx_to_desc >> idx & 1];
+
+	return raw->field[xor_idx_to_field[idx]];
+}
+
+static void xor_set_src(struct ioat_raw_descriptor *descs[2],
+			dma_addr_t addr, u32 offset, int idx)
+{
+	struct ioat_raw_descriptor *raw = descs[xor_idx_to_desc >> idx & 1];
+
+	raw->field[xor_idx_to_field[idx]] = addr + offset;
+}
+
+static dma_addr_t pq_get_src(struct ioat_raw_descriptor *descs[2], int idx)
+{
+	struct ioat_raw_descriptor *raw = descs[pq_idx_to_desc >> idx & 1];
+
+	return raw->field[pq_idx_to_field[idx]];
+}
+
+static void pq_set_src(struct ioat_raw_descriptor *descs[2],
+		       dma_addr_t addr, u32 offset, u8 coef, int idx)
+{
+	struct ioat_pq_descriptor *pq = (struct ioat_pq_descriptor *) descs[0];
+	struct ioat_raw_descriptor *raw = descs[pq_idx_to_desc >> idx & 1];
+
+	raw->field[pq_idx_to_field[idx]] = addr + offset;
+	pq->coef[idx] = coef;
+}
+
+static void ioat3_dma_unmap(struct ioat2_dma_chan *ioat,
+			    struct ioat_ring_ent *desc, int idx)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+	struct pci_dev *pdev = chan->device->pdev;
+	size_t len = desc->len;
+	size_t offset = len - desc->hw->size;
+	struct dma_async_tx_descriptor *tx = &desc->txd;
+	enum dma_ctrl_flags flags = tx->flags;
+
+	switch (desc->hw->ctl_f.op) {
+	case IOAT_OP_COPY:
+		if (!desc->hw->ctl_f.null) /* skip 'interrupt' ops */
+			ioat_dma_unmap(chan, flags, len, desc->hw);
+		break;
+	case IOAT_OP_FILL: {
+		struct ioat_fill_descriptor *hw = desc->fill;
+
+		if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP))
+			ioat_unmap(pdev, hw->dst_addr - offset, len,
+				   PCI_DMA_FROMDEVICE, flags, 1);
+		break;
+	}
+	case IOAT_OP_XOR_VAL:
+	case IOAT_OP_XOR: {
+		struct ioat_xor_descriptor *xor = desc->xor;
+		struct ioat_ring_ent *ext;
+		struct ioat_xor_ext_descriptor *xor_ex = NULL;
+		int src_cnt = src_cnt_to_sw(xor->ctl_f.src_cnt);
+		struct ioat_raw_descriptor *descs[2];
+		int i;
+
+		if (src_cnt > 5) {
+			ext = ioat2_get_ring_ent(ioat, idx + 1);
+			xor_ex = ext->xor_ex;
+		}
+
+		if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+			descs[0] = (struct ioat_raw_descriptor *) xor;
+			descs[1] = (struct ioat_raw_descriptor *) xor_ex;
+			for (i = 0; i < src_cnt; i++) {
+				dma_addr_t src = xor_get_src(descs, i);
+
+				ioat_unmap(pdev, src - offset, len,
+					   PCI_DMA_TODEVICE, flags, 0);
+			}
+
+			/* dest is a source in xor validate operations */
+			if (xor->ctl_f.op == IOAT_OP_XOR_VAL) {
+				ioat_unmap(pdev, xor->dst_addr - offset, len,
+					   PCI_DMA_TODEVICE, flags, 1);
+				break;
+			}
+		}
+
+		if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP))
+			ioat_unmap(pdev, xor->dst_addr - offset, len,
+				   PCI_DMA_FROMDEVICE, flags, 1);
+		break;
+	}
+	case IOAT_OP_PQ_VAL:
+	case IOAT_OP_PQ: {
+		struct ioat_pq_descriptor *pq = desc->pq;
+		struct ioat_ring_ent *ext;
+		struct ioat_pq_ext_descriptor *pq_ex = NULL;
+		int src_cnt = src_cnt_to_sw(pq->ctl_f.src_cnt);
+		struct ioat_raw_descriptor *descs[2];
+		int i;
+
+		if (src_cnt > 3) {
+			ext = ioat2_get_ring_ent(ioat, idx + 1);
+			pq_ex = ext->pq_ex;
+		}
+
+		/* in the 'continue' case don't unmap the dests as sources */
+		if (dmaf_p_disabled_continue(flags))
+			src_cnt--;
+		else if (dmaf_continue(flags))
+			src_cnt -= 3;
+
+		if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+			descs[0] = (struct ioat_raw_descriptor *) pq;
+			descs[1] = (struct ioat_raw_descriptor *) pq_ex;
+			for (i = 0; i < src_cnt; i++) {
+				dma_addr_t src = pq_get_src(descs, i);
+
+				ioat_unmap(pdev, src - offset, len,
+					   PCI_DMA_TODEVICE, flags, 0);
+			}
+
+			/* the dests are sources in pq validate operations */
+			if (pq->ctl_f.op == IOAT_OP_XOR_VAL) {
+				if (!(flags & DMA_PREP_PQ_DISABLE_P))
+					ioat_unmap(pdev, pq->p_addr - offset,
+						   len, PCI_DMA_TODEVICE, flags, 0);
+				if (!(flags & DMA_PREP_PQ_DISABLE_Q))
+					ioat_unmap(pdev, pq->q_addr - offset,
+						   len, PCI_DMA_TODEVICE, flags, 0);
+				break;
+			}
+		}
+
+		if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
+			if (!(flags & DMA_PREP_PQ_DISABLE_P))
+				ioat_unmap(pdev, pq->p_addr - offset, len,
+					   PCI_DMA_BIDIRECTIONAL, flags, 1);
+			if (!(flags & DMA_PREP_PQ_DISABLE_Q))
+				ioat_unmap(pdev, pq->q_addr - offset, len,
+					   PCI_DMA_BIDIRECTIONAL, flags, 1);
+		}
+		break;
+	}
+	default:
+		dev_err(&pdev->dev, "%s: unknown op type: %#x\n",
+			__func__, desc->hw->ctl_f.op);
+	}
+}
+
+static bool desc_has_ext(struct ioat_ring_ent *desc)
+{
+	struct ioat_dma_descriptor *hw = desc->hw;
+
+	if (hw->ctl_f.op == IOAT_OP_XOR ||
+	    hw->ctl_f.op == IOAT_OP_XOR_VAL) {
+		struct ioat_xor_descriptor *xor = desc->xor;
+
+		if (src_cnt_to_sw(xor->ctl_f.src_cnt) > 5)
+			return true;
+	} else if (hw->ctl_f.op == IOAT_OP_PQ ||
+		   hw->ctl_f.op == IOAT_OP_PQ_VAL) {
+		struct ioat_pq_descriptor *pq = desc->pq;
+
+		if (src_cnt_to_sw(pq->ctl_f.src_cnt) > 3)
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * __cleanup - reclaim used descriptors
+ * @ioat: channel (ring) to clean
+ *
+ * The difference from the dma_v2.c __cleanup() is that this routine
+ * handles extended descriptors and dma-unmapping raid operations.
+ */
+static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+	struct ioat_ring_ent *desc;
+	bool seen_current = false;
+	u16 active;
+	int i;
+
+	dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n",
+		__func__, ioat->head, ioat->tail, ioat->issued);
+
+	active = ioat2_ring_active(ioat);
+	for (i = 0; i < active && !seen_current; i++) {
+		struct dma_async_tx_descriptor *tx;
+
+		prefetch(ioat2_get_ring_ent(ioat, ioat->tail + i + 1));
+		desc = ioat2_get_ring_ent(ioat, ioat->tail + i);
+		dump_desc_dbg(ioat, desc);
+		tx = &desc->txd;
+		if (tx->cookie) {
+			chan->completed_cookie = tx->cookie;
+			ioat3_dma_unmap(ioat, desc, ioat->tail + i);
+			tx->cookie = 0;
+			if (tx->callback) {
+				tx->callback(tx->callback_param);
+				tx->callback = NULL;
+			}
+		}
+
+		if (tx->phys == phys_complete)
+			seen_current = true;
+
+		/* skip extended descriptors */
+		if (desc_has_ext(desc)) {
+			BUG_ON(i + 1 >= active);
+			i++;
+		}
+	}
+	ioat->tail += i;
+	BUG_ON(!seen_current); /* no active descs have written a completion? */
+	chan->last_completion = phys_complete;
+	if (ioat->head == ioat->tail) {
+		dev_dbg(to_dev(chan), "%s: cancel completion timeout\n",
+			__func__);
+		clear_bit(IOAT_COMPLETION_PENDING, &chan->state);
+		mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
+	}
+}
+
+static void ioat3_cleanup(struct ioat2_dma_chan *ioat)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+	unsigned long phys_complete;
+
+	prefetch(chan->completion);
+
+	if (!spin_trylock_bh(&chan->cleanup_lock))
+		return;
+
+	if (!ioat_cleanup_preamble(chan, &phys_complete)) {
+		spin_unlock_bh(&chan->cleanup_lock);
+		return;
+	}
+
+	if (!spin_trylock_bh(&ioat->ring_lock)) {
+		spin_unlock_bh(&chan->cleanup_lock);
+		return;
+	}
+
+	__cleanup(ioat, phys_complete);
+
+	spin_unlock_bh(&ioat->ring_lock);
+	spin_unlock_bh(&chan->cleanup_lock);
+}
+
+static void ioat3_cleanup_tasklet(unsigned long data)
+{
+	struct ioat2_dma_chan *ioat = (void *) data;
+
+	ioat3_cleanup(ioat);
+	writew(IOAT_CHANCTRL_RUN | IOAT3_CHANCTRL_COMPL_DCA_EN,
+	       ioat->base.reg_base + IOAT_CHANCTRL_OFFSET);
+}
+
+static void ioat3_restart_channel(struct ioat2_dma_chan *ioat)
+{
+	struct ioat_chan_common *chan = &ioat->base;
+	unsigned long phys_complete;
+	u32 status;
+
+	status = ioat_chansts(chan);
+	if (is_ioat_active(status) || is_ioat_idle(status))
+		ioat_suspend(chan);
+	while (is_ioat_active(status) || is_ioat_idle(status)) {
+		status = ioat_chansts(chan);
+		cpu_relax();
+	}
+
+	if (ioat_cleanup_preamble(chan, &phys_complete))
+		__cleanup(ioat, phys_complete);
+
+	__ioat2_restart_chan(ioat);
+}
+
+static void ioat3_timer_event(unsigned long data)
+{
+	struct ioat2_dma_chan *ioat = (void *) data;
+	struct ioat_chan_common *chan = &ioat->base;
+
+	spin_lock_bh(&chan->cleanup_lock);
+	if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) {
+		unsigned long phys_complete;
+		u64 status;
+
+		spin_lock_bh(&ioat->ring_lock);
+		status = ioat_chansts(chan);
+
+		/* when halted due to errors check for channel
+		 * programming errors before advancing the completion state
+		 */
+		if (is_ioat_halted(status)) {
+			u32 chanerr;
+
+			chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+			BUG_ON(is_ioat_bug(chanerr));
+		}
+
+		/* if we haven't made progress and we have already
+		 * acknowledged a pending completion once, then be more
+		 * forceful with a restart
+		 */
+		if (ioat_cleanup_preamble(chan, &phys_complete))
+			__cleanup(ioat, phys_complete);
+		else if (test_bit(IOAT_COMPLETION_ACK, &chan->state))
+			ioat3_restart_channel(ioat);
+		else {
+			set_bit(IOAT_COMPLETION_ACK, &chan->state);
+			mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+		}
+		spin_unlock_bh(&ioat->ring_lock);
+	} else {
+		u16 active;
+
+		/* if the ring is idle, empty, and oversized try to step
+		 * down the size
+		 */
+		spin_lock_bh(&ioat->ring_lock);
+		active = ioat2_ring_active(ioat);
+		if (active == 0 && ioat->alloc_order > ioat_get_alloc_order())
+			reshape_ring(ioat, ioat->alloc_order-1);
+		spin_unlock_bh(&ioat->ring_lock);
+
+		/* keep shrinking until we get back to our minimum
+		 * default size
+		 */
+		if (ioat->alloc_order > ioat_get_alloc_order())
+			mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
+	}
+	spin_unlock_bh(&chan->cleanup_lock);
+}
+
+static enum dma_status
+ioat3_is_complete(struct dma_chan *c, dma_cookie_t cookie,
+		  dma_cookie_t *done, dma_cookie_t *used)
+{
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+
+	if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS)
+		return DMA_SUCCESS;
+
+	ioat3_cleanup(ioat);
+
+	return ioat_is_complete(c, cookie, done, used);
+}
+
+static struct dma_async_tx_descriptor *
+ioat3_prep_memset_lock(struct dma_chan *c, dma_addr_t dest, int value,
+		       size_t len, unsigned long flags)
+{
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+	struct ioat_ring_ent *desc;
+	size_t total_len = len;
+	struct ioat_fill_descriptor *fill;
+	int num_descs;
+	u64 src_data = (0x0101010101010101ULL) * (value & 0xff);
+	u16 idx;
+	int i;
+
+	num_descs = ioat2_xferlen_to_descs(ioat, len);
+	if (likely(num_descs) &&
+	    ioat2_alloc_and_lock(&idx, ioat, num_descs) == 0)
+		/* pass */;
+	else
+		return NULL;
+	i = 0;
+	do {
+		size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log);
+
+		desc = ioat2_get_ring_ent(ioat, idx + i);
+		fill = desc->fill;
+
+		fill->size = xfer_size;
+		fill->src_data = src_data;
+		fill->dst_addr = dest;
+		fill->ctl = 0;
+		fill->ctl_f.op = IOAT_OP_FILL;
+
+		len -= xfer_size;
+		dest += xfer_size;
+		dump_desc_dbg(ioat, desc);
+	} while (++i < num_descs);
+
+	desc->txd.flags = flags;
+	desc->len = total_len;
+	fill->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+	fill->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
+	fill->ctl_f.compl_write = 1;
+	dump_desc_dbg(ioat, desc);
+
+	/* we leave the channel locked to ensure in order submission */
+	return &desc->txd;
+}
+
+static struct dma_async_tx_descriptor *
+__ioat3_prep_xor_lock(struct dma_chan *c, enum sum_check_flags *result,
+		      dma_addr_t dest, dma_addr_t *src, unsigned int src_cnt,
+		      size_t len, unsigned long flags)
+{
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+	struct ioat_ring_ent *compl_desc;
+	struct ioat_ring_ent *desc;
+	struct ioat_ring_ent *ext;
+	size_t total_len = len;
+	struct ioat_xor_descriptor *xor;
+	struct ioat_xor_ext_descriptor *xor_ex = NULL;
+	struct ioat_dma_descriptor *hw;
+	u32 offset = 0;
+	int num_descs;
+	int with_ext;
+	int i;
+	u16 idx;
+	u8 op = result ? IOAT_OP_XOR_VAL : IOAT_OP_XOR;
+
+	BUG_ON(src_cnt < 2);
+
+	num_descs = ioat2_xferlen_to_descs(ioat, len);
+	/* we need 2x the number of descriptors to cover greater than 5
+	 * sources
+	 */
+	if (src_cnt > 5) {
+		with_ext = 1;
+		num_descs *= 2;
+	} else
+		with_ext = 0;
+
+	/* completion writes from the raid engine may pass completion
+	 * writes from the legacy engine, so we need one extra null
+	 * (legacy) descriptor to ensure all completion writes arrive in
+	 * order.
+	 */
+	if (likely(num_descs) &&
+	    ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0)
+		/* pass */;
+	else
+		return NULL;
+	i = 0;
+	do {
+		struct ioat_raw_descriptor *descs[2];
+		size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log);
+		int s;
+
+		desc = ioat2_get_ring_ent(ioat, idx + i);
+		xor = desc->xor;
+
+		/* save a branch by unconditionally retrieving the
+		 * extended descriptor xor_set_src() knows to not write
+		 * to it in the single descriptor case
+		 */
+		ext = ioat2_get_ring_ent(ioat, idx + i + 1);
+		xor_ex = ext->xor_ex;
+
+		descs[0] = (struct ioat_raw_descriptor *) xor;
+		descs[1] = (struct ioat_raw_descriptor *) xor_ex;
+		for (s = 0; s < src_cnt; s++)
+			xor_set_src(descs, src[s], offset, s);
+		xor->size = xfer_size;
+		xor->dst_addr = dest + offset;
+		xor->ctl = 0;
+		xor->ctl_f.op = op;
+		xor->ctl_f.src_cnt = src_cnt_to_hw(src_cnt);
+
+		len -= xfer_size;
+		offset += xfer_size;
+		dump_desc_dbg(ioat, desc);
+	} while ((i += 1 + with_ext) < num_descs);
+
+	/* last xor descriptor carries the unmap parameters and fence bit */
+	desc->txd.flags = flags;
+	desc->len = total_len;
+	if (result)
+		desc->result = result;
+	xor->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
+
+	/* completion descriptor carries interrupt bit */
+	compl_desc = ioat2_get_ring_ent(ioat, idx + i);
+	compl_desc->txd.flags = flags & DMA_PREP_INTERRUPT;
+	hw = compl_desc->hw;
+	hw->ctl = 0;
+	hw->ctl_f.null = 1;
+	hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+	hw->ctl_f.compl_write = 1;
+	hw->size = NULL_DESC_BUFFER_SIZE;
+	dump_desc_dbg(ioat, compl_desc);
+
+	/* we leave the channel locked to ensure in order submission */
+	return &desc->txd;
+}
+
+static struct dma_async_tx_descriptor *
+ioat3_prep_xor(struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
+	       unsigned int src_cnt, size_t len, unsigned long flags)
+{
+	return __ioat3_prep_xor_lock(chan, NULL, dest, src, src_cnt, len, flags);
+}
+
+struct dma_async_tx_descriptor *
+ioat3_prep_xor_val(struct dma_chan *chan, dma_addr_t *src,
+		    unsigned int src_cnt, size_t len,
+		    enum sum_check_flags *result, unsigned long flags)
+{
+	/* the cleanup routine only sets bits on validate failure, it
+	 * does not clear bits on validate success... so clear it here
+	 */
+	*result = 0;
+
+	return __ioat3_prep_xor_lock(chan, result, src[0], &src[1],
+				     src_cnt - 1, len, flags);
+}
+
+static void
+dump_pq_desc_dbg(struct ioat2_dma_chan *ioat, struct ioat_ring_ent *desc, struct ioat_ring_ent *ext)
+{
+	struct device *dev = to_dev(&ioat->base);
+	struct ioat_pq_descriptor *pq = desc->pq;
+	struct ioat_pq_ext_descriptor *pq_ex = ext ? ext->pq_ex : NULL;
+	struct ioat_raw_descriptor *descs[] = { (void *) pq, (void *) pq_ex };
+	int src_cnt = src_cnt_to_sw(pq->ctl_f.src_cnt);
+	int i;
+
+	dev_dbg(dev, "desc[%d]: (%#llx->%#llx) flags: %#x"
+		" sz: %#x ctl: %#x (op: %d int: %d compl: %d pq: '%s%s' src_cnt: %d)\n",
+		desc_id(desc), (unsigned long long) desc->txd.phys,
+		(unsigned long long) (pq_ex ? pq_ex->next : pq->next),
+		desc->txd.flags, pq->size, pq->ctl, pq->ctl_f.op, pq->ctl_f.int_en,
+		pq->ctl_f.compl_write,
+		pq->ctl_f.p_disable ? "" : "p", pq->ctl_f.q_disable ? "" : "q",
+		pq->ctl_f.src_cnt);
+	for (i = 0; i < src_cnt; i++)
+		dev_dbg(dev, "\tsrc[%d]: %#llx coef: %#x\n", i,
+			(unsigned long long) pq_get_src(descs, i), pq->coef[i]);
+	dev_dbg(dev, "\tP: %#llx\n", pq->p_addr);
+	dev_dbg(dev, "\tQ: %#llx\n", pq->q_addr);
+}
+
+static struct dma_async_tx_descriptor *
+__ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result,
+		     const dma_addr_t *dst, const dma_addr_t *src,
+		     unsigned int src_cnt, const unsigned char *scf,
+		     size_t len, unsigned long flags)
+{
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+	struct ioat_chan_common *chan = &ioat->base;
+	struct ioat_ring_ent *compl_desc;
+	struct ioat_ring_ent *desc;
+	struct ioat_ring_ent *ext;
+	size_t total_len = len;
+	struct ioat_pq_descriptor *pq;
+	struct ioat_pq_ext_descriptor *pq_ex = NULL;
+	struct ioat_dma_descriptor *hw;
+	u32 offset = 0;
+	int num_descs;
+	int with_ext;
+	int i, s;
+	u16 idx;
+	u8 op = result ? IOAT_OP_PQ_VAL : IOAT_OP_PQ;
+
+	dev_dbg(to_dev(chan), "%s\n", __func__);
+	/* the engine requires at least two sources (we provide
+	 * at least 1 implied source in the DMA_PREP_CONTINUE case)
+	 */
+	BUG_ON(src_cnt + dmaf_continue(flags) < 2);
+
+	num_descs = ioat2_xferlen_to_descs(ioat, len);
+	/* we need 2x the number of descriptors to cover greater than 3
+	 * sources
+	 */
+	if (src_cnt > 3 || flags & DMA_PREP_CONTINUE) {
+		with_ext = 1;
+		num_descs *= 2;
+	} else
+		with_ext = 0;
+
+	/* completion writes from the raid engine may pass completion
+	 * writes from the legacy engine, so we need one extra null
+	 * (legacy) descriptor to ensure all completion writes arrive in
+	 * order.
+	 */
+	if (likely(num_descs) &&
+	    ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0)
+		/* pass */;
+	else
+		return NULL;
+	i = 0;
+	do {
+		struct ioat_raw_descriptor *descs[2];
+		size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log);
+
+		desc = ioat2_get_ring_ent(ioat, idx + i);
+		pq = desc->pq;
+
+		/* save a branch by unconditionally retrieving the
+		 * extended descriptor pq_set_src() knows to not write
+		 * to it in the single descriptor case
+		 */
+		ext = ioat2_get_ring_ent(ioat, idx + i + with_ext);
+		pq_ex = ext->pq_ex;
+
+		descs[0] = (struct ioat_raw_descriptor *) pq;
+		descs[1] = (struct ioat_raw_descriptor *) pq_ex;
+
+		for (s = 0; s < src_cnt; s++)
+			pq_set_src(descs, src[s], offset, scf[s], s);
+
+		/* see the comment for dma_maxpq in include/linux/dmaengine.h */
+		if (dmaf_p_disabled_continue(flags))
+			pq_set_src(descs, dst[1], offset, 1, s++);
+		else if (dmaf_continue(flags)) {
+			pq_set_src(descs, dst[0], offset, 0, s++);
+			pq_set_src(descs, dst[1], offset, 1, s++);
+			pq_set_src(descs, dst[1], offset, 0, s++);
+		}
+		pq->size = xfer_size;
+		pq->p_addr = dst[0] + offset;
+		pq->q_addr = dst[1] + offset;
+		pq->ctl = 0;
+		pq->ctl_f.op = op;
+		pq->ctl_f.src_cnt = src_cnt_to_hw(s);
+		pq->ctl_f.p_disable = !!(flags & DMA_PREP_PQ_DISABLE_P);
+		pq->ctl_f.q_disable = !!(flags & DMA_PREP_PQ_DISABLE_Q);
+
+		len -= xfer_size;
+		offset += xfer_size;
+	} while ((i += 1 + with_ext) < num_descs);
+
+	/* last pq descriptor carries the unmap parameters and fence bit */
+	desc->txd.flags = flags;
+	desc->len = total_len;
+	if (result)
+		desc->result = result;
+	pq->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
+	dump_pq_desc_dbg(ioat, desc, ext);
+
+	/* completion descriptor carries interrupt bit */
+	compl_desc = ioat2_get_ring_ent(ioat, idx + i);
+	compl_desc->txd.flags = flags & DMA_PREP_INTERRUPT;
+	hw = compl_desc->hw;
+	hw->ctl = 0;
+	hw->ctl_f.null = 1;
+	hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+	hw->ctl_f.compl_write = 1;
+	hw->size = NULL_DESC_BUFFER_SIZE;
+	dump_desc_dbg(ioat, compl_desc);
+
+	/* we leave the channel locked to ensure in order submission */
+	return &desc->txd;
+}
+
+static struct dma_async_tx_descriptor *
+ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
+	      unsigned int src_cnt, const unsigned char *scf, size_t len,
+	      unsigned long flags)
+{
+	/* handle the single source multiply case from the raid6
+	 * recovery path
+	 */
+	if (unlikely((flags & DMA_PREP_PQ_DISABLE_P) && src_cnt == 1)) {
+		dma_addr_t single_source[2];
+		unsigned char single_source_coef[2];
+
+		BUG_ON(flags & DMA_PREP_PQ_DISABLE_Q);
+		single_source[0] = src[0];
+		single_source[1] = src[0];
+		single_source_coef[0] = scf[0];
+		single_source_coef[1] = 0;
+
+		return __ioat3_prep_pq_lock(chan, NULL, dst, single_source, 2,
+					    single_source_coef, len, flags);
+	} else
+		return __ioat3_prep_pq_lock(chan, NULL, dst, src, src_cnt, scf,
+					    len, flags);
+}
+
+struct dma_async_tx_descriptor *
+ioat3_prep_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
+		  unsigned int src_cnt, const unsigned char *scf, size_t len,
+		  enum sum_check_flags *pqres, unsigned long flags)
+{
+	/* the cleanup routine only sets bits on validate failure, it
+	 * does not clear bits on validate success... so clear it here
+	 */
+	*pqres = 0;
+
+	return __ioat3_prep_pq_lock(chan, pqres, pq, src, src_cnt, scf, len,
+				    flags);
+}
+
+static struct dma_async_tx_descriptor *
+ioat3_prep_pqxor(struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src,
+		 unsigned int src_cnt, size_t len, unsigned long flags)
+{
+	unsigned char scf[src_cnt];
+	dma_addr_t pq[2];
+
+	memset(scf, 0, src_cnt);
+	flags |= DMA_PREP_PQ_DISABLE_Q;
+	pq[0] = dst;
+	pq[1] = ~0;
+
+	return __ioat3_prep_pq_lock(chan, NULL, pq, src, src_cnt, scf, len,
+				    flags);
+}
+
+struct dma_async_tx_descriptor *
+ioat3_prep_pqxor_val(struct dma_chan *chan, dma_addr_t *src,
+		     unsigned int src_cnt, size_t len,
+		     enum sum_check_flags *result, unsigned long flags)
+{
+	unsigned char scf[src_cnt];
+	dma_addr_t pq[2];
+
+	/* the cleanup routine only sets bits on validate failure, it
+	 * does not clear bits on validate success... so clear it here
+	 */
+	*result = 0;
+
+	memset(scf, 0, src_cnt);
+	flags |= DMA_PREP_PQ_DISABLE_Q;
+	pq[0] = src[0];
+	pq[1] = ~0;
+
+	return __ioat3_prep_pq_lock(chan, result, pq, &src[1], src_cnt - 1, scf,
+				    len, flags);
+}
+
+static struct dma_async_tx_descriptor *
+ioat3_prep_interrupt_lock(struct dma_chan *c, unsigned long flags)
+{
+	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+	struct ioat_ring_ent *desc;
+	struct ioat_dma_descriptor *hw;
+	u16 idx;
+
+	if (ioat2_alloc_and_lock(&idx, ioat, 1) == 0)
+		desc = ioat2_get_ring_ent(ioat, idx);
+	else
+		return NULL;
+
+	hw = desc->hw;
+	hw->ctl = 0;
+	hw->ctl_f.null = 1;
+	hw->ctl_f.int_en = 1;
+	hw->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
+	hw->ctl_f.compl_write = 1;
+	hw->size = NULL_DESC_BUFFER_SIZE;
+	hw->src_addr = 0;
+	hw->dst_addr = 0;
+
+	desc->txd.flags = flags;
+	desc->len = 1;
+
+	dump_desc_dbg(ioat, desc);
+
+	/* we leave the channel locked to ensure in order submission */
+	return &desc->txd;
+}
+
+static void __devinit ioat3_dma_test_callback(void *dma_async_param)
+{
+	struct completion *cmp = dma_async_param;
+
+	complete(cmp);
+}
+
+#define IOAT_NUM_SRC_TEST 6 /* must be <= 8 */
+static int __devinit ioat_xor_val_self_test(struct ioatdma_device *device)
+{
+	int i, src_idx;
+	struct page *dest;
+	struct page *xor_srcs[IOAT_NUM_SRC_TEST];
+	struct page *xor_val_srcs[IOAT_NUM_SRC_TEST + 1];
+	dma_addr_t dma_srcs[IOAT_NUM_SRC_TEST + 1];
+	dma_addr_t dma_addr, dest_dma;
+	struct dma_async_tx_descriptor *tx;
+	struct dma_chan *dma_chan;
+	dma_cookie_t cookie;
+	u8 cmp_byte = 0;
+	u32 cmp_word;
+	u32 xor_val_result;
+	int err = 0;
+	struct completion cmp;
+	unsigned long tmo;
+	struct device *dev = &device->pdev->dev;
+	struct dma_device *dma = &device->common;
+
+	dev_dbg(dev, "%s\n", __func__);
+
+	if (!dma_has_cap(DMA_XOR, dma->cap_mask))
+		return 0;
+
+	for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++) {
+		xor_srcs[src_idx] = alloc_page(GFP_KERNEL);
+		if (!xor_srcs[src_idx]) {
+			while (src_idx--)
+				__free_page(xor_srcs[src_idx]);
+			return -ENOMEM;
+		}
+	}
+
+	dest = alloc_page(GFP_KERNEL);
+	if (!dest) {
+		while (src_idx--)
+			__free_page(xor_srcs[src_idx]);
+		return -ENOMEM;
+	}
+
+	/* Fill in src buffers */
+	for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++) {
+		u8 *ptr = page_address(xor_srcs[src_idx]);
+		for (i = 0; i < PAGE_SIZE; i++)
+			ptr[i] = (1 << src_idx);
+	}
+
+	for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++)
+		cmp_byte ^= (u8) (1 << src_idx);
+
+	cmp_word = (cmp_byte << 24) | (cmp_byte << 16) |
+			(cmp_byte << 8) | cmp_byte;
+
+	memset(page_address(dest), 0, PAGE_SIZE);
+
+	dma_chan = container_of(dma->channels.next, struct dma_chan,
+				device_node);
+	if (dma->device_alloc_chan_resources(dma_chan) < 1) {
+		err = -ENODEV;
+		goto out;
+	}
+
+	/* test xor */
+	dest_dma = dma_map_page(dev, dest, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+	for (i = 0; i < IOAT_NUM_SRC_TEST; i++)
+		dma_srcs[i] = dma_map_page(dev, xor_srcs[i], 0, PAGE_SIZE,
+					   DMA_TO_DEVICE);
+	tx = dma->device_prep_dma_xor(dma_chan, dest_dma, dma_srcs,
+				      IOAT_NUM_SRC_TEST, PAGE_SIZE,
+				      DMA_PREP_INTERRUPT);
+
+	if (!tx) {
+		dev_err(dev, "Self-test xor prep failed\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	async_tx_ack(tx);
+	init_completion(&cmp);
+	tx->callback = ioat3_dma_test_callback;
+	tx->callback_param = &cmp;
+	cookie = tx->tx_submit(tx);
+	if (cookie < 0) {
+		dev_err(dev, "Self-test xor setup failed\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+	dma->device_issue_pending(dma_chan);
+
+	tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
+
+	if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+		dev_err(dev, "Self-test xor timed out\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	dma_sync_single_for_cpu(dev, dest_dma, PAGE_SIZE, DMA_FROM_DEVICE);
+	for (i = 0; i < (PAGE_SIZE / sizeof(u32)); i++) {
+		u32 *ptr = page_address(dest);
+		if (ptr[i] != cmp_word) {
+			dev_err(dev, "Self-test xor failed compare\n");
+			err = -ENODEV;
+			goto free_resources;
+		}
+	}
+	dma_sync_single_for_device(dev, dest_dma, PAGE_SIZE, DMA_TO_DEVICE);
+
+	/* skip validate if the capability is not present */
+	if (!dma_has_cap(DMA_XOR_VAL, dma_chan->device->cap_mask))
+		goto free_resources;
+
+	/* validate the sources with the destintation page */
+	for (i = 0; i < IOAT_NUM_SRC_TEST; i++)
+		xor_val_srcs[i] = xor_srcs[i];
+	xor_val_srcs[i] = dest;
+
+	xor_val_result = 1;
+
+	for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++)
+		dma_srcs[i] = dma_map_page(dev, xor_val_srcs[i], 0, PAGE_SIZE,
+					   DMA_TO_DEVICE);
+	tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs,
+					  IOAT_NUM_SRC_TEST + 1, PAGE_SIZE,
+					  &xor_val_result, DMA_PREP_INTERRUPT);
+	if (!tx) {
+		dev_err(dev, "Self-test zero prep failed\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	async_tx_ack(tx);
+	init_completion(&cmp);
+	tx->callback = ioat3_dma_test_callback;
+	tx->callback_param = &cmp;
+	cookie = tx->tx_submit(tx);
+	if (cookie < 0) {
+		dev_err(dev, "Self-test zero setup failed\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+	dma->device_issue_pending(dma_chan);
+
+	tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
+
+	if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+		dev_err(dev, "Self-test validate timed out\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	if (xor_val_result != 0) {
+		dev_err(dev, "Self-test validate failed compare\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	/* skip memset if the capability is not present */
+	if (!dma_has_cap(DMA_MEMSET, dma_chan->device->cap_mask))
+		goto free_resources;
+
+	/* test memset */
+	dma_addr = dma_map_page(dev, dest, 0,
+			PAGE_SIZE, DMA_FROM_DEVICE);
+	tx = dma->device_prep_dma_memset(dma_chan, dma_addr, 0, PAGE_SIZE,
+					 DMA_PREP_INTERRUPT);
+	if (!tx) {
+		dev_err(dev, "Self-test memset prep failed\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	async_tx_ack(tx);
+	init_completion(&cmp);
+	tx->callback = ioat3_dma_test_callback;
+	tx->callback_param = &cmp;
+	cookie = tx->tx_submit(tx);
+	if (cookie < 0) {
+		dev_err(dev, "Self-test memset setup failed\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+	dma->device_issue_pending(dma_chan);
+
+	tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
+
+	if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+		dev_err(dev, "Self-test memset timed out\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	for (i = 0; i < PAGE_SIZE/sizeof(u32); i++) {
+		u32 *ptr = page_address(dest);
+		if (ptr[i]) {
+			dev_err(dev, "Self-test memset failed compare\n");
+			err = -ENODEV;
+			goto free_resources;
+		}
+	}
+
+	/* test for non-zero parity sum */
+	xor_val_result = 0;
+	for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++)
+		dma_srcs[i] = dma_map_page(dev, xor_val_srcs[i], 0, PAGE_SIZE,
+					   DMA_TO_DEVICE);
+	tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs,
+					  IOAT_NUM_SRC_TEST + 1, PAGE_SIZE,
+					  &xor_val_result, DMA_PREP_INTERRUPT);
+	if (!tx) {
+		dev_err(dev, "Self-test 2nd zero prep failed\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	async_tx_ack(tx);
+	init_completion(&cmp);
+	tx->callback = ioat3_dma_test_callback;
+	tx->callback_param = &cmp;
+	cookie = tx->tx_submit(tx);
+	if (cookie < 0) {
+		dev_err(dev, "Self-test  2nd zero setup failed\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+	dma->device_issue_pending(dma_chan);
+
+	tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
+
+	if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+		dev_err(dev, "Self-test 2nd validate timed out\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	if (xor_val_result != SUM_CHECK_P_RESULT) {
+		dev_err(dev, "Self-test validate failed compare\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+free_resources:
+	dma->device_free_chan_resources(dma_chan);
+out:
+	src_idx = IOAT_NUM_SRC_TEST;
+	while (src_idx--)
+		__free_page(xor_srcs[src_idx]);
+	__free_page(dest);
+	return err;
+}
+
+static int __devinit ioat3_dma_self_test(struct ioatdma_device *device)
+{
+	int rc = ioat_dma_self_test(device);
+
+	if (rc)
+		return rc;
+
+	rc = ioat_xor_val_self_test(device);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+int __devinit ioat3_dma_probe(struct ioatdma_device *device, int dca)
+{
+	struct pci_dev *pdev = device->pdev;
+	struct dma_device *dma;
+	struct dma_chan *c;
+	struct ioat_chan_common *chan;
+	bool is_raid_device = false;
+	int err;
+	u16 dev_id;
+	u32 cap;
+
+	device->enumerate_channels = ioat2_enumerate_channels;
+	device->self_test = ioat3_dma_self_test;
+	dma = &device->common;
+	dma->device_prep_dma_memcpy = ioat2_dma_prep_memcpy_lock;
+	dma->device_issue_pending = ioat2_issue_pending;
+	dma->device_alloc_chan_resources = ioat2_alloc_chan_resources;
+	dma->device_free_chan_resources = ioat2_free_chan_resources;
+
+	dma_cap_set(DMA_INTERRUPT, dma->cap_mask);
+	dma->device_prep_dma_interrupt = ioat3_prep_interrupt_lock;
+
+	cap = readl(device->reg_base + IOAT_DMA_CAP_OFFSET);
+	if (cap & IOAT_CAP_XOR) {
+		is_raid_device = true;
+		dma->max_xor = 8;
+		dma->xor_align = 2;
+
+		dma_cap_set(DMA_XOR, dma->cap_mask);
+		dma->device_prep_dma_xor = ioat3_prep_xor;
+
+		dma_cap_set(DMA_XOR_VAL, dma->cap_mask);
+		dma->device_prep_dma_xor_val = ioat3_prep_xor_val;
+	}
+	if (cap & IOAT_CAP_PQ) {
+		is_raid_device = true;
+		dma_set_maxpq(dma, 8, 0);
+		dma->pq_align = 2;
+
+		dma_cap_set(DMA_PQ, dma->cap_mask);
+		dma->device_prep_dma_pq = ioat3_prep_pq;
+
+		dma_cap_set(DMA_PQ_VAL, dma->cap_mask);
+		dma->device_prep_dma_pq_val = ioat3_prep_pq_val;
+
+		if (!(cap & IOAT_CAP_XOR)) {
+			dma->max_xor = 8;
+			dma->xor_align = 2;
+
+			dma_cap_set(DMA_XOR, dma->cap_mask);
+			dma->device_prep_dma_xor = ioat3_prep_pqxor;
+
+			dma_cap_set(DMA_XOR_VAL, dma->cap_mask);
+			dma->device_prep_dma_xor_val = ioat3_prep_pqxor_val;
+		}
+	}
+	if (is_raid_device && (cap & IOAT_CAP_FILL_BLOCK)) {
+		dma_cap_set(DMA_MEMSET, dma->cap_mask);
+		dma->device_prep_dma_memset = ioat3_prep_memset_lock;
+	}
+
+
+	if (is_raid_device) {
+		dma->device_is_tx_complete = ioat3_is_complete;
+		device->cleanup_tasklet = ioat3_cleanup_tasklet;
+		device->timer_fn = ioat3_timer_event;
+	} else {
+		dma->device_is_tx_complete = ioat2_is_complete;
+		device->cleanup_tasklet = ioat2_cleanup_tasklet;
+		device->timer_fn = ioat2_timer_event;
+	}
+
+	/* -= IOAT ver.3 workarounds =- */
+	/* Write CHANERRMSK_INT with 3E07h to mask out the errors
+	 * that can cause stability issues for IOAT ver.3
+	 */
+	pci_write_config_dword(pdev, IOAT_PCI_CHANERRMASK_INT_OFFSET, 0x3e07);
+
+	/* Clear DMAUNCERRSTS Cfg-Reg Parity Error status bit
+	 * (workaround for spurious config parity error after restart)
+	 */
+	pci_read_config_word(pdev, IOAT_PCI_DEVICE_ID_OFFSET, &dev_id);
+	if (dev_id == PCI_DEVICE_ID_INTEL_IOAT_TBG0)
+		pci_write_config_dword(pdev, IOAT_PCI_DMAUNCERRSTS_OFFSET, 0x10);
+
+	err = ioat_probe(device);
+	if (err)
+		return err;
+	ioat_set_tcp_copy_break(262144);
+
+	list_for_each_entry(c, &dma->channels, device_node) {
+		chan = to_chan_common(c);
+		writel(IOAT_DMA_DCA_ANY_CPU,
+		       chan->reg_base + IOAT_DCACTRL_OFFSET);
+	}
+
+	err = ioat_register(device);
+	if (err)
+		return err;
+
+	ioat_kobject_add(device, &ioat2_ktype);
+
+	if (dca)
+		device->dca = ioat3_dca_init(pdev, device->reg_base);
+
+	return 0;
+}
diff --git a/drivers/dma/ioat/hw.h b/drivers/dma/ioat/hw.h
new file mode 100644
index 00000000000..99afb12bd40
--- /dev/null
+++ b/drivers/dma/ioat/hw.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef _IOAT_HW_H_
+#define _IOAT_HW_H_
+
+/* PCI Configuration Space Values */
+#define IOAT_PCI_VID            0x8086
+#define IOAT_MMIO_BAR		0
+
+/* CB device ID's */
+#define IOAT_PCI_DID_5000       0x1A38
+#define IOAT_PCI_DID_CNB        0x360B
+#define IOAT_PCI_DID_SCNB       0x65FF
+#define IOAT_PCI_DID_SNB        0x402F
+
+#define IOAT_PCI_RID            0x00
+#define IOAT_PCI_SVID           0x8086
+#define IOAT_PCI_SID            0x8086
+#define IOAT_VER_1_2            0x12    /* Version 1.2 */
+#define IOAT_VER_2_0            0x20    /* Version 2.0 */
+#define IOAT_VER_3_0            0x30    /* Version 3.0 */
+#define IOAT_VER_3_2            0x32    /* Version 3.2 */
+
+struct ioat_dma_descriptor {
+	uint32_t	size;
+	union {
+		uint32_t ctl;
+		struct {
+			unsigned int int_en:1;
+			unsigned int src_snoop_dis:1;
+			unsigned int dest_snoop_dis:1;
+			unsigned int compl_write:1;
+			unsigned int fence:1;
+			unsigned int null:1;
+			unsigned int src_brk:1;
+			unsigned int dest_brk:1;
+			unsigned int bundle:1;
+			unsigned int dest_dca:1;
+			unsigned int hint:1;
+			unsigned int rsvd2:13;
+			#define IOAT_OP_COPY 0x00
+			unsigned int op:8;
+		} ctl_f;
+	};
+	uint64_t	src_addr;
+	uint64_t	dst_addr;
+	uint64_t	next;
+	uint64_t	rsv1;
+	uint64_t	rsv2;
+	/* store some driver data in an unused portion of the descriptor */
+	union {
+		uint64_t	user1;
+		uint64_t	tx_cnt;
+	};
+	uint64_t	user2;
+};
+
+struct ioat_fill_descriptor {
+	uint32_t	size;
+	union {
+		uint32_t ctl;
+		struct {
+			unsigned int int_en:1;
+			unsigned int rsvd:1;
+			unsigned int dest_snoop_dis:1;
+			unsigned int compl_write:1;
+			unsigned int fence:1;
+			unsigned int rsvd2:2;
+			unsigned int dest_brk:1;
+			unsigned int bundle:1;
+			unsigned int rsvd4:15;
+			#define IOAT_OP_FILL 0x01
+			unsigned int op:8;
+		} ctl_f;
+	};
+	uint64_t	src_data;
+	uint64_t	dst_addr;
+	uint64_t	next;
+	uint64_t	rsv1;
+	uint64_t	next_dst_addr;
+	uint64_t	user1;
+	uint64_t	user2;
+};
+
+struct ioat_xor_descriptor {
+	uint32_t	size;
+	union {
+		uint32_t ctl;
+		struct {
+			unsigned int int_en:1;
+			unsigned int src_snoop_dis:1;
+			unsigned int dest_snoop_dis:1;
+			unsigned int compl_write:1;
+			unsigned int fence:1;
+			unsigned int src_cnt:3;
+			unsigned int bundle:1;
+			unsigned int dest_dca:1;
+			unsigned int hint:1;
+			unsigned int rsvd:13;
+			#define IOAT_OP_XOR 0x87
+			#define IOAT_OP_XOR_VAL 0x88
+			unsigned int op:8;
+		} ctl_f;
+	};
+	uint64_t	src_addr;
+	uint64_t	dst_addr;
+	uint64_t	next;
+	uint64_t	src_addr2;
+	uint64_t	src_addr3;
+	uint64_t	src_addr4;
+	uint64_t	src_addr5;
+};
+
+struct ioat_xor_ext_descriptor {
+	uint64_t	src_addr6;
+	uint64_t	src_addr7;
+	uint64_t	src_addr8;
+	uint64_t	next;
+	uint64_t	rsvd[4];
+};
+
+struct ioat_pq_descriptor {
+	uint32_t	size;
+	union {
+		uint32_t ctl;
+		struct {
+			unsigned int int_en:1;
+			unsigned int src_snoop_dis:1;
+			unsigned int dest_snoop_dis:1;
+			unsigned int compl_write:1;
+			unsigned int fence:1;
+			unsigned int src_cnt:3;
+			unsigned int bundle:1;
+			unsigned int dest_dca:1;
+			unsigned int hint:1;
+			unsigned int p_disable:1;
+			unsigned int q_disable:1;
+			unsigned int rsvd:11;
+			#define IOAT_OP_PQ 0x89
+			#define IOAT_OP_PQ_VAL 0x8a
+			unsigned int op:8;
+		} ctl_f;
+	};
+	uint64_t	src_addr;
+	uint64_t	p_addr;
+	uint64_t	next;
+	uint64_t	src_addr2;
+	uint64_t	src_addr3;
+	uint8_t		coef[8];
+	uint64_t	q_addr;
+};
+
+struct ioat_pq_ext_descriptor {
+	uint64_t	src_addr4;
+	uint64_t	src_addr5;
+	uint64_t	src_addr6;
+	uint64_t	next;
+	uint64_t	src_addr7;
+	uint64_t	src_addr8;
+	uint64_t	rsvd[2];
+};
+
+struct ioat_pq_update_descriptor {
+	uint32_t	size;
+	union {
+		uint32_t ctl;
+		struct {
+			unsigned int int_en:1;
+			unsigned int src_snoop_dis:1;
+			unsigned int dest_snoop_dis:1;
+			unsigned int compl_write:1;
+			unsigned int fence:1;
+			unsigned int src_cnt:3;
+			unsigned int bundle:1;
+			unsigned int dest_dca:1;
+			unsigned int hint:1;
+			unsigned int p_disable:1;
+			unsigned int q_disable:1;
+			unsigned int rsvd:3;
+			unsigned int coef:8;
+			#define IOAT_OP_PQ_UP 0x8b
+			unsigned int op:8;
+		} ctl_f;
+	};
+	uint64_t	src_addr;
+	uint64_t	p_addr;
+	uint64_t	next;
+	uint64_t	src_addr2;
+	uint64_t	p_src;
+	uint64_t	q_src;
+	uint64_t	q_addr;
+};
+
+struct ioat_raw_descriptor {
+	uint64_t	field[8];
+};
+#endif
diff --git a/drivers/dma/ioat/pci.c b/drivers/dma/ioat/pci.c
new file mode 100644
index 00000000000..d545fae30f3
--- /dev/null
+++ b/drivers/dma/ioat/pci.c
@@ -0,0 +1,210 @@
+/*
+ * Intel I/OAT DMA Linux driver
+ * Copyright(c) 2007 - 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ */
+
+/*
+ * This driver supports an Intel I/OAT DMA engine, which does asynchronous
+ * copy operations.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/dca.h>
+#include "dma.h"
+#include "dma_v2.h"
+#include "registers.h"
+#include "hw.h"
+
+MODULE_VERSION(IOAT_DMA_VERSION);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Intel Corporation");
+
+static struct pci_device_id ioat_pci_tbl[] = {
+	/* I/OAT v1 platforms */
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB)  },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_SCNB) },
+	{ PCI_VDEVICE(UNISYS, PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) },
+
+	/* I/OAT v2 platforms */
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB) },
+
+	/* I/OAT v3 platforms */
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG0) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG1) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG2) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG3) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG4) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG5) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG6) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG7) },
+
+	/* I/OAT v3.2 platforms */
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF0) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF1) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF2) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF3) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF4) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF5) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF6) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF7) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF8) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF9) },
+
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, ioat_pci_tbl);
+
+static int __devinit ioat_pci_probe(struct pci_dev *pdev,
+				    const struct pci_device_id *id);
+static void __devexit ioat_remove(struct pci_dev *pdev);
+
+static int ioat_dca_enabled = 1;
+module_param(ioat_dca_enabled, int, 0644);
+MODULE_PARM_DESC(ioat_dca_enabled, "control support of dca service (default: 1)");
+
+struct kmem_cache *ioat2_cache;
+
+#define DRV_NAME "ioatdma"
+
+static struct pci_driver ioat_pci_driver = {
+	.name		= DRV_NAME,
+	.id_table	= ioat_pci_tbl,
+	.probe		= ioat_pci_probe,
+	.remove		= __devexit_p(ioat_remove),
+};
+
+static struct ioatdma_device *
+alloc_ioatdma(struct pci_dev *pdev, void __iomem *iobase)
+{
+	struct device *dev = &pdev->dev;
+	struct ioatdma_device *d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL);
+
+	if (!d)
+		return NULL;
+	d->pdev = pdev;
+	d->reg_base = iobase;
+	return d;
+}
+
+static int __devinit ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	void __iomem * const *iomap;
+	struct device *dev = &pdev->dev;
+	struct ioatdma_device *device;
+	int err;
+
+	err = pcim_enable_device(pdev);
+	if (err)
+		return err;
+
+	err = pcim_iomap_regions(pdev, 1 << IOAT_MMIO_BAR, DRV_NAME);
+	if (err)
+		return err;
+	iomap = pcim_iomap_table(pdev);
+	if (!iomap)
+		return -ENOMEM;
+
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err)
+		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+	if (err)
+		return err;
+
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err)
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+	if (err)
+		return err;
+
+	device = devm_kzalloc(dev, sizeof(*device), GFP_KERNEL);
+	if (!device)
+		return -ENOMEM;
+
+	pci_set_master(pdev);
+
+	device = alloc_ioatdma(pdev, iomap[IOAT_MMIO_BAR]);
+	if (!device)
+		return -ENOMEM;
+	pci_set_drvdata(pdev, device);
+
+	device->version = readb(device->reg_base + IOAT_VER_OFFSET);
+	if (device->version == IOAT_VER_1_2)
+		err = ioat1_dma_probe(device, ioat_dca_enabled);
+	else if (device->version == IOAT_VER_2_0)
+		err = ioat2_dma_probe(device, ioat_dca_enabled);
+	else if (device->version >= IOAT_VER_3_0)
+		err = ioat3_dma_probe(device, ioat_dca_enabled);
+	else
+		return -ENODEV;
+
+	if (err) {
+		dev_err(dev, "Intel(R) I/OAT DMA Engine init failed\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static void __devexit ioat_remove(struct pci_dev *pdev)
+{
+	struct ioatdma_device *device = pci_get_drvdata(pdev);
+
+	if (!device)
+		return;
+
+	dev_err(&pdev->dev, "Removing dma and dca services\n");
+	if (device->dca) {
+		unregister_dca_provider(device->dca, &pdev->dev);
+		free_dca_provider(device->dca);
+		device->dca = NULL;
+	}
+	ioat_dma_remove(device);
+}
+
+static int __init ioat_init_module(void)
+{
+	int err;
+
+	pr_info("%s: Intel(R) QuickData Technology Driver %s\n",
+		DRV_NAME, IOAT_DMA_VERSION);
+
+	ioat2_cache = kmem_cache_create("ioat2", sizeof(struct ioat_ring_ent),
+					0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!ioat2_cache)
+		return -ENOMEM;
+
+	err = pci_register_driver(&ioat_pci_driver);
+	if (err)
+		kmem_cache_destroy(ioat2_cache);
+
+	return err;
+}
+module_init(ioat_init_module);
+
+static void __exit ioat_exit_module(void)
+{
+	pci_unregister_driver(&ioat_pci_driver);
+	kmem_cache_destroy(ioat2_cache);
+}
+module_exit(ioat_exit_module);
diff --git a/drivers/dma/ioatdma_registers.h b/drivers/dma/ioat/registers.h
index 49bc277424f..63038e18ab0 100644
--- a/drivers/dma/ioatdma_registers.h
+++ b/drivers/dma/ioat/registers.h
@@ -64,18 +64,37 @@
 
 #define IOAT_DEVICE_STATUS_OFFSET		0x0E	/* 16-bit */
 #define IOAT_DEVICE_STATUS_DEGRADED_MODE	0x0001
+#define IOAT_DEVICE_MMIO_RESTRICTED		0x0002
+#define IOAT_DEVICE_MEMORY_BYPASS		0x0004
+#define IOAT_DEVICE_ADDRESS_REMAPPING		0x0008
+
+#define IOAT_DMA_CAP_OFFSET			0x10	/* 32-bit */
+#define IOAT_CAP_PAGE_BREAK			0x00000001
+#define IOAT_CAP_CRC				0x00000002
+#define IOAT_CAP_SKIP_MARKER			0x00000004
+#define IOAT_CAP_DCA				0x00000010
+#define IOAT_CAP_CRC_MOVE			0x00000020
+#define IOAT_CAP_FILL_BLOCK			0x00000040
+#define IOAT_CAP_APIC				0x00000080
+#define IOAT_CAP_XOR				0x00000100
+#define IOAT_CAP_PQ				0x00000200
 
 #define IOAT_CHANNEL_MMIO_SIZE			0x80	/* Each Channel MMIO space is this size */
 
 /* DMA Channel Registers */
 #define IOAT_CHANCTRL_OFFSET			0x00	/* 16-bit Channel Control Register */
 #define IOAT_CHANCTRL_CHANNEL_PRIORITY_MASK	0xF000
+#define IOAT3_CHANCTRL_COMPL_DCA_EN		0x0200
 #define IOAT_CHANCTRL_CHANNEL_IN_USE		0x0100
 #define IOAT_CHANCTRL_DESCRIPTOR_ADDR_SNOOP_CONTROL	0x0020
 #define IOAT_CHANCTRL_ERR_INT_EN		0x0010
 #define IOAT_CHANCTRL_ANY_ERR_ABORT_EN		0x0008
 #define IOAT_CHANCTRL_ERR_COMPLETION_EN		0x0004
-#define IOAT_CHANCTRL_INT_DISABLE		0x0001
+#define IOAT_CHANCTRL_INT_REARM			0x0001
+#define IOAT_CHANCTRL_RUN			(IOAT_CHANCTRL_INT_REARM |\
+						 IOAT_CHANCTRL_ERR_COMPLETION_EN |\
+						 IOAT_CHANCTRL_ANY_ERR_ABORT_EN |\
+						 IOAT_CHANCTRL_ERR_INT_EN)
 
 #define IOAT_DMA_COMP_OFFSET			0x02	/* 16-bit DMA channel compatibility */
 #define IOAT_DMA_COMP_V1			0x0001	/* Compatibility with DMA version 1 */
@@ -94,14 +113,14 @@
 #define IOAT2_CHANSTS_OFFSET_HIGH	0x0C
 #define IOAT_CHANSTS_OFFSET_HIGH(ver)		((ver) < IOAT_VER_2_0 \
 						? IOAT1_CHANSTS_OFFSET_HIGH : IOAT2_CHANSTS_OFFSET_HIGH)
-#define IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR	~0x3F
-#define IOAT_CHANSTS_SOFT_ERR			0x0000000000000010
-#define IOAT_CHANSTS_UNAFFILIATED_ERR		0x0000000000000008
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS	0x0000000000000007
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE	0x0
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE	0x1
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_SUSPENDED	0x2
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED	0x3
+#define IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR	(~0x3fULL)
+#define IOAT_CHANSTS_SOFT_ERR			0x10ULL
+#define IOAT_CHANSTS_UNAFFILIATED_ERR		0x8ULL
+#define IOAT_CHANSTS_STATUS	0x7ULL
+#define IOAT_CHANSTS_ACTIVE	0x0
+#define IOAT_CHANSTS_DONE	0x1
+#define IOAT_CHANSTS_SUSPENDED	0x2
+#define IOAT_CHANSTS_HALTED	0x3
 
 
 
@@ -204,22 +223,27 @@
 #define IOAT_CDAR_OFFSET_HIGH			0x24
 
 #define IOAT_CHANERR_OFFSET			0x28	/* 32-bit Channel Error Register */
-#define IOAT_CHANERR_DMA_TRANSFER_SRC_ADDR_ERR	0x0001
-#define IOAT_CHANERR_DMA_TRANSFER_DEST_ADDR_ERR	0x0002
-#define IOAT_CHANERR_NEXT_DESCRIPTOR_ADDR_ERR	0x0004
-#define IOAT_CHANERR_NEXT_DESCRIPTOR_ALIGNMENT_ERR	0x0008
+#define IOAT_CHANERR_SRC_ADDR_ERR	0x0001
+#define IOAT_CHANERR_DEST_ADDR_ERR	0x0002
+#define IOAT_CHANERR_NEXT_ADDR_ERR	0x0004
+#define IOAT_CHANERR_NEXT_DESC_ALIGN_ERR	0x0008
 #define IOAT_CHANERR_CHAIN_ADDR_VALUE_ERR	0x0010
 #define IOAT_CHANERR_CHANCMD_ERR		0x0020
 #define IOAT_CHANERR_CHIPSET_UNCORRECTABLE_DATA_INTEGRITY_ERR	0x0040
 #define IOAT_CHANERR_DMA_UNCORRECTABLE_DATA_INTEGRITY_ERR	0x0080
 #define IOAT_CHANERR_READ_DATA_ERR		0x0100
 #define IOAT_CHANERR_WRITE_DATA_ERR		0x0200
-#define IOAT_CHANERR_DESCRIPTOR_CONTROL_ERR	0x0400
-#define IOAT_CHANERR_DESCRIPTOR_LENGTH_ERR	0x0800
+#define IOAT_CHANERR_CONTROL_ERR	0x0400
+#define IOAT_CHANERR_LENGTH_ERR	0x0800
 #define IOAT_CHANERR_COMPLETION_ADDR_ERR	0x1000
 #define IOAT_CHANERR_INT_CONFIGURATION_ERR	0x2000
 #define IOAT_CHANERR_SOFT_ERR			0x4000
 #define IOAT_CHANERR_UNAFFILIATED_ERR		0x8000
+#define IOAT_CHANERR_XOR_P_OR_CRC_ERR		0x10000
+#define IOAT_CHANERR_XOR_Q_ERR			0x20000
+#define IOAT_CHANERR_DESCRIPTOR_COUNT_ERR	0x40000
+
+#define IOAT_CHANERR_HANDLE_MASK (IOAT_CHANERR_XOR_P_OR_CRC_ERR | IOAT_CHANERR_XOR_Q_ERR)
 
 #define IOAT_CHANERR_MASK_OFFSET		0x2C	/* 32-bit Channel Error Register */
 
diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
deleted file mode 100644
index a600fc0f796..00000000000
--- a/drivers/dma/ioat_dma.c
+++ /dev/null
@@ -1,1741 +0,0 @@
-/*
- * Intel I/OAT DMA Linux driver
- * Copyright(c) 2004 - 2009 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- */
-
-/*
- * This driver supports an Intel I/OAT DMA engine, which does asynchronous
- * copy operations.
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include <linux/dmaengine.h>
-#include <linux/delay.h>
-#include <linux/dma-mapping.h>
-#include <linux/workqueue.h>
-#include <linux/i7300_idle.h>
-#include "ioatdma.h"
-#include "ioatdma_registers.h"
-#include "ioatdma_hw.h"
-
-#define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common)
-#define to_ioatdma_device(dev) container_of(dev, struct ioatdma_device, common)
-#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node)
-#define tx_to_ioat_desc(tx) container_of(tx, struct ioat_desc_sw, async_tx)
-
-#define chan_num(ch) ((int)((ch)->reg_base - (ch)->device->reg_base) / 0x80)
-static int ioat_pending_level = 4;
-module_param(ioat_pending_level, int, 0644);
-MODULE_PARM_DESC(ioat_pending_level,
-		 "high-water mark for pushing ioat descriptors (default: 4)");
-
-#define RESET_DELAY  msecs_to_jiffies(100)
-#define WATCHDOG_DELAY  round_jiffies(msecs_to_jiffies(2000))
-static void ioat_dma_chan_reset_part2(struct work_struct *work);
-static void ioat_dma_chan_watchdog(struct work_struct *work);
-
-/*
- * workaround for IOAT ver.3.0 null descriptor issue
- * (channel returns error when size is 0)
- */
-#define NULL_DESC_BUFFER_SIZE 1
-
-/* internal functions */
-static void ioat_dma_start_null_desc(struct ioat_dma_chan *ioat_chan);
-static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan);
-
-static struct ioat_desc_sw *
-ioat1_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan);
-static struct ioat_desc_sw *
-ioat2_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan);
-
-static inline struct ioat_dma_chan *ioat_lookup_chan_by_index(
-						struct ioatdma_device *device,
-						int index)
-{
-	return device->idx[index];
-}
-
-/**
- * ioat_dma_do_interrupt - handler used for single vector interrupt mode
- * @irq: interrupt id
- * @data: interrupt data
- */
-static irqreturn_t ioat_dma_do_interrupt(int irq, void *data)
-{
-	struct ioatdma_device *instance = data;
-	struct ioat_dma_chan *ioat_chan;
-	unsigned long attnstatus;
-	int bit;
-	u8 intrctrl;
-
-	intrctrl = readb(instance->reg_base + IOAT_INTRCTRL_OFFSET);
-
-	if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN))
-		return IRQ_NONE;
-
-	if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) {
-		writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
-		return IRQ_NONE;
-	}
-
-	attnstatus = readl(instance->reg_base + IOAT_ATTNSTATUS_OFFSET);
-	for_each_bit(bit, &attnstatus, BITS_PER_LONG) {
-		ioat_chan = ioat_lookup_chan_by_index(instance, bit);
-		tasklet_schedule(&ioat_chan->cleanup_task);
-	}
-
-	writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
-	return IRQ_HANDLED;
-}
-
-/**
- * ioat_dma_do_interrupt_msix - handler used for vector-per-channel interrupt mode
- * @irq: interrupt id
- * @data: interrupt data
- */
-static irqreturn_t ioat_dma_do_interrupt_msix(int irq, void *data)
-{
-	struct ioat_dma_chan *ioat_chan = data;
-
-	tasklet_schedule(&ioat_chan->cleanup_task);
-
-	return IRQ_HANDLED;
-}
-
-static void ioat_dma_cleanup_tasklet(unsigned long data);
-
-/**
- * ioat_dma_enumerate_channels - find and initialize the device's channels
- * @device: the device to be enumerated
- */
-static int ioat_dma_enumerate_channels(struct ioatdma_device *device)
-{
-	u8 xfercap_scale;
-	u32 xfercap;
-	int i;
-	struct ioat_dma_chan *ioat_chan;
-
-	/*
-	 * IOAT ver.3 workarounds
-	 */
-	if (device->version == IOAT_VER_3_0) {
-		u32 chan_err_mask;
-		u16 dev_id;
-		u32 dmauncerrsts;
-
-		/*
-		 * Write CHANERRMSK_INT with 3E07h to mask out the errors
-		 * that can cause stability issues for IOAT ver.3
-		 */
-		chan_err_mask = 0x3E07;
-		pci_write_config_dword(device->pdev,
-			IOAT_PCI_CHANERRMASK_INT_OFFSET,
-			chan_err_mask);
-
-		/*
-		 * Clear DMAUNCERRSTS Cfg-Reg Parity Error status bit
-		 * (workaround for spurious config parity error after restart)
-		 */
-		pci_read_config_word(device->pdev,
-			IOAT_PCI_DEVICE_ID_OFFSET,
-			&dev_id);
-		if (dev_id == PCI_DEVICE_ID_INTEL_IOAT_TBG0) {
-			dmauncerrsts = 0x10;
-			pci_write_config_dword(device->pdev,
-				IOAT_PCI_DMAUNCERRSTS_OFFSET,
-				dmauncerrsts);
-		}
-	}
-
-	device->common.chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET);
-	xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET);
-	xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));
-
-#ifdef  CONFIG_I7300_IDLE_IOAT_CHANNEL
-	if (i7300_idle_platform_probe(NULL, NULL, 1) == 0) {
-		device->common.chancnt--;
-	}
-#endif
-	for (i = 0; i < device->common.chancnt; i++) {
-		ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL);
-		if (!ioat_chan) {
-			device->common.chancnt = i;
-			break;
-		}
-
-		ioat_chan->device = device;
-		ioat_chan->reg_base = device->reg_base + (0x80 * (i + 1));
-		ioat_chan->xfercap = xfercap;
-		ioat_chan->desccount = 0;
-		INIT_DELAYED_WORK(&ioat_chan->work, ioat_dma_chan_reset_part2);
-		if (ioat_chan->device->version == IOAT_VER_2_0)
-			writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE |
-			       IOAT_DMA_DCA_ANY_CPU,
-			       ioat_chan->reg_base + IOAT_DCACTRL_OFFSET);
-		else if (ioat_chan->device->version == IOAT_VER_3_0)
-			writel(IOAT_DMA_DCA_ANY_CPU,
-			       ioat_chan->reg_base + IOAT_DCACTRL_OFFSET);
-		spin_lock_init(&ioat_chan->cleanup_lock);
-		spin_lock_init(&ioat_chan->desc_lock);
-		INIT_LIST_HEAD(&ioat_chan->free_desc);
-		INIT_LIST_HEAD(&ioat_chan->used_desc);
-		/* This should be made common somewhere in dmaengine.c */
-		ioat_chan->common.device = &device->common;
-		list_add_tail(&ioat_chan->common.device_node,
-			      &device->common.channels);
-		device->idx[i] = ioat_chan;
-		tasklet_init(&ioat_chan->cleanup_task,
-			     ioat_dma_cleanup_tasklet,
-			     (unsigned long) ioat_chan);
-		tasklet_disable(&ioat_chan->cleanup_task);
-	}
-	return device->common.chancnt;
-}
-
-/**
- * ioat_dma_memcpy_issue_pending - push potentially unrecognized appended
- *                                 descriptors to hw
- * @chan: DMA channel handle
- */
-static inline void __ioat1_dma_memcpy_issue_pending(
-						struct ioat_dma_chan *ioat_chan)
-{
-	ioat_chan->pending = 0;
-	writeb(IOAT_CHANCMD_APPEND, ioat_chan->reg_base + IOAT1_CHANCMD_OFFSET);
-}
-
-static void ioat1_dma_memcpy_issue_pending(struct dma_chan *chan)
-{
-	struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-
-	if (ioat_chan->pending > 0) {
-		spin_lock_bh(&ioat_chan->desc_lock);
-		__ioat1_dma_memcpy_issue_pending(ioat_chan);
-		spin_unlock_bh(&ioat_chan->desc_lock);
-	}
-}
-
-static inline void __ioat2_dma_memcpy_issue_pending(
-						struct ioat_dma_chan *ioat_chan)
-{
-	ioat_chan->pending = 0;
-	writew(ioat_chan->dmacount,
-	       ioat_chan->reg_base + IOAT_CHAN_DMACOUNT_OFFSET);
-}
-
-static void ioat2_dma_memcpy_issue_pending(struct dma_chan *chan)
-{
-	struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-
-	if (ioat_chan->pending > 0) {
-		spin_lock_bh(&ioat_chan->desc_lock);
-		__ioat2_dma_memcpy_issue_pending(ioat_chan);
-		spin_unlock_bh(&ioat_chan->desc_lock);
-	}
-}
-
-
-/**
- * ioat_dma_chan_reset_part2 - reinit the channel after a reset
- */
-static void ioat_dma_chan_reset_part2(struct work_struct *work)
-{
-	struct ioat_dma_chan *ioat_chan =
-		container_of(work, struct ioat_dma_chan, work.work);
-	struct ioat_desc_sw *desc;
-
-	spin_lock_bh(&ioat_chan->cleanup_lock);
-	spin_lock_bh(&ioat_chan->desc_lock);
-
-	ioat_chan->completion_virt->low = 0;
-	ioat_chan->completion_virt->high = 0;
-	ioat_chan->pending = 0;
-
-	/*
-	 * count the descriptors waiting, and be sure to do it
-	 * right for both the CB1 line and the CB2 ring
-	 */
-	ioat_chan->dmacount = 0;
-	if (ioat_chan->used_desc.prev) {
-		desc = to_ioat_desc(ioat_chan->used_desc.prev);
-		do {
-			ioat_chan->dmacount++;
-			desc = to_ioat_desc(desc->node.next);
-		} while (&desc->node != ioat_chan->used_desc.next);
-	}
-
-	/*
-	 * write the new starting descriptor address
-	 * this puts channel engine into ARMED state
-	 */
-	desc = to_ioat_desc(ioat_chan->used_desc.prev);
-	switch (ioat_chan->device->version) {
-	case IOAT_VER_1_2:
-		writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
-		       ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_LOW);
-		writel(((u64) desc->async_tx.phys) >> 32,
-		       ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_HIGH);
-
-		writeb(IOAT_CHANCMD_START, ioat_chan->reg_base
-			+ IOAT_CHANCMD_OFFSET(ioat_chan->device->version));
-		break;
-	case IOAT_VER_2_0:
-		writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
-		       ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_LOW);
-		writel(((u64) desc->async_tx.phys) >> 32,
-		       ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_HIGH);
-
-		/* tell the engine to go with what's left to be done */
-		writew(ioat_chan->dmacount,
-		       ioat_chan->reg_base + IOAT_CHAN_DMACOUNT_OFFSET);
-
-		break;
-	}
-	dev_err(&ioat_chan->device->pdev->dev,
-		"chan%d reset - %d descs waiting, %d total desc\n",
-		chan_num(ioat_chan), ioat_chan->dmacount, ioat_chan->desccount);
-
-	spin_unlock_bh(&ioat_chan->desc_lock);
-	spin_unlock_bh(&ioat_chan->cleanup_lock);
-}
-
-/**
- * ioat_dma_reset_channel - restart a channel
- * @ioat_chan: IOAT DMA channel handle
- */
-static void ioat_dma_reset_channel(struct ioat_dma_chan *ioat_chan)
-{
-	u32 chansts, chanerr;
-
-	if (!ioat_chan->used_desc.prev)
-		return;
-
-	chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
-	chansts = (ioat_chan->completion_virt->low
-					& IOAT_CHANSTS_DMA_TRANSFER_STATUS);
-	if (chanerr) {
-		dev_err(&ioat_chan->device->pdev->dev,
-			"chan%d, CHANSTS = 0x%08x CHANERR = 0x%04x, clearing\n",
-			chan_num(ioat_chan), chansts, chanerr);
-		writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
-	}
-
-	/*
-	 * whack it upside the head with a reset
-	 * and wait for things to settle out.
-	 * force the pending count to a really big negative
-	 * to make sure no one forces an issue_pending
-	 * while we're waiting.
-	 */
-
-	spin_lock_bh(&ioat_chan->desc_lock);
-	ioat_chan->pending = INT_MIN;
-	writeb(IOAT_CHANCMD_RESET,
-	       ioat_chan->reg_base
-	       + IOAT_CHANCMD_OFFSET(ioat_chan->device->version));
-	spin_unlock_bh(&ioat_chan->desc_lock);
-
-	/* schedule the 2nd half instead of sleeping a long time */
-	schedule_delayed_work(&ioat_chan->work, RESET_DELAY);
-}
-
-/**
- * ioat_dma_chan_watchdog - watch for stuck channels
- */
-static void ioat_dma_chan_watchdog(struct work_struct *work)
-{
-	struct ioatdma_device *device =
-		container_of(work, struct ioatdma_device, work.work);
-	struct ioat_dma_chan *ioat_chan;
-	int i;
-
-	union {
-		u64 full;
-		struct {
-			u32 low;
-			u32 high;
-		};
-	} completion_hw;
-	unsigned long compl_desc_addr_hw;
-
-	for (i = 0; i < device->common.chancnt; i++) {
-		ioat_chan = ioat_lookup_chan_by_index(device, i);
-
-		if (ioat_chan->device->version == IOAT_VER_1_2
-			/* have we started processing anything yet */
-		    && ioat_chan->last_completion
-			/* have we completed any since last watchdog cycle? */
-		    && (ioat_chan->last_completion ==
-				ioat_chan->watchdog_completion)
-			/* has TCP stuck on one cookie since last watchdog? */
-		    && (ioat_chan->watchdog_tcp_cookie ==
-				ioat_chan->watchdog_last_tcp_cookie)
-		    && (ioat_chan->watchdog_tcp_cookie !=
-				ioat_chan->completed_cookie)
-			/* is there something in the chain to be processed? */
-			/* CB1 chain always has at least the last one processed */
-		    && (ioat_chan->used_desc.prev != ioat_chan->used_desc.next)
-		    && ioat_chan->pending == 0) {
-
-			/*
-			 * check CHANSTS register for completed
-			 * descriptor address.
-			 * if it is different than completion writeback,
-			 * it is not zero
-			 * and it has changed since the last watchdog
-			 *     we can assume that channel
-			 *     is still working correctly
-			 *     and the problem is in completion writeback.
-			 *     update completion writeback
-			 *     with actual CHANSTS value
-			 * else
-			 *     try resetting the channel
-			 */
-
-			completion_hw.low = readl(ioat_chan->reg_base +
-				IOAT_CHANSTS_OFFSET_LOW(ioat_chan->device->version));
-			completion_hw.high = readl(ioat_chan->reg_base +
-				IOAT_CHANSTS_OFFSET_HIGH(ioat_chan->device->version));
-#if (BITS_PER_LONG == 64)
-			compl_desc_addr_hw =
-				completion_hw.full
-				& IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
-#else
-			compl_desc_addr_hw =
-				completion_hw.low & IOAT_LOW_COMPLETION_MASK;
-#endif
-
-			if ((compl_desc_addr_hw != 0)
-			   && (compl_desc_addr_hw != ioat_chan->watchdog_completion)
-			   && (compl_desc_addr_hw != ioat_chan->last_compl_desc_addr_hw)) {
-				ioat_chan->last_compl_desc_addr_hw = compl_desc_addr_hw;
-				ioat_chan->completion_virt->low = completion_hw.low;
-				ioat_chan->completion_virt->high = completion_hw.high;
-			} else {
-				ioat_dma_reset_channel(ioat_chan);
-				ioat_chan->watchdog_completion = 0;
-				ioat_chan->last_compl_desc_addr_hw = 0;
-			}
-
-		/*
-		 * for version 2.0 if there are descriptors yet to be processed
-		 * and the last completed hasn't changed since the last watchdog
-		 *      if they haven't hit the pending level
-		 *          issue the pending to push them through
-		 *      else
-		 *          try resetting the channel
-		 */
-		} else if (ioat_chan->device->version == IOAT_VER_2_0
-		    && ioat_chan->used_desc.prev
-		    && ioat_chan->last_completion
-		    && ioat_chan->last_completion == ioat_chan->watchdog_completion) {
-
-			if (ioat_chan->pending < ioat_pending_level)
-				ioat2_dma_memcpy_issue_pending(&ioat_chan->common);
-			else {
-				ioat_dma_reset_channel(ioat_chan);
-				ioat_chan->watchdog_completion = 0;
-			}
-		} else {
-			ioat_chan->last_compl_desc_addr_hw = 0;
-			ioat_chan->watchdog_completion
-					= ioat_chan->last_completion;
-		}
-
-		ioat_chan->watchdog_last_tcp_cookie =
-			ioat_chan->watchdog_tcp_cookie;
-	}
-
-	schedule_delayed_work(&device->work, WATCHDOG_DELAY);
-}
-
-static dma_cookie_t ioat1_tx_submit(struct dma_async_tx_descriptor *tx)
-{
-	struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan);
-	struct ioat_desc_sw *first = tx_to_ioat_desc(tx);
-	struct ioat_desc_sw *prev, *new;
-	struct ioat_dma_descriptor *hw;
-	dma_cookie_t cookie;
-	LIST_HEAD(new_chain);
-	u32 copy;
-	size_t len;
-	dma_addr_t src, dst;
-	unsigned long orig_flags;
-	unsigned int desc_count = 0;
-
-	/* src and dest and len are stored in the initial descriptor */
-	len = first->len;
-	src = first->src;
-	dst = first->dst;
-	orig_flags = first->async_tx.flags;
-	new = first;
-
-	spin_lock_bh(&ioat_chan->desc_lock);
-	prev = to_ioat_desc(ioat_chan->used_desc.prev);
-	prefetch(prev->hw);
-	do {
-		copy = min_t(size_t, len, ioat_chan->xfercap);
-
-		async_tx_ack(&new->async_tx);
-
-		hw = new->hw;
-		hw->size = copy;
-		hw->ctl = 0;
-		hw->src_addr = src;
-		hw->dst_addr = dst;
-		hw->next = 0;
-
-		/* chain together the physical address list for the HW */
-		wmb();
-		prev->hw->next = (u64) new->async_tx.phys;
-
-		len -= copy;
-		dst += copy;
-		src += copy;
-
-		list_add_tail(&new->node, &new_chain);
-		desc_count++;
-		prev = new;
-	} while (len && (new = ioat1_dma_get_next_descriptor(ioat_chan)));
-
-	if (!new) {
-		dev_err(&ioat_chan->device->pdev->dev,
-			"tx submit failed\n");
-		spin_unlock_bh(&ioat_chan->desc_lock);
-		return -ENOMEM;
-	}
-
-	hw->ctl = IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
-	if (first->async_tx.callback) {
-		hw->ctl |= IOAT_DMA_DESCRIPTOR_CTL_INT_GN;
-		if (first != new) {
-			/* move callback into to last desc */
-			new->async_tx.callback = first->async_tx.callback;
-			new->async_tx.callback_param
-					= first->async_tx.callback_param;
-			first->async_tx.callback = NULL;
-			first->async_tx.callback_param = NULL;
-		}
-	}
-
-	new->tx_cnt = desc_count;
-	new->async_tx.flags = orig_flags; /* client is in control of this ack */
-
-	/* store the original values for use in later cleanup */
-	if (new != first) {
-		new->src = first->src;
-		new->dst = first->dst;
-		new->len = first->len;
-	}
-
-	/* cookie incr and addition to used_list must be atomic */
-	cookie = ioat_chan->common.cookie;
-	cookie++;
-	if (cookie < 0)
-		cookie = 1;
-	ioat_chan->common.cookie = new->async_tx.cookie = cookie;
-
-	/* write address into NextDescriptor field of last desc in chain */
-	to_ioat_desc(ioat_chan->used_desc.prev)->hw->next =
-							first->async_tx.phys;
-	list_splice_tail(&new_chain, &ioat_chan->used_desc);
-
-	ioat_chan->dmacount += desc_count;
-	ioat_chan->pending += desc_count;
-	if (ioat_chan->pending >= ioat_pending_level)
-		__ioat1_dma_memcpy_issue_pending(ioat_chan);
-	spin_unlock_bh(&ioat_chan->desc_lock);
-
-	return cookie;
-}
-
-static dma_cookie_t ioat2_tx_submit(struct dma_async_tx_descriptor *tx)
-{
-	struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan);
-	struct ioat_desc_sw *first = tx_to_ioat_desc(tx);
-	struct ioat_desc_sw *new;
-	struct ioat_dma_descriptor *hw;
-	dma_cookie_t cookie;
-	u32 copy;
-	size_t len;
-	dma_addr_t src, dst;
-	unsigned long orig_flags;
-	unsigned int desc_count = 0;
-
-	/* src and dest and len are stored in the initial descriptor */
-	len = first->len;
-	src = first->src;
-	dst = first->dst;
-	orig_flags = first->async_tx.flags;
-	new = first;
-
-	/*
-	 * ioat_chan->desc_lock is still in force in version 2 path
-	 * it gets unlocked at end of this function
-	 */
-	do {
-		copy = min_t(size_t, len, ioat_chan->xfercap);
-
-		async_tx_ack(&new->async_tx);
-
-		hw = new->hw;
-		hw->size = copy;
-		hw->ctl = 0;
-		hw->src_addr = src;
-		hw->dst_addr = dst;
-
-		len -= copy;
-		dst += copy;
-		src += copy;
-		desc_count++;
-	} while (len && (new = ioat2_dma_get_next_descriptor(ioat_chan)));
-
-	if (!new) {
-		dev_err(&ioat_chan->device->pdev->dev,
-			"tx submit failed\n");
-		spin_unlock_bh(&ioat_chan->desc_lock);
-		return -ENOMEM;
-	}
-
-	hw->ctl |= IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
-	if (first->async_tx.callback) {
-		hw->ctl |= IOAT_DMA_DESCRIPTOR_CTL_INT_GN;
-		if (first != new) {
-			/* move callback into to last desc */
-			new->async_tx.callback = first->async_tx.callback;
-			new->async_tx.callback_param
-					= first->async_tx.callback_param;
-			first->async_tx.callback = NULL;
-			first->async_tx.callback_param = NULL;
-		}
-	}
-
-	new->tx_cnt = desc_count;
-	new->async_tx.flags = orig_flags; /* client is in control of this ack */
-
-	/* store the original values for use in later cleanup */
-	if (new != first) {
-		new->src = first->src;
-		new->dst = first->dst;
-		new->len = first->len;
-	}
-
-	/* cookie incr and addition to used_list must be atomic */
-	cookie = ioat_chan->common.cookie;
-	cookie++;
-	if (cookie < 0)
-		cookie = 1;
-	ioat_chan->common.cookie = new->async_tx.cookie = cookie;
-
-	ioat_chan->dmacount += desc_count;
-	ioat_chan->pending += desc_count;
-	if (ioat_chan->pending >= ioat_pending_level)
-		__ioat2_dma_memcpy_issue_pending(ioat_chan);
-	spin_unlock_bh(&ioat_chan->desc_lock);
-
-	return cookie;
-}
-
-/**
- * ioat_dma_alloc_descriptor - allocate and return a sw and hw descriptor pair
- * @ioat_chan: the channel supplying the memory pool for the descriptors
- * @flags: allocation flags
- */
-static struct ioat_desc_sw *ioat_dma_alloc_descriptor(
-					struct ioat_dma_chan *ioat_chan,
-					gfp_t flags)
-{
-	struct ioat_dma_descriptor *desc;
-	struct ioat_desc_sw *desc_sw;
-	struct ioatdma_device *ioatdma_device;
-	dma_addr_t phys;
-
-	ioatdma_device = to_ioatdma_device(ioat_chan->common.device);
-	desc = pci_pool_alloc(ioatdma_device->dma_pool, flags, &phys);
-	if (unlikely(!desc))
-		return NULL;
-
-	desc_sw = kzalloc(sizeof(*desc_sw), flags);
-	if (unlikely(!desc_sw)) {
-		pci_pool_free(ioatdma_device->dma_pool, desc, phys);
-		return NULL;
-	}
-
-	memset(desc, 0, sizeof(*desc));
-	dma_async_tx_descriptor_init(&desc_sw->async_tx, &ioat_chan->common);
-	switch (ioat_chan->device->version) {
-	case IOAT_VER_1_2:
-		desc_sw->async_tx.tx_submit = ioat1_tx_submit;
-		break;
-	case IOAT_VER_2_0:
-	case IOAT_VER_3_0:
-		desc_sw->async_tx.tx_submit = ioat2_tx_submit;
-		break;
-	}
-
-	desc_sw->hw = desc;
-	desc_sw->async_tx.phys = phys;
-
-	return desc_sw;
-}
-
-static int ioat_initial_desc_count = 256;
-module_param(ioat_initial_desc_count, int, 0644);
-MODULE_PARM_DESC(ioat_initial_desc_count,
-		 "initial descriptors per channel (default: 256)");
-
-/**
- * ioat2_dma_massage_chan_desc - link the descriptors into a circle
- * @ioat_chan: the channel to be massaged
- */
-static void ioat2_dma_massage_chan_desc(struct ioat_dma_chan *ioat_chan)
-{
-	struct ioat_desc_sw *desc, *_desc;
-
-	/* setup used_desc */
-	ioat_chan->used_desc.next = ioat_chan->free_desc.next;
-	ioat_chan->used_desc.prev = NULL;
-
-	/* pull free_desc out of the circle so that every node is a hw
-	 * descriptor, but leave it pointing to the list
-	 */
-	ioat_chan->free_desc.prev->next = ioat_chan->free_desc.next;
-	ioat_chan->free_desc.next->prev = ioat_chan->free_desc.prev;
-
-	/* circle link the hw descriptors */
-	desc = to_ioat_desc(ioat_chan->free_desc.next);
-	desc->hw->next = to_ioat_desc(desc->node.next)->async_tx.phys;
-	list_for_each_entry_safe(desc, _desc, ioat_chan->free_desc.next, node) {
-		desc->hw->next = to_ioat_desc(desc->node.next)->async_tx.phys;
-	}
-}
-
-/**
- * ioat_dma_alloc_chan_resources - returns the number of allocated descriptors
- * @chan: the channel to be filled out
- */
-static int ioat_dma_alloc_chan_resources(struct dma_chan *chan)
-{
-	struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-	struct ioat_desc_sw *desc;
-	u16 chanctrl;
-	u32 chanerr;
-	int i;
-	LIST_HEAD(tmp_list);
-
-	/* have we already been set up? */
-	if (!list_empty(&ioat_chan->free_desc))
-		return ioat_chan->desccount;
-
-	/* Setup register to interrupt and write completion status on error */
-	chanctrl = IOAT_CHANCTRL_ERR_INT_EN |
-		IOAT_CHANCTRL_ANY_ERR_ABORT_EN |
-		IOAT_CHANCTRL_ERR_COMPLETION_EN;
-	writew(chanctrl, ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET);
-
-	chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
-	if (chanerr) {
-		dev_err(&ioat_chan->device->pdev->dev,
-			"CHANERR = %x, clearing\n", chanerr);
-		writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
-	}
-
-	/* Allocate descriptors */
-	for (i = 0; i < ioat_initial_desc_count; i++) {
-		desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_KERNEL);
-		if (!desc) {
-			dev_err(&ioat_chan->device->pdev->dev,
-				"Only %d initial descriptors\n", i);
-			break;
-		}
-		list_add_tail(&desc->node, &tmp_list);
-	}
-	spin_lock_bh(&ioat_chan->desc_lock);
-	ioat_chan->desccount = i;
-	list_splice(&tmp_list, &ioat_chan->free_desc);
-	if (ioat_chan->device->version != IOAT_VER_1_2)
-		ioat2_dma_massage_chan_desc(ioat_chan);
-	spin_unlock_bh(&ioat_chan->desc_lock);
-
-	/* allocate a completion writeback area */
-	/* doing 2 32bit writes to mmio since 1 64b write doesn't work */
-	ioat_chan->completion_virt =
-		pci_pool_alloc(ioat_chan->device->completion_pool,
-			       GFP_KERNEL,
-			       &ioat_chan->completion_addr);
-	memset(ioat_chan->completion_virt, 0,
-	       sizeof(*ioat_chan->completion_virt));
-	writel(((u64) ioat_chan->completion_addr) & 0x00000000FFFFFFFF,
-	       ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_LOW);
-	writel(((u64) ioat_chan->completion_addr) >> 32,
-	       ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH);
-
-	tasklet_enable(&ioat_chan->cleanup_task);
-	ioat_dma_start_null_desc(ioat_chan);  /* give chain to dma device */
-	return ioat_chan->desccount;
-}
-
-/**
- * ioat_dma_free_chan_resources - release all the descriptors
- * @chan: the channel to be cleaned
- */
-static void ioat_dma_free_chan_resources(struct dma_chan *chan)
-{
-	struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-	struct ioatdma_device *ioatdma_device = to_ioatdma_device(chan->device);
-	struct ioat_desc_sw *desc, *_desc;
-	int in_use_descs = 0;
-
-	/* Before freeing channel resources first check
-	 * if they have been previously allocated for this channel.
-	 */
-	if (ioat_chan->desccount == 0)
-		return;
-
-	tasklet_disable(&ioat_chan->cleanup_task);
-	ioat_dma_memcpy_cleanup(ioat_chan);
-
-	/* Delay 100ms after reset to allow internal DMA logic to quiesce
-	 * before removing DMA descriptor resources.
-	 */
-	writeb(IOAT_CHANCMD_RESET,
-	       ioat_chan->reg_base
-			+ IOAT_CHANCMD_OFFSET(ioat_chan->device->version));
-	mdelay(100);
-
-	spin_lock_bh(&ioat_chan->desc_lock);
-	switch (ioat_chan->device->version) {
-	case IOAT_VER_1_2:
-		list_for_each_entry_safe(desc, _desc,
-					 &ioat_chan->used_desc, node) {
-			in_use_descs++;
-			list_del(&desc->node);
-			pci_pool_free(ioatdma_device->dma_pool, desc->hw,
-				      desc->async_tx.phys);
-			kfree(desc);
-		}
-		list_for_each_entry_safe(desc, _desc,
-					 &ioat_chan->free_desc, node) {
-			list_del(&desc->node);
-			pci_pool_free(ioatdma_device->dma_pool, desc->hw,
-				      desc->async_tx.phys);
-			kfree(desc);
-		}
-		break;
-	case IOAT_VER_2_0:
-	case IOAT_VER_3_0:
-		list_for_each_entry_safe(desc, _desc,
-					 ioat_chan->free_desc.next, node) {
-			list_del(&desc->node);
-			pci_pool_free(ioatdma_device->dma_pool, desc->hw,
-				      desc->async_tx.phys);
-			kfree(desc);
-		}
-		desc = to_ioat_desc(ioat_chan->free_desc.next);
-		pci_pool_free(ioatdma_device->dma_pool, desc->hw,
-			      desc->async_tx.phys);
-		kfree(desc);
-		INIT_LIST_HEAD(&ioat_chan->free_desc);
-		INIT_LIST_HEAD(&ioat_chan->used_desc);
-		break;
-	}
-	spin_unlock_bh(&ioat_chan->desc_lock);
-
-	pci_pool_free(ioatdma_device->completion_pool,
-		      ioat_chan->completion_virt,
-		      ioat_chan->completion_addr);
-
-	/* one is ok since we left it on there on purpose */
-	if (in_use_descs > 1)
-		dev_err(&ioat_chan->device->pdev->dev,
-			"Freeing %d in use descriptors!\n",
-			in_use_descs - 1);
-
-	ioat_chan->last_completion = ioat_chan->completion_addr = 0;
-	ioat_chan->pending = 0;
-	ioat_chan->dmacount = 0;
-	ioat_chan->desccount = 0;
-	ioat_chan->watchdog_completion = 0;
-	ioat_chan->last_compl_desc_addr_hw = 0;
-	ioat_chan->watchdog_tcp_cookie =
-		ioat_chan->watchdog_last_tcp_cookie = 0;
-}
-
-/**
- * ioat_dma_get_next_descriptor - return the next available descriptor
- * @ioat_chan: IOAT DMA channel handle
- *
- * Gets the next descriptor from the chain, and must be called with the
- * channel's desc_lock held.  Allocates more descriptors if the channel
- * has run out.
- */
-static struct ioat_desc_sw *
-ioat1_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan)
-{
-	struct ioat_desc_sw *new;
-
-	if (!list_empty(&ioat_chan->free_desc)) {
-		new = to_ioat_desc(ioat_chan->free_desc.next);
-		list_del(&new->node);
-	} else {
-		/* try to get another desc */
-		new = ioat_dma_alloc_descriptor(ioat_chan, GFP_ATOMIC);
-		if (!new) {
-			dev_err(&ioat_chan->device->pdev->dev,
-				"alloc failed\n");
-			return NULL;
-		}
-	}
-
-	prefetch(new->hw);
-	return new;
-}
-
-static struct ioat_desc_sw *
-ioat2_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan)
-{
-	struct ioat_desc_sw *new;
-
-	/*
-	 * used.prev points to where to start processing
-	 * used.next points to next free descriptor
-	 * if used.prev == NULL, there are none waiting to be processed
-	 * if used.next == used.prev.prev, there is only one free descriptor,
-	 *      and we need to use it to as a noop descriptor before
-	 *      linking in a new set of descriptors, since the device
-	 *      has probably already read the pointer to it
-	 */
-	if (ioat_chan->used_desc.prev &&
-	    ioat_chan->used_desc.next == ioat_chan->used_desc.prev->prev) {
-
-		struct ioat_desc_sw *desc;
-		struct ioat_desc_sw *noop_desc;
-		int i;
-
-		/* set up the noop descriptor */
-		noop_desc = to_ioat_desc(ioat_chan->used_desc.next);
-		/* set size to non-zero value (channel returns error when size is 0) */
-		noop_desc->hw->size = NULL_DESC_BUFFER_SIZE;
-		noop_desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL;
-		noop_desc->hw->src_addr = 0;
-		noop_desc->hw->dst_addr = 0;
-
-		ioat_chan->used_desc.next = ioat_chan->used_desc.next->next;
-		ioat_chan->pending++;
-		ioat_chan->dmacount++;
-
-		/* try to get a few more descriptors */
-		for (i = 16; i; i--) {
-			desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_ATOMIC);
-			if (!desc) {
-				dev_err(&ioat_chan->device->pdev->dev,
-					"alloc failed\n");
-				break;
-			}
-			list_add_tail(&desc->node, ioat_chan->used_desc.next);
-
-			desc->hw->next
-				= to_ioat_desc(desc->node.next)->async_tx.phys;
-			to_ioat_desc(desc->node.prev)->hw->next
-				= desc->async_tx.phys;
-			ioat_chan->desccount++;
-		}
-
-		ioat_chan->used_desc.next = noop_desc->node.next;
-	}
-	new = to_ioat_desc(ioat_chan->used_desc.next);
-	prefetch(new);
-	ioat_chan->used_desc.next = new->node.next;
-
-	if (ioat_chan->used_desc.prev == NULL)
-		ioat_chan->used_desc.prev = &new->node;
-
-	prefetch(new->hw);
-	return new;
-}
-
-static struct ioat_desc_sw *ioat_dma_get_next_descriptor(
-						struct ioat_dma_chan *ioat_chan)
-{
-	if (!ioat_chan)
-		return NULL;
-
-	switch (ioat_chan->device->version) {
-	case IOAT_VER_1_2:
-		return ioat1_dma_get_next_descriptor(ioat_chan);
-	case IOAT_VER_2_0:
-	case IOAT_VER_3_0:
-		return ioat2_dma_get_next_descriptor(ioat_chan);
-	}
-	return NULL;
-}
-
-static struct dma_async_tx_descriptor *ioat1_dma_prep_memcpy(
-						struct dma_chan *chan,
-						dma_addr_t dma_dest,
-						dma_addr_t dma_src,
-						size_t len,
-						unsigned long flags)
-{
-	struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-	struct ioat_desc_sw *new;
-
-	spin_lock_bh(&ioat_chan->desc_lock);
-	new = ioat_dma_get_next_descriptor(ioat_chan);
-	spin_unlock_bh(&ioat_chan->desc_lock);
-
-	if (new) {
-		new->len = len;
-		new->dst = dma_dest;
-		new->src = dma_src;
-		new->async_tx.flags = flags;
-		return &new->async_tx;
-	} else {
-		dev_err(&ioat_chan->device->pdev->dev,
-			"chan%d - get_next_desc failed: %d descs waiting, %d total desc\n",
-			chan_num(ioat_chan), ioat_chan->dmacount, ioat_chan->desccount);
-		return NULL;
-	}
-}
-
-static struct dma_async_tx_descriptor *ioat2_dma_prep_memcpy(
-						struct dma_chan *chan,
-						dma_addr_t dma_dest,
-						dma_addr_t dma_src,
-						size_t len,
-						unsigned long flags)
-{
-	struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-	struct ioat_desc_sw *new;
-
-	spin_lock_bh(&ioat_chan->desc_lock);
-	new = ioat2_dma_get_next_descriptor(ioat_chan);
-
-	/*
-	 * leave ioat_chan->desc_lock set in ioat 2 path
-	 * it will get unlocked at end of tx_submit
-	 */
-
-	if (new) {
-		new->len = len;
-		new->dst = dma_dest;
-		new->src = dma_src;
-		new->async_tx.flags = flags;
-		return &new->async_tx;
-	} else {
-		spin_unlock_bh(&ioat_chan->desc_lock);
-		dev_err(&ioat_chan->device->pdev->dev,
-			"chan%d - get_next_desc failed: %d descs waiting, %d total desc\n",
-			chan_num(ioat_chan), ioat_chan->dmacount, ioat_chan->desccount);
-		return NULL;
-	}
-}
-
-static void ioat_dma_cleanup_tasklet(unsigned long data)
-{
-	struct ioat_dma_chan *chan = (void *)data;
-	ioat_dma_memcpy_cleanup(chan);
-	writew(IOAT_CHANCTRL_INT_DISABLE,
-	       chan->reg_base + IOAT_CHANCTRL_OFFSET);
-}
-
-static void
-ioat_dma_unmap(struct ioat_dma_chan *ioat_chan, struct ioat_desc_sw *desc)
-{
-	if (!(desc->async_tx.flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-		if (desc->async_tx.flags & DMA_COMPL_DEST_UNMAP_SINGLE)
-			pci_unmap_single(ioat_chan->device->pdev,
-					 pci_unmap_addr(desc, dst),
-					 pci_unmap_len(desc, len),
-					 PCI_DMA_FROMDEVICE);
-		else
-			pci_unmap_page(ioat_chan->device->pdev,
-				       pci_unmap_addr(desc, dst),
-				       pci_unmap_len(desc, len),
-				       PCI_DMA_FROMDEVICE);
-	}
-
-	if (!(desc->async_tx.flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-		if (desc->async_tx.flags & DMA_COMPL_SRC_UNMAP_SINGLE)
-			pci_unmap_single(ioat_chan->device->pdev,
-					 pci_unmap_addr(desc, src),
-					 pci_unmap_len(desc, len),
-					 PCI_DMA_TODEVICE);
-		else
-			pci_unmap_page(ioat_chan->device->pdev,
-				       pci_unmap_addr(desc, src),
-				       pci_unmap_len(desc, len),
-				       PCI_DMA_TODEVICE);
-	}
-}
-
-/**
- * ioat_dma_memcpy_cleanup - cleanup up finished descriptors
- * @chan: ioat channel to be cleaned up
- */
-static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan)
-{
-	unsigned long phys_complete;
-	struct ioat_desc_sw *desc, *_desc;
-	dma_cookie_t cookie = 0;
-	unsigned long desc_phys;
-	struct ioat_desc_sw *latest_desc;
-
-	prefetch(ioat_chan->completion_virt);
-
-	if (!spin_trylock_bh(&ioat_chan->cleanup_lock))
-		return;
-
-	/* The completion writeback can happen at any time,
-	   so reads by the driver need to be atomic operations
-	   The descriptor physical addresses are limited to 32-bits
-	   when the CPU can only do a 32-bit mov */
-
-#if (BITS_PER_LONG == 64)
-	phys_complete =
-		ioat_chan->completion_virt->full
-		& IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
-#else
-	phys_complete =
-		ioat_chan->completion_virt->low & IOAT_LOW_COMPLETION_MASK;
-#endif
-
-	if ((ioat_chan->completion_virt->full
-		& IOAT_CHANSTS_DMA_TRANSFER_STATUS) ==
-				IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED) {
-		dev_err(&ioat_chan->device->pdev->dev,
-			"Channel halted, chanerr = %x\n",
-			readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET));
-
-		/* TODO do something to salvage the situation */
-	}
-
-	if (phys_complete == ioat_chan->last_completion) {
-		spin_unlock_bh(&ioat_chan->cleanup_lock);
-		/*
-		 * perhaps we're stuck so hard that the watchdog can't go off?
-		 * try to catch it after 2 seconds
-		 */
-		if (ioat_chan->device->version != IOAT_VER_3_0) {
-			if (time_after(jiffies,
-				       ioat_chan->last_completion_time + HZ*WATCHDOG_DELAY)) {
-				ioat_dma_chan_watchdog(&(ioat_chan->device->work.work));
-				ioat_chan->last_completion_time = jiffies;
-			}
-		}
-		return;
-	}
-	ioat_chan->last_completion_time = jiffies;
-
-	cookie = 0;
-	if (!spin_trylock_bh(&ioat_chan->desc_lock)) {
-		spin_unlock_bh(&ioat_chan->cleanup_lock);
-		return;
-	}
-
-	switch (ioat_chan->device->version) {
-	case IOAT_VER_1_2:
-		list_for_each_entry_safe(desc, _desc,
-					 &ioat_chan->used_desc, node) {
-
-			/*
-			 * Incoming DMA requests may use multiple descriptors,
-			 * due to exceeding xfercap, perhaps. If so, only the
-			 * last one will have a cookie, and require unmapping.
-			 */
-			if (desc->async_tx.cookie) {
-				cookie = desc->async_tx.cookie;
-				ioat_dma_unmap(ioat_chan, desc);
-				if (desc->async_tx.callback) {
-					desc->async_tx.callback(desc->async_tx.callback_param);
-					desc->async_tx.callback = NULL;
-				}
-			}
-
-			if (desc->async_tx.phys != phys_complete) {
-				/*
-				 * a completed entry, but not the last, so clean
-				 * up if the client is done with the descriptor
-				 */
-				if (async_tx_test_ack(&desc->async_tx)) {
-					list_move_tail(&desc->node,
-						       &ioat_chan->free_desc);
-				} else
-					desc->async_tx.cookie = 0;
-			} else {
-				/*
-				 * last used desc. Do not remove, so we can
-				 * append from it, but don't look at it next
-				 * time, either
-				 */
-				desc->async_tx.cookie = 0;
-
-				/* TODO check status bits? */
-				break;
-			}
-		}
-		break;
-	case IOAT_VER_2_0:
-	case IOAT_VER_3_0:
-		/* has some other thread has already cleaned up? */
-		if (ioat_chan->used_desc.prev == NULL)
-			break;
-
-		/* work backwards to find latest finished desc */
-		desc = to_ioat_desc(ioat_chan->used_desc.next);
-		latest_desc = NULL;
-		do {
-			desc = to_ioat_desc(desc->node.prev);
-			desc_phys = (unsigned long)desc->async_tx.phys
-				       & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
-			if (desc_phys == phys_complete) {
-				latest_desc = desc;
-				break;
-			}
-		} while (&desc->node != ioat_chan->used_desc.prev);
-
-		if (latest_desc != NULL) {
-
-			/* work forwards to clear finished descriptors */
-			for (desc = to_ioat_desc(ioat_chan->used_desc.prev);
-			     &desc->node != latest_desc->node.next &&
-			     &desc->node != ioat_chan->used_desc.next;
-			     desc = to_ioat_desc(desc->node.next)) {
-				if (desc->async_tx.cookie) {
-					cookie = desc->async_tx.cookie;
-					desc->async_tx.cookie = 0;
-					ioat_dma_unmap(ioat_chan, desc);
-					if (desc->async_tx.callback) {
-						desc->async_tx.callback(desc->async_tx.callback_param);
-						desc->async_tx.callback = NULL;
-					}
-				}
-			}
-
-			/* move used.prev up beyond those that are finished */
-			if (&desc->node == ioat_chan->used_desc.next)
-				ioat_chan->used_desc.prev = NULL;
-			else
-				ioat_chan->used_desc.prev = &desc->node;
-		}
-		break;
-	}
-
-	spin_unlock_bh(&ioat_chan->desc_lock);
-
-	ioat_chan->last_completion = phys_complete;
-	if (cookie != 0)
-		ioat_chan->completed_cookie = cookie;
-
-	spin_unlock_bh(&ioat_chan->cleanup_lock);
-}
-
-/**
- * ioat_dma_is_complete - poll the status of a IOAT DMA transaction
- * @chan: IOAT DMA channel handle
- * @cookie: DMA transaction identifier
- * @done: if not %NULL, updated with last completed transaction
- * @used: if not %NULL, updated with last used transaction
- */
-static enum dma_status ioat_dma_is_complete(struct dma_chan *chan,
-					    dma_cookie_t cookie,
-					    dma_cookie_t *done,
-					    dma_cookie_t *used)
-{
-	struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-	dma_cookie_t last_used;
-	dma_cookie_t last_complete;
-	enum dma_status ret;
-
-	last_used = chan->cookie;
-	last_complete = ioat_chan->completed_cookie;
-	ioat_chan->watchdog_tcp_cookie = cookie;
-
-	if (done)
-		*done = last_complete;
-	if (used)
-		*used = last_used;
-
-	ret = dma_async_is_complete(cookie, last_complete, last_used);
-	if (ret == DMA_SUCCESS)
-		return ret;
-
-	ioat_dma_memcpy_cleanup(ioat_chan);
-
-	last_used = chan->cookie;
-	last_complete = ioat_chan->completed_cookie;
-
-	if (done)
-		*done = last_complete;
-	if (used)
-		*used = last_used;
-
-	return dma_async_is_complete(cookie, last_complete, last_used);
-}
-
-static void ioat_dma_start_null_desc(struct ioat_dma_chan *ioat_chan)
-{
-	struct ioat_desc_sw *desc;
-
-	spin_lock_bh(&ioat_chan->desc_lock);
-
-	desc = ioat_dma_get_next_descriptor(ioat_chan);
-
-	if (!desc) {
-		dev_err(&ioat_chan->device->pdev->dev,
-			"Unable to start null desc - get next desc failed\n");
-		spin_unlock_bh(&ioat_chan->desc_lock);
-		return;
-	}
-
-	desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL
-				| IOAT_DMA_DESCRIPTOR_CTL_INT_GN
-				| IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
-	/* set size to non-zero value (channel returns error when size is 0) */
-	desc->hw->size = NULL_DESC_BUFFER_SIZE;
-	desc->hw->src_addr = 0;
-	desc->hw->dst_addr = 0;
-	async_tx_ack(&desc->async_tx);
-	switch (ioat_chan->device->version) {
-	case IOAT_VER_1_2:
-		desc->hw->next = 0;
-		list_add_tail(&desc->node, &ioat_chan->used_desc);
-
-		writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
-		       ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_LOW);
-		writel(((u64) desc->async_tx.phys) >> 32,
-		       ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_HIGH);
-
-		writeb(IOAT_CHANCMD_START, ioat_chan->reg_base
-			+ IOAT_CHANCMD_OFFSET(ioat_chan->device->version));
-		break;
-	case IOAT_VER_2_0:
-	case IOAT_VER_3_0:
-		writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
-		       ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_LOW);
-		writel(((u64) desc->async_tx.phys) >> 32,
-		       ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_HIGH);
-
-		ioat_chan->dmacount++;
-		__ioat2_dma_memcpy_issue_pending(ioat_chan);
-		break;
-	}
-	spin_unlock_bh(&ioat_chan->desc_lock);
-}
-
-/*
- * Perform a IOAT transaction to verify the HW works.
- */
-#define IOAT_TEST_SIZE 2000
-
-static void ioat_dma_test_callback(void *dma_async_param)
-{
-	struct completion *cmp = dma_async_param;
-
-	complete(cmp);
-}
-
-/**
- * ioat_dma_self_test - Perform a IOAT transaction to verify the HW works.
- * @device: device to be tested
- */
-static int ioat_dma_self_test(struct ioatdma_device *device)
-{
-	int i;
-	u8 *src;
-	u8 *dest;
-	struct dma_chan *dma_chan;
-	struct dma_async_tx_descriptor *tx;
-	dma_addr_t dma_dest, dma_src;
-	dma_cookie_t cookie;
-	int err = 0;
-	struct completion cmp;
-	unsigned long tmo;
-	unsigned long flags;
-
-	src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
-	if (!src)
-		return -ENOMEM;
-	dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
-	if (!dest) {
-		kfree(src);
-		return -ENOMEM;
-	}
-
-	/* Fill in src buffer */
-	for (i = 0; i < IOAT_TEST_SIZE; i++)
-		src[i] = (u8)i;
-
-	/* Start copy, using first DMA channel */
-	dma_chan = container_of(device->common.channels.next,
-				struct dma_chan,
-				device_node);
-	if (device->common.device_alloc_chan_resources(dma_chan) < 1) {
-		dev_err(&device->pdev->dev,
-			"selftest cannot allocate chan resource\n");
-		err = -ENODEV;
-		goto out;
-	}
-
-	dma_src = dma_map_single(dma_chan->device->dev, src, IOAT_TEST_SIZE,
-				 DMA_TO_DEVICE);
-	dma_dest = dma_map_single(dma_chan->device->dev, dest, IOAT_TEST_SIZE,
-				  DMA_FROM_DEVICE);
-	flags = DMA_COMPL_SRC_UNMAP_SINGLE | DMA_COMPL_DEST_UNMAP_SINGLE;
-	tx = device->common.device_prep_dma_memcpy(dma_chan, dma_dest, dma_src,
-						   IOAT_TEST_SIZE, flags);
-	if (!tx) {
-		dev_err(&device->pdev->dev,
-			"Self-test prep failed, disabling\n");
-		err = -ENODEV;
-		goto free_resources;
-	}
-
-	async_tx_ack(tx);
-	init_completion(&cmp);
-	tx->callback = ioat_dma_test_callback;
-	tx->callback_param = &cmp;
-	cookie = tx->tx_submit(tx);
-	if (cookie < 0) {
-		dev_err(&device->pdev->dev,
-			"Self-test setup failed, disabling\n");
-		err = -ENODEV;
-		goto free_resources;
-	}
-	device->common.device_issue_pending(dma_chan);
-
-	tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
-
-	if (tmo == 0 ||
-	    device->common.device_is_tx_complete(dma_chan, cookie, NULL, NULL)
-					!= DMA_SUCCESS) {
-		dev_err(&device->pdev->dev,
-			"Self-test copy timed out, disabling\n");
-		err = -ENODEV;
-		goto free_resources;
-	}
-	if (memcmp(src, dest, IOAT_TEST_SIZE)) {
-		dev_err(&device->pdev->dev,
-			"Self-test copy failed compare, disabling\n");
-		err = -ENODEV;
-		goto free_resources;
-	}
-
-free_resources:
-	device->common.device_free_chan_resources(dma_chan);
-out:
-	kfree(src);
-	kfree(dest);
-	return err;
-}
-
-static char ioat_interrupt_style[32] = "msix";
-module_param_string(ioat_interrupt_style, ioat_interrupt_style,
-		    sizeof(ioat_interrupt_style), 0644);
-MODULE_PARM_DESC(ioat_interrupt_style,
-		 "set ioat interrupt style: msix (default), "
-		 "msix-single-vector, msi, intx)");
-
-/**
- * ioat_dma_setup_interrupts - setup interrupt handler
- * @device: ioat device
- */
-static int ioat_dma_setup_interrupts(struct ioatdma_device *device)
-{
-	struct ioat_dma_chan *ioat_chan;
-	int err, i, j, msixcnt;
-	u8 intrctrl = 0;
-
-	if (!strcmp(ioat_interrupt_style, "msix"))
-		goto msix;
-	if (!strcmp(ioat_interrupt_style, "msix-single-vector"))
-		goto msix_single_vector;
-	if (!strcmp(ioat_interrupt_style, "msi"))
-		goto msi;
-	if (!strcmp(ioat_interrupt_style, "intx"))
-		goto intx;
-	dev_err(&device->pdev->dev, "invalid ioat_interrupt_style %s\n",
-		ioat_interrupt_style);
-	goto err_no_irq;
-
-msix:
-	/* The number of MSI-X vectors should equal the number of channels */
-	msixcnt = device->common.chancnt;
-	for (i = 0; i < msixcnt; i++)
-		device->msix_entries[i].entry = i;
-
-	err = pci_enable_msix(device->pdev, device->msix_entries, msixcnt);
-	if (err < 0)
-		goto msi;
-	if (err > 0)
-		goto msix_single_vector;
-
-	for (i = 0; i < msixcnt; i++) {
-		ioat_chan = ioat_lookup_chan_by_index(device, i);
-		err = request_irq(device->msix_entries[i].vector,
-				  ioat_dma_do_interrupt_msix,
-				  0, "ioat-msix", ioat_chan);
-		if (err) {
-			for (j = 0; j < i; j++) {
-				ioat_chan =
-					ioat_lookup_chan_by_index(device, j);
-				free_irq(device->msix_entries[j].vector,
-					 ioat_chan);
-			}
-			goto msix_single_vector;
-		}
-	}
-	intrctrl |= IOAT_INTRCTRL_MSIX_VECTOR_CONTROL;
-	device->irq_mode = msix_multi_vector;
-	goto done;
-
-msix_single_vector:
-	device->msix_entries[0].entry = 0;
-	err = pci_enable_msix(device->pdev, device->msix_entries, 1);
-	if (err)
-		goto msi;
-
-	err = request_irq(device->msix_entries[0].vector, ioat_dma_do_interrupt,
-			  0, "ioat-msix", device);
-	if (err) {
-		pci_disable_msix(device->pdev);
-		goto msi;
-	}
-	device->irq_mode = msix_single_vector;
-	goto done;
-
-msi:
-	err = pci_enable_msi(device->pdev);
-	if (err)
-		goto intx;
-
-	err = request_irq(device->pdev->irq, ioat_dma_do_interrupt,
-			  0, "ioat-msi", device);
-	if (err) {
-		pci_disable_msi(device->pdev);
-		goto intx;
-	}
-	/*
-	 * CB 1.2 devices need a bit set in configuration space to enable MSI
-	 */
-	if (device->version == IOAT_VER_1_2) {
-		u32 dmactrl;
-		pci_read_config_dword(device->pdev,
-				      IOAT_PCI_DMACTRL_OFFSET, &dmactrl);
-		dmactrl |= IOAT_PCI_DMACTRL_MSI_EN;
-		pci_write_config_dword(device->pdev,
-				       IOAT_PCI_DMACTRL_OFFSET, dmactrl);
-	}
-	device->irq_mode = msi;
-	goto done;
-
-intx:
-	err = request_irq(device->pdev->irq, ioat_dma_do_interrupt,
-			  IRQF_SHARED, "ioat-intx", device);
-	if (err)
-		goto err_no_irq;
-	device->irq_mode = intx;
-
-done:
-	intrctrl |= IOAT_INTRCTRL_MASTER_INT_EN;
-	writeb(intrctrl, device->reg_base + IOAT_INTRCTRL_OFFSET);
-	return 0;
-
-err_no_irq:
-	/* Disable all interrupt generation */
-	writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
-	dev_err(&device->pdev->dev, "no usable interrupts\n");
-	device->irq_mode = none;
-	return -1;
-}
-
-/**
- * ioat_dma_remove_interrupts - remove whatever interrupts were set
- * @device: ioat device
- */
-static void ioat_dma_remove_interrupts(struct ioatdma_device *device)
-{
-	struct ioat_dma_chan *ioat_chan;
-	int i;
-
-	/* Disable all interrupt generation */
-	writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
-
-	switch (device->irq_mode) {
-	case msix_multi_vector:
-		for (i = 0; i < device->common.chancnt; i++) {
-			ioat_chan = ioat_lookup_chan_by_index(device, i);
-			free_irq(device->msix_entries[i].vector, ioat_chan);
-		}
-		pci_disable_msix(device->pdev);
-		break;
-	case msix_single_vector:
-		free_irq(device->msix_entries[0].vector, device);
-		pci_disable_msix(device->pdev);
-		break;
-	case msi:
-		free_irq(device->pdev->irq, device);
-		pci_disable_msi(device->pdev);
-		break;
-	case intx:
-		free_irq(device->pdev->irq, device);
-		break;
-	case none:
-		dev_warn(&device->pdev->dev,
-			 "call to %s without interrupts setup\n", __func__);
-	}
-	device->irq_mode = none;
-}
-
-struct ioatdma_device *ioat_dma_probe(struct pci_dev *pdev,
-				      void __iomem *iobase)
-{
-	int err;
-	struct ioatdma_device *device;
-
-	device = kzalloc(sizeof(*device), GFP_KERNEL);
-	if (!device) {
-		err = -ENOMEM;
-		goto err_kzalloc;
-	}
-	device->pdev = pdev;
-	device->reg_base = iobase;
-	device->version = readb(device->reg_base + IOAT_VER_OFFSET);
-
-	/* DMA coherent memory pool for DMA descriptor allocations */
-	device->dma_pool = pci_pool_create("dma_desc_pool", pdev,
-					   sizeof(struct ioat_dma_descriptor),
-					   64, 0);
-	if (!device->dma_pool) {
-		err = -ENOMEM;
-		goto err_dma_pool;
-	}
-
-	device->completion_pool = pci_pool_create("completion_pool", pdev,
-						  sizeof(u64), SMP_CACHE_BYTES,
-						  SMP_CACHE_BYTES);
-	if (!device->completion_pool) {
-		err = -ENOMEM;
-		goto err_completion_pool;
-	}
-
-	INIT_LIST_HEAD(&device->common.channels);
-	ioat_dma_enumerate_channels(device);
-
-	device->common.device_alloc_chan_resources =
-						ioat_dma_alloc_chan_resources;
-	device->common.device_free_chan_resources =
-						ioat_dma_free_chan_resources;
-	device->common.dev = &pdev->dev;
-
-	dma_cap_set(DMA_MEMCPY, device->common.cap_mask);
-	device->common.device_is_tx_complete = ioat_dma_is_complete;
-	switch (device->version) {
-	case IOAT_VER_1_2:
-		device->common.device_prep_dma_memcpy = ioat1_dma_prep_memcpy;
-		device->common.device_issue_pending =
-						ioat1_dma_memcpy_issue_pending;
-		break;
-	case IOAT_VER_2_0:
-	case IOAT_VER_3_0:
-		device->common.device_prep_dma_memcpy = ioat2_dma_prep_memcpy;
-		device->common.device_issue_pending =
-						ioat2_dma_memcpy_issue_pending;
-		break;
-	}
-
-	dev_err(&device->pdev->dev,
-		"Intel(R) I/OAT DMA Engine found,"
-		" %d channels, device version 0x%02x, driver version %s\n",
-		device->common.chancnt, device->version, IOAT_DMA_VERSION);
-
-	if (!device->common.chancnt) {
-		dev_err(&device->pdev->dev,
-			"Intel(R) I/OAT DMA Engine problem found: "
-			"zero channels detected\n");
-		goto err_setup_interrupts;
-	}
-
-	err = ioat_dma_setup_interrupts(device);
-	if (err)
-		goto err_setup_interrupts;
-
-	err = ioat_dma_self_test(device);
-	if (err)
-		goto err_self_test;
-
-	ioat_set_tcp_copy_break(device);
-
-	dma_async_device_register(&device->common);
-
-	if (device->version != IOAT_VER_3_0) {
-		INIT_DELAYED_WORK(&device->work, ioat_dma_chan_watchdog);
-		schedule_delayed_work(&device->work,
-				      WATCHDOG_DELAY);
-	}
-
-	return device;
-
-err_self_test:
-	ioat_dma_remove_interrupts(device);
-err_setup_interrupts:
-	pci_pool_destroy(device->completion_pool);
-err_completion_pool:
-	pci_pool_destroy(device->dma_pool);
-err_dma_pool:
-	kfree(device);
-err_kzalloc:
-	dev_err(&pdev->dev,
-		"Intel(R) I/OAT DMA Engine initialization failed\n");
-	return NULL;
-}
-
-void ioat_dma_remove(struct ioatdma_device *device)
-{
-	struct dma_chan *chan, *_chan;
-	struct ioat_dma_chan *ioat_chan;
-
-	if (device->version != IOAT_VER_3_0)
-		cancel_delayed_work(&device->work);
-
-	ioat_dma_remove_interrupts(device);
-
-	dma_async_device_unregister(&device->common);
-
-	pci_pool_destroy(device->dma_pool);
-	pci_pool_destroy(device->completion_pool);
-
-	iounmap(device->reg_base);
-	pci_release_regions(device->pdev);
-	pci_disable_device(device->pdev);
-
-	list_for_each_entry_safe(chan, _chan,
-				 &device->common.channels, device_node) {
-		ioat_chan = to_ioat_chan(chan);
-		list_del(&chan->device_node);
-		kfree(ioat_chan);
-	}
-	kfree(device);
-}
-
diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
deleted file mode 100644
index a52ff4bd460..00000000000
--- a/drivers/dma/ioatdma.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-#ifndef IOATDMA_H
-#define IOATDMA_H
-
-#include <linux/dmaengine.h>
-#include "ioatdma_hw.h"
-#include <linux/init.h>
-#include <linux/dmapool.h>
-#include <linux/cache.h>
-#include <linux/pci_ids.h>
-#include <net/tcp.h>
-
-#define IOAT_DMA_VERSION  "3.64"
-
-enum ioat_interrupt {
-	none = 0,
-	msix_multi_vector = 1,
-	msix_single_vector = 2,
-	msi = 3,
-	intx = 4,
-};
-
-#define IOAT_LOW_COMPLETION_MASK	0xffffffc0
-#define IOAT_DMA_DCA_ANY_CPU		~0
-#define IOAT_WATCHDOG_PERIOD		(2 * HZ)
-
-
-/**
- * struct ioatdma_device - internal representation of a IOAT device
- * @pdev: PCI-Express device
- * @reg_base: MMIO register space base address
- * @dma_pool: for allocating DMA descriptors
- * @common: embedded struct dma_device
- * @version: version of ioatdma device
- * @irq_mode: which style irq to use
- * @msix_entries: irq handlers
- * @idx: per channel data
- */
-
-struct ioatdma_device {
-	struct pci_dev *pdev;
-	void __iomem *reg_base;
-	struct pci_pool *dma_pool;
-	struct pci_pool *completion_pool;
-	struct dma_device common;
-	u8 version;
-	enum ioat_interrupt irq_mode;
-	struct delayed_work work;
-	struct msix_entry msix_entries[4];
-	struct ioat_dma_chan *idx[4];
-};
-
-/**
- * struct ioat_dma_chan - internal representation of a DMA channel
- */
-struct ioat_dma_chan {
-
-	void __iomem *reg_base;
-
-	dma_cookie_t completed_cookie;
-	unsigned long last_completion;
-	unsigned long last_completion_time;
-
-	size_t xfercap;	/* XFERCAP register value expanded out */
-
-	spinlock_t cleanup_lock;
-	spinlock_t desc_lock;
-	struct list_head free_desc;
-	struct list_head used_desc;
-	unsigned long watchdog_completion;
-	int watchdog_tcp_cookie;
-	u32 watchdog_last_tcp_cookie;
-	struct delayed_work work;
-
-	int pending;
-	int dmacount;
-	int desccount;
-
-	struct ioatdma_device *device;
-	struct dma_chan common;
-
-	dma_addr_t completion_addr;
-	union {
-		u64 full; /* HW completion writeback */
-		struct {
-			u32 low;
-			u32 high;
-		};
-	} *completion_virt;
-	unsigned long last_compl_desc_addr_hw;
-	struct tasklet_struct cleanup_task;
-};
-
-/* wrapper around hardware descriptor format + additional software fields */
-
-/**
- * struct ioat_desc_sw - wrapper around hardware descriptor
- * @hw: hardware DMA descriptor
- * @node: this descriptor will either be on the free list,
- *     or attached to a transaction list (async_tx.tx_list)
- * @tx_cnt: number of descriptors required to complete the transaction
- * @async_tx: the generic software descriptor for all engines
- */
-struct ioat_desc_sw {
-	struct ioat_dma_descriptor *hw;
-	struct list_head node;
-	int tx_cnt;
-	size_t len;
-	dma_addr_t src;
-	dma_addr_t dst;
-	struct dma_async_tx_descriptor async_tx;
-};
-
-static inline void ioat_set_tcp_copy_break(struct ioatdma_device *dev)
-{
-	#ifdef CONFIG_NET_DMA
-	switch (dev->version) {
-	case IOAT_VER_1_2:
-		sysctl_tcp_dma_copybreak = 4096;
-		break;
-	case IOAT_VER_2_0:
-		sysctl_tcp_dma_copybreak = 2048;
-		break;
-	case IOAT_VER_3_0:
-		sysctl_tcp_dma_copybreak = 262144;
-		break;
-	}
-	#endif
-}
-
-#if defined(CONFIG_INTEL_IOATDMA) || defined(CONFIG_INTEL_IOATDMA_MODULE)
-struct ioatdma_device *ioat_dma_probe(struct pci_dev *pdev,
-				      void __iomem *iobase);
-void ioat_dma_remove(struct ioatdma_device *device);
-struct dca_provider *ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase);
-struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase);
-struct dca_provider *ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase);
-#else
-#define ioat_dma_probe(pdev, iobase)    NULL
-#define ioat_dma_remove(device)         do { } while (0)
-#define ioat_dca_init(pdev, iobase)	NULL
-#define ioat2_dca_init(pdev, iobase)	NULL
-#define ioat3_dca_init(pdev, iobase)	NULL
-#endif
-
-#endif /* IOATDMA_H */
diff --git a/drivers/dma/ioatdma_hw.h b/drivers/dma/ioatdma_hw.h
deleted file mode 100644
index afa57eef86c..00000000000
--- a/drivers/dma/ioatdma_hw.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-#ifndef _IOAT_HW_H_
-#define _IOAT_HW_H_
-
-/* PCI Configuration Space Values */
-#define IOAT_PCI_VID            0x8086
-
-/* CB device ID's */
-#define IOAT_PCI_DID_5000       0x1A38
-#define IOAT_PCI_DID_CNB        0x360B
-#define IOAT_PCI_DID_SCNB       0x65FF
-#define IOAT_PCI_DID_SNB        0x402F
-
-#define IOAT_PCI_RID            0x00
-#define IOAT_PCI_SVID           0x8086
-#define IOAT_PCI_SID            0x8086
-#define IOAT_VER_1_2            0x12    /* Version 1.2 */
-#define IOAT_VER_2_0            0x20    /* Version 2.0 */
-#define IOAT_VER_3_0            0x30    /* Version 3.0 */
-
-struct ioat_dma_descriptor {
-	uint32_t	size;
-	uint32_t	ctl;
-	uint64_t	src_addr;
-	uint64_t	dst_addr;
-	uint64_t	next;
-	uint64_t	rsv1;
-	uint64_t	rsv2;
-	uint64_t	user1;
-	uint64_t	user2;
-};
-
-#define IOAT_DMA_DESCRIPTOR_CTL_INT_GN	0x00000001
-#define IOAT_DMA_DESCRIPTOR_CTL_SRC_SN	0x00000002
-#define IOAT_DMA_DESCRIPTOR_CTL_DST_SN	0x00000004
-#define IOAT_DMA_DESCRIPTOR_CTL_CP_STS	0x00000008
-#define IOAT_DMA_DESCRIPTOR_CTL_FRAME	0x00000010
-#define IOAT_DMA_DESCRIPTOR_NUL		0x00000020
-#define IOAT_DMA_DESCRIPTOR_CTL_SP_BRK	0x00000040
-#define IOAT_DMA_DESCRIPTOR_CTL_DP_BRK	0x00000080
-#define IOAT_DMA_DESCRIPTOR_CTL_BNDL	0x00000100
-#define IOAT_DMA_DESCRIPTOR_CTL_DCA	0x00000200
-#define IOAT_DMA_DESCRIPTOR_CTL_BUFHINT	0x00000400
-
-#define IOAT_DMA_DESCRIPTOR_CTL_OPCODE_CONTEXT	0xFF000000
-#define IOAT_DMA_DESCRIPTOR_CTL_OPCODE_DMA	0x00000000
-
-#define IOAT_DMA_DESCRIPTOR_CTL_CONTEXT_DCA	0x00000001
-#define IOAT_DMA_DESCRIPTOR_CTL_OPCODE_MASK	0xFF000000
-
-#endif
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index 2f052265122..645ca8d54ec 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -31,6 +31,7 @@
 #include <linux/platform_device.h>
 #include <linux/memory.h>
 #include <linux/ioport.h>
+#include <linux/raid/pq.h>
 
 #include <mach/adma.h>
 
@@ -57,65 +58,110 @@ static void iop_adma_free_slots(struct iop_adma_desc_slot *slot)
 	}
 }
 
+static void
+iop_desc_unmap(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc)
+{
+	struct dma_async_tx_descriptor *tx = &desc->async_tx;
+	struct iop_adma_desc_slot *unmap = desc->group_head;
+	struct device *dev = &iop_chan->device->pdev->dev;
+	u32 len = unmap->unmap_len;
+	enum dma_ctrl_flags flags = tx->flags;
+	u32 src_cnt;
+	dma_addr_t addr;
+	dma_addr_t dest;
+
+	src_cnt = unmap->unmap_src_cnt;
+	dest = iop_desc_get_dest_addr(unmap, iop_chan);
+	if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
+		enum dma_data_direction dir;
+
+		if (src_cnt > 1) /* is xor? */
+			dir = DMA_BIDIRECTIONAL;
+		else
+			dir = DMA_FROM_DEVICE;
+
+		dma_unmap_page(dev, dest, len, dir);
+	}
+
+	if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+		while (src_cnt--) {
+			addr = iop_desc_get_src_addr(unmap, iop_chan, src_cnt);
+			if (addr == dest)
+				continue;
+			dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
+		}
+	}
+	desc->group_head = NULL;
+}
+
+static void
+iop_desc_unmap_pq(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc)
+{
+	struct dma_async_tx_descriptor *tx = &desc->async_tx;
+	struct iop_adma_desc_slot *unmap = desc->group_head;
+	struct device *dev = &iop_chan->device->pdev->dev;
+	u32 len = unmap->unmap_len;
+	enum dma_ctrl_flags flags = tx->flags;
+	u32 src_cnt = unmap->unmap_src_cnt;
+	dma_addr_t pdest = iop_desc_get_dest_addr(unmap, iop_chan);
+	dma_addr_t qdest = iop_desc_get_qdest_addr(unmap, iop_chan);
+	int i;
+
+	if (tx->flags & DMA_PREP_CONTINUE)
+		src_cnt -= 3;
+
+	if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP) && !desc->pq_check_result) {
+		dma_unmap_page(dev, pdest, len, DMA_BIDIRECTIONAL);
+		dma_unmap_page(dev, qdest, len, DMA_BIDIRECTIONAL);
+	}
+
+	if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+		dma_addr_t addr;
+
+		for (i = 0; i < src_cnt; i++) {
+			addr = iop_desc_get_src_addr(unmap, iop_chan, i);
+			dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
+		}
+		if (desc->pq_check_result) {
+			dma_unmap_page(dev, pdest, len, DMA_TO_DEVICE);
+			dma_unmap_page(dev, qdest, len, DMA_TO_DEVICE);
+		}
+	}
+
+	desc->group_head = NULL;
+}
+
+
 static dma_cookie_t
 iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc,
 	struct iop_adma_chan *iop_chan, dma_cookie_t cookie)
 {
-	BUG_ON(desc->async_tx.cookie < 0);
-	if (desc->async_tx.cookie > 0) {
-		cookie = desc->async_tx.cookie;
-		desc->async_tx.cookie = 0;
+	struct dma_async_tx_descriptor *tx = &desc->async_tx;
+
+	BUG_ON(tx->cookie < 0);
+	if (tx->cookie > 0) {
+		cookie = tx->cookie;
+		tx->cookie = 0;
 
 		/* call the callback (must not sleep or submit new
 		 * operations to this channel)
 		 */
-		if (desc->async_tx.callback)
-			desc->async_tx.callback(
-				desc->async_tx.callback_param);
+		if (tx->callback)
+			tx->callback(tx->callback_param);
 
 		/* unmap dma addresses
 		 * (unmap_single vs unmap_page?)
 		 */
 		if (desc->group_head && desc->unmap_len) {
-			struct iop_adma_desc_slot *unmap = desc->group_head;
-			struct device *dev =
-				&iop_chan->device->pdev->dev;
-			u32 len = unmap->unmap_len;
-			enum dma_ctrl_flags flags = desc->async_tx.flags;
-			u32 src_cnt;
-			dma_addr_t addr;
-			dma_addr_t dest;
-
-			src_cnt = unmap->unmap_src_cnt;
-			dest = iop_desc_get_dest_addr(unmap, iop_chan);
-			if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-				enum dma_data_direction dir;
-
-				if (src_cnt > 1) /* is xor? */
-					dir = DMA_BIDIRECTIONAL;
-				else
-					dir = DMA_FROM_DEVICE;
-
-				dma_unmap_page(dev, dest, len, dir);
-			}
-
-			if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-				while (src_cnt--) {
-					addr = iop_desc_get_src_addr(unmap,
-								     iop_chan,
-								     src_cnt);
-					if (addr == dest)
-						continue;
-					dma_unmap_page(dev, addr, len,
-						       DMA_TO_DEVICE);
-				}
-			}
-			desc->group_head = NULL;
+			if (iop_desc_is_pq(desc))
+				iop_desc_unmap_pq(iop_chan, desc);
+			else
+				iop_desc_unmap(iop_chan, desc);
 		}
 	}
 
 	/* run dependent operations */
-	dma_run_dependencies(&desc->async_tx);
+	dma_run_dependencies(tx);
 
 	return cookie;
 }
@@ -287,7 +333,12 @@ static void iop_adma_tasklet(unsigned long data)
 {
 	struct iop_adma_chan *iop_chan = (struct iop_adma_chan *) data;
 
-	spin_lock(&iop_chan->lock);
+	/* lockdep will flag depedency submissions as potentially
+	 * recursive locking, this is not the case as a dependency
+	 * submission will never recurse a channels submit routine.
+	 * There are checks in async_tx.c to prevent this.
+	 */
+	spin_lock_nested(&iop_chan->lock, SINGLE_DEPTH_NESTING);
 	__iop_adma_slot_cleanup(iop_chan);
 	spin_unlock(&iop_chan->lock);
 }
@@ -370,7 +421,7 @@ retry:
 			}
 			alloc_tail->group_head = alloc_start;
 			alloc_tail->async_tx.cookie = -EBUSY;
-			list_splice(&chain, &alloc_tail->async_tx.tx_list);
+			list_splice(&chain, &alloc_tail->tx_list);
 			iop_chan->last_used = last_used;
 			iop_desc_clear_next_desc(alloc_start);
 			iop_desc_clear_next_desc(alloc_tail);
@@ -429,7 +480,7 @@ iop_adma_tx_submit(struct dma_async_tx_descriptor *tx)
 
 	old_chain_tail = list_entry(iop_chan->chain.prev,
 		struct iop_adma_desc_slot, chain_node);
-	list_splice_init(&sw_desc->async_tx.tx_list,
+	list_splice_init(&sw_desc->tx_list,
 			 &old_chain_tail->chain_node);
 
 	/* fix up the hardware chain */
@@ -496,6 +547,7 @@ static int iop_adma_alloc_chan_resources(struct dma_chan *chan)
 
 		dma_async_tx_descriptor_init(&slot->async_tx, chan);
 		slot->async_tx.tx_submit = iop_adma_tx_submit;
+		INIT_LIST_HEAD(&slot->tx_list);
 		INIT_LIST_HEAD(&slot->chain_node);
 		INIT_LIST_HEAD(&slot->slot_node);
 		hw_desc = (char *) iop_chan->device->dma_desc_pool;
@@ -660,9 +712,9 @@ iop_adma_prep_dma_xor(struct dma_chan *chan, dma_addr_t dma_dest,
 }
 
 static struct dma_async_tx_descriptor *
-iop_adma_prep_dma_zero_sum(struct dma_chan *chan, dma_addr_t *dma_src,
-			   unsigned int src_cnt, size_t len, u32 *result,
-			   unsigned long flags)
+iop_adma_prep_dma_xor_val(struct dma_chan *chan, dma_addr_t *dma_src,
+			  unsigned int src_cnt, size_t len, u32 *result,
+			  unsigned long flags)
 {
 	struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
 	struct iop_adma_desc_slot *sw_desc, *grp_start;
@@ -696,6 +748,118 @@ iop_adma_prep_dma_zero_sum(struct dma_chan *chan, dma_addr_t *dma_src,
 	return sw_desc ? &sw_desc->async_tx : NULL;
 }
 
+static struct dma_async_tx_descriptor *
+iop_adma_prep_dma_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
+		     unsigned int src_cnt, const unsigned char *scf, size_t len,
+		     unsigned long flags)
+{
+	struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
+	struct iop_adma_desc_slot *sw_desc, *g;
+	int slot_cnt, slots_per_op;
+	int continue_srcs;
+
+	if (unlikely(!len))
+		return NULL;
+	BUG_ON(len > IOP_ADMA_XOR_MAX_BYTE_COUNT);
+
+	dev_dbg(iop_chan->device->common.dev,
+		"%s src_cnt: %d len: %u flags: %lx\n",
+		__func__, src_cnt, len, flags);
+
+	if (dmaf_p_disabled_continue(flags))
+		continue_srcs = 1+src_cnt;
+	else if (dmaf_continue(flags))
+		continue_srcs = 3+src_cnt;
+	else
+		continue_srcs = 0+src_cnt;
+
+	spin_lock_bh(&iop_chan->lock);
+	slot_cnt = iop_chan_pq_slot_count(len, continue_srcs, &slots_per_op);
+	sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
+	if (sw_desc) {
+		int i;
+
+		g = sw_desc->group_head;
+		iop_desc_set_byte_count(g, iop_chan, len);
+
+		/* even if P is disabled its destination address (bits
+		 * [3:0]) must match Q.  It is ok if P points to an
+		 * invalid address, it won't be written.
+		 */
+		if (flags & DMA_PREP_PQ_DISABLE_P)
+			dst[0] = dst[1] & 0x7;
+
+		iop_desc_set_pq_addr(g, dst);
+		sw_desc->unmap_src_cnt = src_cnt;
+		sw_desc->unmap_len = len;
+		sw_desc->async_tx.flags = flags;
+		for (i = 0; i < src_cnt; i++)
+			iop_desc_set_pq_src_addr(g, i, src[i], scf[i]);
+
+		/* if we are continuing a previous operation factor in
+		 * the old p and q values, see the comment for dma_maxpq
+		 * in include/linux/dmaengine.h
+		 */
+		if (dmaf_p_disabled_continue(flags))
+			iop_desc_set_pq_src_addr(g, i++, dst[1], 1);
+		else if (dmaf_continue(flags)) {
+			iop_desc_set_pq_src_addr(g, i++, dst[0], 0);
+			iop_desc_set_pq_src_addr(g, i++, dst[1], 1);
+			iop_desc_set_pq_src_addr(g, i++, dst[1], 0);
+		}
+		iop_desc_init_pq(g, i, flags);
+	}
+	spin_unlock_bh(&iop_chan->lock);
+
+	return sw_desc ? &sw_desc->async_tx : NULL;
+}
+
+static struct dma_async_tx_descriptor *
+iop_adma_prep_dma_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
+			 unsigned int src_cnt, const unsigned char *scf,
+			 size_t len, enum sum_check_flags *pqres,
+			 unsigned long flags)
+{
+	struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
+	struct iop_adma_desc_slot *sw_desc, *g;
+	int slot_cnt, slots_per_op;
+
+	if (unlikely(!len))
+		return NULL;
+	BUG_ON(len > IOP_ADMA_XOR_MAX_BYTE_COUNT);
+
+	dev_dbg(iop_chan->device->common.dev, "%s src_cnt: %d len: %u\n",
+		__func__, src_cnt, len);
+
+	spin_lock_bh(&iop_chan->lock);
+	slot_cnt = iop_chan_pq_zero_sum_slot_count(len, src_cnt + 2, &slots_per_op);
+	sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
+	if (sw_desc) {
+		/* for validate operations p and q are tagged onto the
+		 * end of the source list
+		 */
+		int pq_idx = src_cnt;
+
+		g = sw_desc->group_head;
+		iop_desc_init_pq_zero_sum(g, src_cnt+2, flags);
+		iop_desc_set_pq_zero_sum_byte_count(g, len);
+		g->pq_check_result = pqres;
+		pr_debug("\t%s: g->pq_check_result: %p\n",
+			__func__, g->pq_check_result);
+		sw_desc->unmap_src_cnt = src_cnt+2;
+		sw_desc->unmap_len = len;
+		sw_desc->async_tx.flags = flags;
+		while (src_cnt--)
+			iop_desc_set_pq_zero_sum_src_addr(g, src_cnt,
+							  src[src_cnt],
+							  scf[src_cnt]);
+		iop_desc_set_pq_zero_sum_addr(g, pq_idx, src);
+	}
+	spin_unlock_bh(&iop_chan->lock);
+
+	return sw_desc ? &sw_desc->async_tx : NULL;
+}
+
 static void iop_adma_free_chan_resources(struct dma_chan *chan)
 {
 	struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
@@ -906,7 +1070,7 @@ out:
 
 #define IOP_ADMA_NUM_SRC_TEST 4 /* must be <= 15 */
 static int __devinit
-iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
+iop_adma_xor_val_self_test(struct iop_adma_device *device)
 {
 	int i, src_idx;
 	struct page *dest;
@@ -1002,7 +1166,7 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
 		PAGE_SIZE, DMA_TO_DEVICE);
 
 	/* skip zero sum if the capability is not present */
-	if (!dma_has_cap(DMA_ZERO_SUM, dma_chan->device->cap_mask))
+	if (!dma_has_cap(DMA_XOR_VAL, dma_chan->device->cap_mask))
 		goto free_resources;
 
 	/* zero sum the sources with the destintation page */
@@ -1016,10 +1180,10 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
 		dma_srcs[i] = dma_map_page(dma_chan->device->dev,
 					   zero_sum_srcs[i], 0, PAGE_SIZE,
 					   DMA_TO_DEVICE);
-	tx = iop_adma_prep_dma_zero_sum(dma_chan, dma_srcs,
-					IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE,
-					&zero_sum_result,
-					DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	tx = iop_adma_prep_dma_xor_val(dma_chan, dma_srcs,
+				       IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE,
+				       &zero_sum_result,
+				       DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 
 	cookie = iop_adma_tx_submit(tx);
 	iop_adma_issue_pending(dma_chan);
@@ -1072,10 +1236,10 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
 		dma_srcs[i] = dma_map_page(dma_chan->device->dev,
 					   zero_sum_srcs[i], 0, PAGE_SIZE,
 					   DMA_TO_DEVICE);
-	tx = iop_adma_prep_dma_zero_sum(dma_chan, dma_srcs,
-					IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE,
-					&zero_sum_result,
-					DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	tx = iop_adma_prep_dma_xor_val(dma_chan, dma_srcs,
+				       IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE,
+				       &zero_sum_result,
+				       DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 
 	cookie = iop_adma_tx_submit(tx);
 	iop_adma_issue_pending(dma_chan);
@@ -1105,6 +1269,170 @@ out:
 	return err;
 }
 
+#ifdef CONFIG_MD_RAID6_PQ
+static int __devinit
+iop_adma_pq_zero_sum_self_test(struct iop_adma_device *device)
+{
+	/* combined sources, software pq results, and extra hw pq results */
+	struct page *pq[IOP_ADMA_NUM_SRC_TEST+2+2];
+	/* ptr to the extra hw pq buffers defined above */
+	struct page **pq_hw = &pq[IOP_ADMA_NUM_SRC_TEST+2];
+	/* address conversion buffers (dma_map / page_address) */
+	void *pq_sw[IOP_ADMA_NUM_SRC_TEST+2];
+	dma_addr_t pq_src[IOP_ADMA_NUM_SRC_TEST];
+	dma_addr_t pq_dest[2];
+
+	int i;
+	struct dma_async_tx_descriptor *tx;
+	struct dma_chan *dma_chan;
+	dma_cookie_t cookie;
+	u32 zero_sum_result;
+	int err = 0;
+	struct device *dev;
+
+	dev_dbg(device->common.dev, "%s\n", __func__);
+
+	for (i = 0; i < ARRAY_SIZE(pq); i++) {
+		pq[i] = alloc_page(GFP_KERNEL);
+		if (!pq[i]) {
+			while (i--)
+				__free_page(pq[i]);
+			return -ENOMEM;
+		}
+	}
+
+	/* Fill in src buffers */
+	for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++) {
+		pq_sw[i] = page_address(pq[i]);
+		memset(pq_sw[i], 0x11111111 * (1<<i), PAGE_SIZE);
+	}
+	pq_sw[i] = page_address(pq[i]);
+	pq_sw[i+1] = page_address(pq[i+1]);
+
+	dma_chan = container_of(device->common.channels.next,
+				struct dma_chan,
+				device_node);
+	if (iop_adma_alloc_chan_resources(dma_chan) < 1) {
+		err = -ENODEV;
+		goto out;
+	}
+
+	dev = dma_chan->device->dev;
+
+	/* initialize the dests */
+	memset(page_address(pq_hw[0]), 0 , PAGE_SIZE);
+	memset(page_address(pq_hw[1]), 0 , PAGE_SIZE);
+
+	/* test pq */
+	pq_dest[0] = dma_map_page(dev, pq_hw[0], 0, PAGE_SIZE, DMA_FROM_DEVICE);
+	pq_dest[1] = dma_map_page(dev, pq_hw[1], 0, PAGE_SIZE, DMA_FROM_DEVICE);
+	for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++)
+		pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE,
+					 DMA_TO_DEVICE);
+
+	tx = iop_adma_prep_dma_pq(dma_chan, pq_dest, pq_src,
+				  IOP_ADMA_NUM_SRC_TEST, (u8 *)raid6_gfexp,
+				  PAGE_SIZE,
+				  DMA_PREP_INTERRUPT |
+				  DMA_CTRL_ACK);
+
+	cookie = iop_adma_tx_submit(tx);
+	iop_adma_issue_pending(dma_chan);
+	msleep(8);
+
+	if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) !=
+		DMA_SUCCESS) {
+		dev_err(dev, "Self-test pq timed out, disabling\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	raid6_call.gen_syndrome(IOP_ADMA_NUM_SRC_TEST+2, PAGE_SIZE, pq_sw);
+
+	if (memcmp(pq_sw[IOP_ADMA_NUM_SRC_TEST],
+		   page_address(pq_hw[0]), PAGE_SIZE) != 0) {
+		dev_err(dev, "Self-test p failed compare, disabling\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+	if (memcmp(pq_sw[IOP_ADMA_NUM_SRC_TEST+1],
+		   page_address(pq_hw[1]), PAGE_SIZE) != 0) {
+		dev_err(dev, "Self-test q failed compare, disabling\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	/* test correct zero sum using the software generated pq values */
+	for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 2; i++)
+		pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE,
+					 DMA_TO_DEVICE);
+
+	zero_sum_result = ~0;
+	tx = iop_adma_prep_dma_pq_val(dma_chan, &pq_src[IOP_ADMA_NUM_SRC_TEST],
+				      pq_src, IOP_ADMA_NUM_SRC_TEST,
+				      raid6_gfexp, PAGE_SIZE, &zero_sum_result,
+				      DMA_PREP_INTERRUPT|DMA_CTRL_ACK);
+
+	cookie = iop_adma_tx_submit(tx);
+	iop_adma_issue_pending(dma_chan);
+	msleep(8);
+
+	if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) !=
+		DMA_SUCCESS) {
+		dev_err(dev, "Self-test pq-zero-sum timed out, disabling\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	if (zero_sum_result != 0) {
+		dev_err(dev, "Self-test pq-zero-sum failed to validate: %x\n",
+			zero_sum_result);
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	/* test incorrect zero sum */
+	i = IOP_ADMA_NUM_SRC_TEST;
+	memset(pq_sw[i] + 100, 0, 100);
+	memset(pq_sw[i+1] + 200, 0, 200);
+	for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 2; i++)
+		pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE,
+					 DMA_TO_DEVICE);
+
+	zero_sum_result = 0;
+	tx = iop_adma_prep_dma_pq_val(dma_chan, &pq_src[IOP_ADMA_NUM_SRC_TEST],
+				      pq_src, IOP_ADMA_NUM_SRC_TEST,
+				      raid6_gfexp, PAGE_SIZE, &zero_sum_result,
+				      DMA_PREP_INTERRUPT|DMA_CTRL_ACK);
+
+	cookie = iop_adma_tx_submit(tx);
+	iop_adma_issue_pending(dma_chan);
+	msleep(8);
+
+	if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) !=
+		DMA_SUCCESS) {
+		dev_err(dev, "Self-test !pq-zero-sum timed out, disabling\n");
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+	if (zero_sum_result != (SUM_CHECK_P_RESULT | SUM_CHECK_Q_RESULT)) {
+		dev_err(dev, "Self-test !pq-zero-sum failed to validate: %x\n",
+			zero_sum_result);
+		err = -ENODEV;
+		goto free_resources;
+	}
+
+free_resources:
+	iop_adma_free_chan_resources(dma_chan);
+out:
+	i = ARRAY_SIZE(pq);
+	while (i--)
+		__free_page(pq[i]);
+	return err;
+}
+#endif
+
 static int __devexit iop_adma_remove(struct platform_device *dev)
 {
 	struct iop_adma_device *device = platform_get_drvdata(dev);
@@ -1192,9 +1520,16 @@ static int __devinit iop_adma_probe(struct platform_device *pdev)
 		dma_dev->max_xor = iop_adma_get_max_xor();
 		dma_dev->device_prep_dma_xor = iop_adma_prep_dma_xor;
 	}
-	if (dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask))
-		dma_dev->device_prep_dma_zero_sum =
-			iop_adma_prep_dma_zero_sum;
+	if (dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask))
+		dma_dev->device_prep_dma_xor_val =
+			iop_adma_prep_dma_xor_val;
+	if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) {
+		dma_set_maxpq(dma_dev, iop_adma_get_max_pq(), 0);
+		dma_dev->device_prep_dma_pq = iop_adma_prep_dma_pq;
+	}
+	if (dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask))
+		dma_dev->device_prep_dma_pq_val =
+			iop_adma_prep_dma_pq_val;
 	if (dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask))
 		dma_dev->device_prep_dma_interrupt =
 			iop_adma_prep_dma_interrupt;
@@ -1248,23 +1583,35 @@ static int __devinit iop_adma_probe(struct platform_device *pdev)
 	}
 
 	if (dma_has_cap(DMA_XOR, dma_dev->cap_mask) ||
-		dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) {
-		ret = iop_adma_xor_zero_sum_self_test(adev);
+	    dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) {
+		ret = iop_adma_xor_val_self_test(adev);
 		dev_dbg(&pdev->dev, "xor self test returned %d\n", ret);
 		if (ret)
 			goto err_free_iop_chan;
 	}
 
+	if (dma_has_cap(DMA_PQ, dma_dev->cap_mask) &&
+	    dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask)) {
+		#ifdef CONFIG_MD_RAID6_PQ
+		ret = iop_adma_pq_zero_sum_self_test(adev);
+		dev_dbg(&pdev->dev, "pq self test returned %d\n", ret);
+		#else
+		/* can not test raid6, so do not publish capability */
+		dma_cap_clear(DMA_PQ, dma_dev->cap_mask);
+		dma_cap_clear(DMA_PQ_VAL, dma_dev->cap_mask);
+		ret = 0;
+		#endif
+		if (ret)
+			goto err_free_iop_chan;
+	}
+
 	dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: "
-	  "( %s%s%s%s%s%s%s%s%s%s)\n",
-	  dma_has_cap(DMA_PQ_XOR, dma_dev->cap_mask) ? "pq_xor " : "",
-	  dma_has_cap(DMA_PQ_UPDATE, dma_dev->cap_mask) ? "pq_update " : "",
-	  dma_has_cap(DMA_PQ_ZERO_SUM, dma_dev->cap_mask) ? "pq_zero_sum " : "",
+	  "( %s%s%s%s%s%s%s)\n",
+	  dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "",
+	  dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "",
 	  dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
-	  dma_has_cap(DMA_DUAL_XOR, dma_dev->cap_mask) ? "dual_xor " : "",
-	  dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask) ? "xor_zero_sum " : "",
+	  dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask) ? "xor_val " : "",
 	  dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)  ? "fill " : "",
-	  dma_has_cap(DMA_MEMCPY_CRC32C, dma_dev->cap_mask) ? "cpy+crc " : "",
 	  dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "",
 	  dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "intr " : "");
 
@@ -1296,7 +1643,7 @@ static void iop_chan_start_null_memcpy(struct iop_adma_chan *iop_chan)
 	if (sw_desc) {
 		grp_start = sw_desc->group_head;
 
-		list_splice_init(&sw_desc->async_tx.tx_list, &iop_chan->chain);
+		list_splice_init(&sw_desc->tx_list, &iop_chan->chain);
 		async_tx_ack(&sw_desc->async_tx);
 		iop_desc_init_memcpy(grp_start, 0);
 		iop_desc_set_byte_count(grp_start, iop_chan, 0);
@@ -1352,7 +1699,7 @@ static void iop_chan_start_null_xor(struct iop_adma_chan *iop_chan)
 	sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
 	if (sw_desc) {
 		grp_start = sw_desc->group_head;
-		list_splice_init(&sw_desc->async_tx.tx_list, &iop_chan->chain);
+		list_splice_init(&sw_desc->tx_list, &iop_chan->chain);
 		async_tx_ack(&sw_desc->async_tx);
 		iop_desc_init_null_xor(grp_start, 2, 0);
 		iop_desc_set_byte_count(grp_start, iop_chan, 0);
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
index 9f6fe46a9b8..c0a272c7368 100644
--- a/drivers/dma/iovlock.c
+++ b/drivers/dma/iovlock.c
@@ -183,6 +183,11 @@ dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov,
 					iov_byte_offset,
 					kdata,
 					copy);
+			/* poll for a descriptor slot */
+			if (unlikely(dma_cookie < 0)) {
+				dma_async_issue_pending(chan);
+				continue;
+			}
 
 			len -= copy;
 			iov[iovec_idx].iov_len -= copy;
@@ -248,6 +253,11 @@ dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
 					page,
 					offset,
 					copy);
+			/* poll for a descriptor slot */
+			if (unlikely(dma_cookie < 0)) {
+				dma_async_issue_pending(chan);
+				continue;
+			}
 
 			len -= copy;
 			iov[iovec_idx].iov_len -= copy;
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index 3f23eabe09f..466ab10c1ff 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -517,7 +517,7 @@ retry:
 			}
 			alloc_tail->group_head = alloc_start;
 			alloc_tail->async_tx.cookie = -EBUSY;
-			list_splice(&chain, &alloc_tail->async_tx.tx_list);
+			list_splice(&chain, &alloc_tail->tx_list);
 			mv_chan->last_used = last_used;
 			mv_desc_clear_next_desc(alloc_start);
 			mv_desc_clear_next_desc(alloc_tail);
@@ -565,14 +565,14 @@ mv_xor_tx_submit(struct dma_async_tx_descriptor *tx)
 	cookie = mv_desc_assign_cookie(mv_chan, sw_desc);
 
 	if (list_empty(&mv_chan->chain))
-		list_splice_init(&sw_desc->async_tx.tx_list, &mv_chan->chain);
+		list_splice_init(&sw_desc->tx_list, &mv_chan->chain);
 	else {
 		new_hw_chain = 0;
 
 		old_chain_tail = list_entry(mv_chan->chain.prev,
 					    struct mv_xor_desc_slot,
 					    chain_node);
-		list_splice_init(&grp_start->async_tx.tx_list,
+		list_splice_init(&grp_start->tx_list,
 				 &old_chain_tail->chain_node);
 
 		if (!mv_can_chain(grp_start))
@@ -632,6 +632,7 @@ static int mv_xor_alloc_chan_resources(struct dma_chan *chan)
 		slot->async_tx.tx_submit = mv_xor_tx_submit;
 		INIT_LIST_HEAD(&slot->chain_node);
 		INIT_LIST_HEAD(&slot->slot_node);
+		INIT_LIST_HEAD(&slot->tx_list);
 		hw_desc = (char *) mv_chan->device->dma_desc_pool;
 		slot->async_tx.phys =
 			(dma_addr_t) &hw_desc[idx * MV_XOR_SLOT_SIZE];
diff --git a/drivers/dma/mv_xor.h b/drivers/dma/mv_xor.h
index 06cafe1ef52..977b592e976 100644
--- a/drivers/dma/mv_xor.h
+++ b/drivers/dma/mv_xor.h
@@ -126,9 +126,8 @@ struct mv_xor_chan {
  * @idx: pool index
  * @unmap_src_cnt: number of xor sources
  * @unmap_len: transaction bytecount
+ * @tx_list: list of slots that make up a multi-descriptor transaction
  * @async_tx: support for the async_tx api
- * @group_list: list of slots that make up a multi-descriptor transaction
- *	for example transfer lengths larger than the supported hw max
  * @xor_check_result: result of zero sum
  * @crc32_result: result crc calculation
  */
@@ -145,6 +144,7 @@ struct mv_xor_desc_slot {
 	u16			unmap_src_cnt;
 	u32			value;
 	size_t			unmap_len;
+	struct list_head	tx_list;
 	struct dma_async_tx_descriptor	async_tx;
 	union {
 		u32		*xor_check_result;
diff --git a/drivers/dma/shdma.c b/drivers/dma/shdma.c
new file mode 100644
index 00000000000..b3b065c4e5c
--- /dev/null
+++ b/drivers/dma/shdma.c
@@ -0,0 +1,786 @@
+/*
+ * Renesas SuperH DMA Engine support
+ *
+ * base is drivers/dma/flsdma.c
+ *
+ * Copyright (C) 2009 Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
+ * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved.
+ * Copyright (C) 2007 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * - DMA of SuperH does not have Hardware DMA chain mode.
+ * - MAX DMA size is 16MB.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/platform_device.h>
+#include <cpu/dma.h>
+#include <asm/dma-sh.h>
+#include "shdma.h"
+
+/* DMA descriptor control */
+#define DESC_LAST	(-1)
+#define DESC_COMP	(1)
+#define DESC_NCOMP	(0)
+
+#define NR_DESCS_PER_CHANNEL 32
+/*
+ * Define the default configuration for dual address memory-memory transfer.
+ * The 0x400 value represents auto-request, external->external.
+ *
+ * And this driver set 4byte burst mode.
+ * If you want to change mode, you need to change RS_DEFAULT of value.
+ * (ex 1byte burst mode -> (RS_DUAL & ~TS_32)
+ */
+#define RS_DEFAULT  (RS_DUAL)
+
+#define SH_DMAC_CHAN_BASE(id) (dma_base_addr[id])
+static void sh_dmae_writel(struct sh_dmae_chan *sh_dc, u32 data, u32 reg)
+{
+	ctrl_outl(data, (SH_DMAC_CHAN_BASE(sh_dc->id) + reg));
+}
+
+static u32 sh_dmae_readl(struct sh_dmae_chan *sh_dc, u32 reg)
+{
+	return ctrl_inl((SH_DMAC_CHAN_BASE(sh_dc->id) + reg));
+}
+
+static void dmae_init(struct sh_dmae_chan *sh_chan)
+{
+	u32 chcr = RS_DEFAULT; /* default is DUAL mode */
+	sh_dmae_writel(sh_chan, chcr, CHCR);
+}
+
+/*
+ * Reset DMA controller
+ *
+ * SH7780 has two DMAOR register
+ */
+static void sh_dmae_ctl_stop(int id)
+{
+	unsigned short dmaor = dmaor_read_reg(id);
+
+	dmaor &= ~(DMAOR_NMIF | DMAOR_AE);
+	dmaor_write_reg(id, dmaor);
+}
+
+static int sh_dmae_rst(int id)
+{
+	unsigned short dmaor;
+
+	sh_dmae_ctl_stop(id);
+	dmaor = (dmaor_read_reg(id)|DMAOR_INIT);
+
+	dmaor_write_reg(id, dmaor);
+	if ((dmaor_read_reg(id) & (DMAOR_AE | DMAOR_NMIF))) {
+		pr_warning(KERN_ERR "dma-sh: Can't initialize DMAOR.\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int dmae_is_idle(struct sh_dmae_chan *sh_chan)
+{
+	u32 chcr = sh_dmae_readl(sh_chan, CHCR);
+	if (chcr & CHCR_DE) {
+		if (!(chcr & CHCR_TE))
+			return -EBUSY; /* working */
+	}
+	return 0; /* waiting */
+}
+
+static inline unsigned int calc_xmit_shift(struct sh_dmae_chan *sh_chan)
+{
+	u32 chcr = sh_dmae_readl(sh_chan, CHCR);
+	return ts_shift[(chcr & CHCR_TS_MASK) >> CHCR_TS_SHIFT];
+}
+
+static void dmae_set_reg(struct sh_dmae_chan *sh_chan, struct sh_dmae_regs hw)
+{
+	sh_dmae_writel(sh_chan, hw.sar, SAR);
+	sh_dmae_writel(sh_chan, hw.dar, DAR);
+	sh_dmae_writel(sh_chan,
+		(hw.tcr >> calc_xmit_shift(sh_chan)), TCR);
+}
+
+static void dmae_start(struct sh_dmae_chan *sh_chan)
+{
+	u32 chcr = sh_dmae_readl(sh_chan, CHCR);
+
+	chcr |= (CHCR_DE|CHCR_IE);
+	sh_dmae_writel(sh_chan, chcr, CHCR);
+}
+
+static void dmae_halt(struct sh_dmae_chan *sh_chan)
+{
+	u32 chcr = sh_dmae_readl(sh_chan, CHCR);
+
+	chcr &= ~(CHCR_DE | CHCR_TE | CHCR_IE);
+	sh_dmae_writel(sh_chan, chcr, CHCR);
+}
+
+static int dmae_set_chcr(struct sh_dmae_chan *sh_chan, u32 val)
+{
+	int ret = dmae_is_idle(sh_chan);
+	/* When DMA was working, can not set data to CHCR */
+	if (ret)
+		return ret;
+
+	sh_dmae_writel(sh_chan, val, CHCR);
+	return 0;
+}
+
+#define DMARS1_ADDR	0x04
+#define DMARS2_ADDR	0x08
+#define DMARS_SHIFT 8
+#define DMARS_CHAN_MSK 0x01
+static int dmae_set_dmars(struct sh_dmae_chan *sh_chan, u16 val)
+{
+	u32 addr;
+	int shift = 0;
+	int ret = dmae_is_idle(sh_chan);
+	if (ret)
+		return ret;
+
+	if (sh_chan->id & DMARS_CHAN_MSK)
+		shift = DMARS_SHIFT;
+
+	switch (sh_chan->id) {
+	/* DMARS0 */
+	case 0:
+	case 1:
+		addr = SH_DMARS_BASE;
+		break;
+	/* DMARS1 */
+	case 2:
+	case 3:
+		addr = (SH_DMARS_BASE + DMARS1_ADDR);
+		break;
+	/* DMARS2 */
+	case 4:
+	case 5:
+		addr = (SH_DMARS_BASE + DMARS2_ADDR);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ctrl_outw((val << shift) |
+		(ctrl_inw(addr) & (shift ? 0xFF00 : 0x00FF)),
+		addr);
+
+	return 0;
+}
+
+static dma_cookie_t sh_dmae_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	struct sh_desc *desc = tx_to_sh_desc(tx);
+	struct sh_dmae_chan *sh_chan = to_sh_chan(tx->chan);
+	dma_cookie_t cookie;
+
+	spin_lock_bh(&sh_chan->desc_lock);
+
+	cookie = sh_chan->common.cookie;
+	cookie++;
+	if (cookie < 0)
+		cookie = 1;
+
+	/* If desc only in the case of 1 */
+	if (desc->async_tx.cookie != -EBUSY)
+		desc->async_tx.cookie = cookie;
+	sh_chan->common.cookie = desc->async_tx.cookie;
+
+	list_splice_init(&desc->tx_list, sh_chan->ld_queue.prev);
+
+	spin_unlock_bh(&sh_chan->desc_lock);
+
+	return cookie;
+}
+
+static struct sh_desc *sh_dmae_get_desc(struct sh_dmae_chan *sh_chan)
+{
+	struct sh_desc *desc, *_desc, *ret = NULL;
+
+	spin_lock_bh(&sh_chan->desc_lock);
+	list_for_each_entry_safe(desc, _desc, &sh_chan->ld_free, node) {
+		if (async_tx_test_ack(&desc->async_tx)) {
+			list_del(&desc->node);
+			ret = desc;
+			break;
+		}
+	}
+	spin_unlock_bh(&sh_chan->desc_lock);
+
+	return ret;
+}
+
+static void sh_dmae_put_desc(struct sh_dmae_chan *sh_chan, struct sh_desc *desc)
+{
+	if (desc) {
+		spin_lock_bh(&sh_chan->desc_lock);
+
+		list_splice_init(&desc->tx_list, &sh_chan->ld_free);
+		list_add(&desc->node, &sh_chan->ld_free);
+
+		spin_unlock_bh(&sh_chan->desc_lock);
+	}
+}
+
+static int sh_dmae_alloc_chan_resources(struct dma_chan *chan)
+{
+	struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
+	struct sh_desc *desc;
+
+	spin_lock_bh(&sh_chan->desc_lock);
+	while (sh_chan->descs_allocated < NR_DESCS_PER_CHANNEL) {
+		spin_unlock_bh(&sh_chan->desc_lock);
+		desc = kzalloc(sizeof(struct sh_desc), GFP_KERNEL);
+		if (!desc) {
+			spin_lock_bh(&sh_chan->desc_lock);
+			break;
+		}
+		dma_async_tx_descriptor_init(&desc->async_tx,
+					&sh_chan->common);
+		desc->async_tx.tx_submit = sh_dmae_tx_submit;
+		desc->async_tx.flags = DMA_CTRL_ACK;
+		INIT_LIST_HEAD(&desc->tx_list);
+		sh_dmae_put_desc(sh_chan, desc);
+
+		spin_lock_bh(&sh_chan->desc_lock);
+		sh_chan->descs_allocated++;
+	}
+	spin_unlock_bh(&sh_chan->desc_lock);
+
+	return sh_chan->descs_allocated;
+}
+
+/*
+ * sh_dma_free_chan_resources - Free all resources of the channel.
+ */
+static void sh_dmae_free_chan_resources(struct dma_chan *chan)
+{
+	struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
+	struct sh_desc *desc, *_desc;
+	LIST_HEAD(list);
+
+	BUG_ON(!list_empty(&sh_chan->ld_queue));
+	spin_lock_bh(&sh_chan->desc_lock);
+
+	list_splice_init(&sh_chan->ld_free, &list);
+	sh_chan->descs_allocated = 0;
+
+	spin_unlock_bh(&sh_chan->desc_lock);
+
+	list_for_each_entry_safe(desc, _desc, &list, node)
+		kfree(desc);
+}
+
+static struct dma_async_tx_descriptor *sh_dmae_prep_memcpy(
+	struct dma_chan *chan, dma_addr_t dma_dest, dma_addr_t dma_src,
+	size_t len, unsigned long flags)
+{
+	struct sh_dmae_chan *sh_chan;
+	struct sh_desc *first = NULL, *prev = NULL, *new;
+	size_t copy_size;
+
+	if (!chan)
+		return NULL;
+
+	if (!len)
+		return NULL;
+
+	sh_chan = to_sh_chan(chan);
+
+	do {
+		/* Allocate the link descriptor from DMA pool */
+		new = sh_dmae_get_desc(sh_chan);
+		if (!new) {
+			dev_err(sh_chan->dev,
+					"No free memory for link descriptor\n");
+			goto err_get_desc;
+		}
+
+		copy_size = min(len, (size_t)SH_DMA_TCR_MAX);
+
+		new->hw.sar = dma_src;
+		new->hw.dar = dma_dest;
+		new->hw.tcr = copy_size;
+		if (!first)
+			first = new;
+
+		new->mark = DESC_NCOMP;
+		async_tx_ack(&new->async_tx);
+
+		prev = new;
+		len -= copy_size;
+		dma_src += copy_size;
+		dma_dest += copy_size;
+		/* Insert the link descriptor to the LD ring */
+		list_add_tail(&new->node, &first->tx_list);
+	} while (len);
+
+	new->async_tx.flags = flags; /* client is in control of this ack */
+	new->async_tx.cookie = -EBUSY; /* Last desc */
+
+	return &first->async_tx;
+
+err_get_desc:
+	sh_dmae_put_desc(sh_chan, first);
+	return NULL;
+
+}
+
+/*
+ * sh_chan_ld_cleanup - Clean up link descriptors
+ *
+ * This function clean up the ld_queue of DMA channel.
+ */
+static void sh_dmae_chan_ld_cleanup(struct sh_dmae_chan *sh_chan)
+{
+	struct sh_desc *desc, *_desc;
+
+	spin_lock_bh(&sh_chan->desc_lock);
+	list_for_each_entry_safe(desc, _desc, &sh_chan->ld_queue, node) {
+		dma_async_tx_callback callback;
+		void *callback_param;
+
+		/* non send data */
+		if (desc->mark == DESC_NCOMP)
+			break;
+
+		/* send data sesc */
+		callback = desc->async_tx.callback;
+		callback_param = desc->async_tx.callback_param;
+
+		/* Remove from ld_queue list */
+		list_splice_init(&desc->tx_list, &sh_chan->ld_free);
+
+		dev_dbg(sh_chan->dev, "link descriptor %p will be recycle.\n",
+				desc);
+
+		list_move(&desc->node, &sh_chan->ld_free);
+		/* Run the link descriptor callback function */
+		if (callback) {
+			spin_unlock_bh(&sh_chan->desc_lock);
+			dev_dbg(sh_chan->dev, "link descriptor %p callback\n",
+					desc);
+			callback(callback_param);
+			spin_lock_bh(&sh_chan->desc_lock);
+		}
+	}
+	spin_unlock_bh(&sh_chan->desc_lock);
+}
+
+static void sh_chan_xfer_ld_queue(struct sh_dmae_chan *sh_chan)
+{
+	struct list_head *ld_node;
+	struct sh_dmae_regs hw;
+
+	/* DMA work check */
+	if (dmae_is_idle(sh_chan))
+		return;
+
+	/* Find the first un-transfer desciptor */
+	for (ld_node = sh_chan->ld_queue.next;
+		(ld_node != &sh_chan->ld_queue)
+			&& (to_sh_desc(ld_node)->mark == DESC_COMP);
+		ld_node = ld_node->next)
+		cpu_relax();
+
+	if (ld_node != &sh_chan->ld_queue) {
+		/* Get the ld start address from ld_queue */
+		hw = to_sh_desc(ld_node)->hw;
+		dmae_set_reg(sh_chan, hw);
+		dmae_start(sh_chan);
+	}
+}
+
+static void sh_dmae_memcpy_issue_pending(struct dma_chan *chan)
+{
+	struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
+	sh_chan_xfer_ld_queue(sh_chan);
+}
+
+static enum dma_status sh_dmae_is_complete(struct dma_chan *chan,
+					dma_cookie_t cookie,
+					dma_cookie_t *done,
+					dma_cookie_t *used)
+{
+	struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
+	dma_cookie_t last_used;
+	dma_cookie_t last_complete;
+
+	sh_dmae_chan_ld_cleanup(sh_chan);
+
+	last_used = chan->cookie;
+	last_complete = sh_chan->completed_cookie;
+	if (last_complete == -EBUSY)
+		last_complete = last_used;
+
+	if (done)
+		*done = last_complete;
+
+	if (used)
+		*used = last_used;
+
+	return dma_async_is_complete(cookie, last_complete, last_used);
+}
+
+static irqreturn_t sh_dmae_interrupt(int irq, void *data)
+{
+	irqreturn_t ret = IRQ_NONE;
+	struct sh_dmae_chan *sh_chan = (struct sh_dmae_chan *)data;
+	u32 chcr = sh_dmae_readl(sh_chan, CHCR);
+
+	if (chcr & CHCR_TE) {
+		/* DMA stop */
+		dmae_halt(sh_chan);
+
+		ret = IRQ_HANDLED;
+		tasklet_schedule(&sh_chan->tasklet);
+	}
+
+	return ret;
+}
+
+#if defined(CONFIG_CPU_SH4)
+static irqreturn_t sh_dmae_err(int irq, void *data)
+{
+	int err = 0;
+	struct sh_dmae_device *shdev = (struct sh_dmae_device *)data;
+
+	/* IRQ Multi */
+	if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
+		int cnt = 0;
+		switch (irq) {
+#if defined(DMTE6_IRQ) && defined(DMAE1_IRQ)
+		case DMTE6_IRQ:
+			cnt++;
+#endif
+		case DMTE0_IRQ:
+			if (dmaor_read_reg(cnt) & (DMAOR_NMIF | DMAOR_AE)) {
+				disable_irq(irq);
+				return IRQ_HANDLED;
+			}
+		default:
+			return IRQ_NONE;
+		}
+	} else {
+		/* reset dma controller */
+		err = sh_dmae_rst(0);
+		if (err)
+			return err;
+		if (shdev->pdata.mode & SHDMA_DMAOR1) {
+			err = sh_dmae_rst(1);
+			if (err)
+				return err;
+		}
+		disable_irq(irq);
+		return IRQ_HANDLED;
+	}
+}
+#endif
+
+static void dmae_do_tasklet(unsigned long data)
+{
+	struct sh_dmae_chan *sh_chan = (struct sh_dmae_chan *)data;
+	struct sh_desc *desc, *_desc, *cur_desc = NULL;
+	u32 sar_buf = sh_dmae_readl(sh_chan, SAR);
+	list_for_each_entry_safe(desc, _desc,
+					&sh_chan->ld_queue, node) {
+		if ((desc->hw.sar + desc->hw.tcr) == sar_buf) {
+			cur_desc = desc;
+			break;
+		}
+	}
+
+	if (cur_desc) {
+		switch (cur_desc->async_tx.cookie) {
+		case 0: /* other desc data */
+			break;
+		case -EBUSY: /* last desc */
+		sh_chan->completed_cookie =
+				cur_desc->async_tx.cookie;
+			break;
+		default: /* first desc ( 0 < )*/
+			sh_chan->completed_cookie =
+				cur_desc->async_tx.cookie - 1;
+			break;
+		}
+		cur_desc->mark = DESC_COMP;
+	}
+	/* Next desc */
+	sh_chan_xfer_ld_queue(sh_chan);
+	sh_dmae_chan_ld_cleanup(sh_chan);
+}
+
+static unsigned int get_dmae_irq(unsigned int id)
+{
+	unsigned int irq = 0;
+	if (id < ARRAY_SIZE(dmte_irq_map))
+		irq = dmte_irq_map[id];
+	return irq;
+}
+
+static int __devinit sh_dmae_chan_probe(struct sh_dmae_device *shdev, int id)
+{
+	int err;
+	unsigned int irq = get_dmae_irq(id);
+	unsigned long irqflags = IRQF_DISABLED;
+	struct sh_dmae_chan *new_sh_chan;
+
+	/* alloc channel */
+	new_sh_chan = kzalloc(sizeof(struct sh_dmae_chan), GFP_KERNEL);
+	if (!new_sh_chan) {
+		dev_err(shdev->common.dev, "No free memory for allocating "
+				"dma channels!\n");
+		return -ENOMEM;
+	}
+
+	new_sh_chan->dev = shdev->common.dev;
+	new_sh_chan->id = id;
+
+	/* Init DMA tasklet */
+	tasklet_init(&new_sh_chan->tasklet, dmae_do_tasklet,
+			(unsigned long)new_sh_chan);
+
+	/* Init the channel */
+	dmae_init(new_sh_chan);
+
+	spin_lock_init(&new_sh_chan->desc_lock);
+
+	/* Init descripter manage list */
+	INIT_LIST_HEAD(&new_sh_chan->ld_queue);
+	INIT_LIST_HEAD(&new_sh_chan->ld_free);
+
+	/* copy struct dma_device */
+	new_sh_chan->common.device = &shdev->common;
+
+	/* Add the channel to DMA device channel list */
+	list_add_tail(&new_sh_chan->common.device_node,
+			&shdev->common.channels);
+	shdev->common.chancnt++;
+
+	if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
+		irqflags = IRQF_SHARED;
+#if defined(DMTE6_IRQ)
+		if (irq >= DMTE6_IRQ)
+			irq = DMTE6_IRQ;
+		else
+#endif
+			irq = DMTE0_IRQ;
+	}
+
+	snprintf(new_sh_chan->dev_id, sizeof(new_sh_chan->dev_id),
+			"sh-dmae%d", new_sh_chan->id);
+
+	/* set up channel irq */
+	err = request_irq(irq, &sh_dmae_interrupt,
+		irqflags, new_sh_chan->dev_id, new_sh_chan);
+	if (err) {
+		dev_err(shdev->common.dev, "DMA channel %d request_irq error "
+			"with return %d\n", id, err);
+		goto err_no_irq;
+	}
+
+	/* CHCR register control function */
+	new_sh_chan->set_chcr = dmae_set_chcr;
+	/* DMARS register control function */
+	new_sh_chan->set_dmars = dmae_set_dmars;
+
+	shdev->chan[id] = new_sh_chan;
+	return 0;
+
+err_no_irq:
+	/* remove from dmaengine device node */
+	list_del(&new_sh_chan->common.device_node);
+	kfree(new_sh_chan);
+	return err;
+}
+
+static void sh_dmae_chan_remove(struct sh_dmae_device *shdev)
+{
+	int i;
+
+	for (i = shdev->common.chancnt - 1 ; i >= 0 ; i--) {
+		if (shdev->chan[i]) {
+			struct sh_dmae_chan *shchan = shdev->chan[i];
+			if (!(shdev->pdata.mode & SHDMA_MIX_IRQ))
+				free_irq(dmte_irq_map[i], shchan);
+
+			list_del(&shchan->common.device_node);
+			kfree(shchan);
+			shdev->chan[i] = NULL;
+		}
+	}
+	shdev->common.chancnt = 0;
+}
+
+static int __init sh_dmae_probe(struct platform_device *pdev)
+{
+	int err = 0, cnt, ecnt;
+	unsigned long irqflags = IRQF_DISABLED;
+#if defined(CONFIG_CPU_SH4)
+	int eirq[] = { DMAE0_IRQ,
+#if defined(DMAE1_IRQ)
+			DMAE1_IRQ
+#endif
+		};
+#endif
+	struct sh_dmae_device *shdev;
+
+	shdev = kzalloc(sizeof(struct sh_dmae_device), GFP_KERNEL);
+	if (!shdev) {
+		dev_err(&pdev->dev, "No enough memory\n");
+		err = -ENOMEM;
+		goto shdev_err;
+	}
+
+	/* get platform data */
+	if (!pdev->dev.platform_data)
+		goto shdev_err;
+
+	/* platform data */
+	memcpy(&shdev->pdata, pdev->dev.platform_data,
+			sizeof(struct sh_dmae_pdata));
+
+	/* reset dma controller */
+	err = sh_dmae_rst(0);
+	if (err)
+		goto rst_err;
+
+	/* SH7780/85/23 has DMAOR1 */
+	if (shdev->pdata.mode & SHDMA_DMAOR1) {
+		err = sh_dmae_rst(1);
+		if (err)
+			goto rst_err;
+	}
+
+	INIT_LIST_HEAD(&shdev->common.channels);
+
+	dma_cap_set(DMA_MEMCPY, shdev->common.cap_mask);
+	shdev->common.device_alloc_chan_resources
+		= sh_dmae_alloc_chan_resources;
+	shdev->common.device_free_chan_resources = sh_dmae_free_chan_resources;
+	shdev->common.device_prep_dma_memcpy = sh_dmae_prep_memcpy;
+	shdev->common.device_is_tx_complete = sh_dmae_is_complete;
+	shdev->common.device_issue_pending = sh_dmae_memcpy_issue_pending;
+	shdev->common.dev = &pdev->dev;
+
+#if defined(CONFIG_CPU_SH4)
+	/* Non Mix IRQ mode SH7722/SH7730 etc... */
+	if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
+		irqflags = IRQF_SHARED;
+		eirq[0] = DMTE0_IRQ;
+#if defined(DMTE6_IRQ) && defined(DMAE1_IRQ)
+		eirq[1] = DMTE6_IRQ;
+#endif
+	}
+
+	for (ecnt = 0 ; ecnt < ARRAY_SIZE(eirq); ecnt++) {
+		err = request_irq(eirq[ecnt], sh_dmae_err,
+			irqflags, "DMAC Address Error", shdev);
+		if (err) {
+			dev_err(&pdev->dev, "DMA device request_irq"
+				"error (irq %d) with return %d\n",
+				eirq[ecnt], err);
+			goto eirq_err;
+		}
+	}
+#endif /* CONFIG_CPU_SH4 */
+
+	/* Create DMA Channel */
+	for (cnt = 0 ; cnt < MAX_DMA_CHANNELS ; cnt++) {
+		err = sh_dmae_chan_probe(shdev, cnt);
+		if (err)
+			goto chan_probe_err;
+	}
+
+	platform_set_drvdata(pdev, shdev);
+	dma_async_device_register(&shdev->common);
+
+	return err;
+
+chan_probe_err:
+	sh_dmae_chan_remove(shdev);
+
+eirq_err:
+	for (ecnt-- ; ecnt >= 0; ecnt--)
+		free_irq(eirq[ecnt], shdev);
+
+rst_err:
+	kfree(shdev);
+
+shdev_err:
+	return err;
+}
+
+static int __exit sh_dmae_remove(struct platform_device *pdev)
+{
+	struct sh_dmae_device *shdev = platform_get_drvdata(pdev);
+
+	dma_async_device_unregister(&shdev->common);
+
+	if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
+		free_irq(DMTE0_IRQ, shdev);
+#if defined(DMTE6_IRQ)
+		free_irq(DMTE6_IRQ, shdev);
+#endif
+	}
+
+	/* channel data remove */
+	sh_dmae_chan_remove(shdev);
+
+	if (!(shdev->pdata.mode & SHDMA_MIX_IRQ)) {
+		free_irq(DMAE0_IRQ, shdev);
+#if defined(DMAE1_IRQ)
+		free_irq(DMAE1_IRQ, shdev);
+#endif
+	}
+	kfree(shdev);
+
+	return 0;
+}
+
+static void sh_dmae_shutdown(struct platform_device *pdev)
+{
+	struct sh_dmae_device *shdev = platform_get_drvdata(pdev);
+	sh_dmae_ctl_stop(0);
+	if (shdev->pdata.mode & SHDMA_DMAOR1)
+		sh_dmae_ctl_stop(1);
+}
+
+static struct platform_driver sh_dmae_driver = {
+	.remove		= __exit_p(sh_dmae_remove),
+	.shutdown	= sh_dmae_shutdown,
+	.driver = {
+		.name	= "sh-dma-engine",
+	},
+};
+
+static int __init sh_dmae_init(void)
+{
+	return platform_driver_probe(&sh_dmae_driver, sh_dmae_probe);
+}
+module_init(sh_dmae_init);
+
+static void __exit sh_dmae_exit(void)
+{
+	platform_driver_unregister(&sh_dmae_driver);
+}
+module_exit(sh_dmae_exit);
+
+MODULE_AUTHOR("Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>");
+MODULE_DESCRIPTION("Renesas SH DMA Engine driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/dma/shdma.h b/drivers/dma/shdma.h
new file mode 100644
index 00000000000..2b4bc15a2c0
--- /dev/null
+++ b/drivers/dma/shdma.h
@@ -0,0 +1,64 @@
+/*
+ * Renesas SuperH DMA Engine support
+ *
+ * Copyright (C) 2009 Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
+ * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved.
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+#ifndef __DMA_SHDMA_H
+#define __DMA_SHDMA_H
+
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/dmaengine.h>
+
+#define SH_DMA_TCR_MAX 0x00FFFFFF	/* 16MB */
+
+struct sh_dmae_regs {
+	u32 sar; /* SAR / source address */
+	u32 dar; /* DAR / destination address */
+	u32 tcr; /* TCR / transfer count */
+};
+
+struct sh_desc {
+	struct list_head tx_list;
+	struct sh_dmae_regs hw;
+	struct list_head node;
+	struct dma_async_tx_descriptor async_tx;
+	int mark;
+};
+
+struct sh_dmae_chan {
+	dma_cookie_t completed_cookie;	/* The maximum cookie completed */
+	spinlock_t desc_lock;			/* Descriptor operation lock */
+	struct list_head ld_queue;		/* Link descriptors queue */
+	struct list_head ld_free;		/* Link descriptors free */
+	struct dma_chan common;			/* DMA common channel */
+	struct device *dev;				/* Channel device */
+	struct tasklet_struct tasklet;	/* Tasklet */
+	int descs_allocated;			/* desc count */
+	int id;				/* Raw id of this channel */
+	char dev_id[16];	/* unique name per DMAC of channel */
+
+	/* Set chcr */
+	int (*set_chcr)(struct sh_dmae_chan *sh_chan, u32 regs);
+	/* Set DMA resource */
+	int (*set_dmars)(struct sh_dmae_chan *sh_chan, u16 res);
+};
+
+struct sh_dmae_device {
+	struct dma_device common;
+	struct sh_dmae_chan *chan[MAX_DMA_CHANNELS];
+	struct sh_dmae_pdata pdata;
+};
+
+#define to_sh_chan(chan) container_of(chan, struct sh_dmae_chan, common)
+#define to_sh_desc(lh) container_of(lh, struct sh_desc, node)
+#define tx_to_sh_desc(tx) container_of(tx, struct sh_desc, async_tx)
+
+#endif	/* __DMA_SHDMA_H */
diff --git a/drivers/dma/txx9dmac.c b/drivers/dma/txx9dmac.c
index 7837930146a..fb6bb64e886 100644
--- a/drivers/dma/txx9dmac.c
+++ b/drivers/dma/txx9dmac.c
@@ -180,9 +180,8 @@ static struct txx9dmac_desc *txx9dmac_first_queued(struct txx9dmac_chan *dc)
 
 static struct txx9dmac_desc *txx9dmac_last_child(struct txx9dmac_desc *desc)
 {
-	if (!list_empty(&desc->txd.tx_list))
-		desc = list_entry(desc->txd.tx_list.prev,
-				  struct txx9dmac_desc, desc_node);
+	if (!list_empty(&desc->tx_list))
+		desc = list_entry(desc->tx_list.prev, typeof(*desc), desc_node);
 	return desc;
 }
 
@@ -197,6 +196,7 @@ static struct txx9dmac_desc *txx9dmac_desc_alloc(struct txx9dmac_chan *dc,
 	desc = kzalloc(sizeof(*desc), flags);
 	if (!desc)
 		return NULL;
+	INIT_LIST_HEAD(&desc->tx_list);
 	dma_async_tx_descriptor_init(&desc->txd, &dc->chan);
 	desc->txd.tx_submit = txx9dmac_tx_submit;
 	/* txd.flags will be overwritten in prep funcs */
@@ -245,7 +245,7 @@ static void txx9dmac_sync_desc_for_cpu(struct txx9dmac_chan *dc,
 	struct txx9dmac_dev *ddev = dc->ddev;
 	struct txx9dmac_desc *child;
 
-	list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+	list_for_each_entry(child, &desc->tx_list, desc_node)
 		dma_sync_single_for_cpu(chan2parent(&dc->chan),
 				child->txd.phys, ddev->descsize,
 				DMA_TO_DEVICE);
@@ -267,11 +267,11 @@ static void txx9dmac_desc_put(struct txx9dmac_chan *dc,
 		txx9dmac_sync_desc_for_cpu(dc, desc);
 
 		spin_lock_bh(&dc->lock);
-		list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+		list_for_each_entry(child, &desc->tx_list, desc_node)
 			dev_vdbg(chan2dev(&dc->chan),
 				 "moving child desc %p to freelist\n",
 				 child);
-		list_splice_init(&desc->txd.tx_list, &dc->free_list);
+		list_splice_init(&desc->tx_list, &dc->free_list);
 		dev_vdbg(chan2dev(&dc->chan), "moving desc %p to freelist\n",
 			 desc);
 		list_add(&desc->desc_node, &dc->free_list);
@@ -429,7 +429,7 @@ txx9dmac_descriptor_complete(struct txx9dmac_chan *dc,
 	param = txd->callback_param;
 
 	txx9dmac_sync_desc_for_cpu(dc, desc);
-	list_splice_init(&txd->tx_list, &dc->free_list);
+	list_splice_init(&desc->tx_list, &dc->free_list);
 	list_move(&desc->desc_node, &dc->free_list);
 
 	if (!ds) {
@@ -571,7 +571,7 @@ static void txx9dmac_handle_error(struct txx9dmac_chan *dc, u32 csr)
 		 "Bad descriptor submitted for DMA! (cookie: %d)\n",
 		 bad_desc->txd.cookie);
 	txx9dmac_dump_desc(dc, &bad_desc->hwdesc);
-	list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node)
+	list_for_each_entry(child, &bad_desc->tx_list, desc_node)
 		txx9dmac_dump_desc(dc, &child->hwdesc);
 	/* Pretend the descriptor completed successfully */
 	txx9dmac_descriptor_complete(dc, bad_desc);
@@ -613,7 +613,7 @@ static void txx9dmac_scan_descriptors(struct txx9dmac_chan *dc)
 			return;
 		}
 
-		list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+		list_for_each_entry(child, &desc->tx_list, desc_node)
 			if (desc_read_CHAR(dc, child) == chain) {
 				/* Currently in progress */
 				if (csr & TXX9_DMA_CSR_ABCHC)
@@ -823,8 +823,7 @@ txx9dmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 			dma_sync_single_for_device(chan2parent(&dc->chan),
 					prev->txd.phys, ddev->descsize,
 					DMA_TO_DEVICE);
-			list_add_tail(&desc->desc_node,
-					&first->txd.tx_list);
+			list_add_tail(&desc->desc_node, &first->tx_list);
 		}
 		prev = desc;
 	}
@@ -919,8 +918,7 @@ txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 					prev->txd.phys,
 					ddev->descsize,
 					DMA_TO_DEVICE);
-			list_add_tail(&desc->desc_node,
-					&first->txd.tx_list);
+			list_add_tail(&desc->desc_node, &first->tx_list);
 		}
 		prev = desc;
 	}
diff --git a/drivers/dma/txx9dmac.h b/drivers/dma/txx9dmac.h
index c907ff01d27..365d42366b9 100644
--- a/drivers/dma/txx9dmac.h
+++ b/drivers/dma/txx9dmac.h
@@ -231,6 +231,7 @@ struct txx9dmac_desc {
 
 	/* THEN values for driver housekeeping */
 	struct list_head		desc_node ____cacheline_aligned;
+	struct list_head		tx_list;
 	struct dma_async_tx_descriptor	txd;
 	size_t				len;
 };
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index a3ca18e2d7c..02127e59fe8 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -133,6 +133,13 @@ config EDAC_I3000
 	  Support for error detection and correction on the Intel
 	  3000 and 3010 server chipsets.
 
+config EDAC_I3200
+	tristate "Intel 3200"
+	depends on EDAC_MM_EDAC && PCI && X86 && EXPERIMENTAL
+	help
+	  Support for error detection and correction on the Intel
+	  3200 and 3210 server chipsets.
+
 config EDAC_X38
 	tristate "Intel X38"
 	depends on EDAC_MM_EDAC && PCI && X86
@@ -176,11 +183,11 @@ config EDAC_I5100
 	  San Clemente MCH.
 
 config EDAC_MPC85XX
-	tristate "Freescale MPC85xx"
-	depends on EDAC_MM_EDAC && FSL_SOC && MPC85xx
+	tristate "Freescale MPC83xx / MPC85xx"
+	depends on EDAC_MM_EDAC && FSL_SOC && (PPC_83xx || MPC85xx)
 	help
 	  Support for error detection and correction on the Freescale
-	  MPC8560, MPC8540, MPC8548
+	  MPC8349, MPC8560, MPC8540, MPC8548
 
 config EDAC_MV64X60
 	tristate "Marvell MV64x60"
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index cfa033ce53a..7a473bbe8ab 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_EDAC_I82443BXGX)		+= i82443bxgx_edac.o
 obj-$(CONFIG_EDAC_I82875P)		+= i82875p_edac.o
 obj-$(CONFIG_EDAC_I82975X)		+= i82975x_edac.o
 obj-$(CONFIG_EDAC_I3000)		+= i3000_edac.o
+obj-$(CONFIG_EDAC_I3200)		+= i3200_edac.o
 obj-$(CONFIG_EDAC_X38)			+= x38_edac.o
 obj-$(CONFIG_EDAC_I82860)		+= i82860_edac.o
 obj-$(CONFIG_EDAC_R82600)		+= r82600_edac.o
@@ -49,3 +50,4 @@ obj-$(CONFIG_EDAC_CELL)			+= cell_edac.o
 obj-$(CONFIG_EDAC_PPC4XX)		+= ppc4xx_edac.o
 obj-$(CONFIG_EDAC_AMD8111)		+= amd8111_edac.o
 obj-$(CONFIG_EDAC_AMD8131)		+= amd8131_edac.o
+
diff --git a/drivers/edac/cpc925_edac.c b/drivers/edac/cpc925_edac.c
index 8c54196b5ab..3d50274f134 100644
--- a/drivers/edac/cpc925_edac.c
+++ b/drivers/edac/cpc925_edac.c
@@ -885,14 +885,14 @@ static int __devinit cpc925_probe(struct platform_device *pdev)
 
 	if (!devm_request_mem_region(&pdev->dev,
 				     r->start,
-				     r->end - r->start + 1,
+				     resource_size(r),
 				     pdev->name)) {
 		cpc925_printk(KERN_ERR, "Unable to request mem region\n");
 		res = -EBUSY;
 		goto err1;
 	}
 
-	vbase = devm_ioremap(&pdev->dev, r->start, r->end - r->start + 1);
+	vbase = devm_ioremap(&pdev->dev, r->start, resource_size(r));
 	if (!vbase) {
 		cpc925_printk(KERN_ERR, "Unable to ioremap device\n");
 		res = -ENOMEM;
@@ -953,7 +953,7 @@ err3:
 	cpc925_mc_exit(mci);
 	edac_mc_free(mci);
 err2:
-	devm_release_mem_region(&pdev->dev, r->start, r->end-r->start+1);
+	devm_release_mem_region(&pdev->dev, r->start, resource_size(r));
 err1:
 	devres_release_group(&pdev->dev, cpc925_probe);
 out:
diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index b02a6a69a8f..d5e13c94714 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -356,7 +356,6 @@ static void complete_edac_device_list_del(struct rcu_head *head)
 
 	edac_dev = container_of(head, struct edac_device_ctl_info, rcu);
 	INIT_LIST_HEAD(&edac_dev->link);
-	complete(&edac_dev->removal_complete);
 }
 
 /*
@@ -369,10 +368,8 @@ static void del_edac_device_from_global_list(struct edac_device_ctl_info
 						*edac_device)
 {
 	list_del_rcu(&edac_device->link);
-
-	init_completion(&edac_device->removal_complete);
 	call_rcu(&edac_device->rcu, complete_edac_device_list_del);
-	wait_for_completion(&edac_device->removal_complete);
+	rcu_barrier();
 }
 
 /*
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 335b7ebdb11..b629c41756f 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -418,16 +418,14 @@ static void complete_mc_list_del(struct rcu_head *head)
 
 	mci = container_of(head, struct mem_ctl_info, rcu);
 	INIT_LIST_HEAD(&mci->link);
-	complete(&mci->complete);
 }
 
 static void del_mc_from_global_list(struct mem_ctl_info *mci)
 {
 	atomic_dec(&edac_handlers);
 	list_del_rcu(&mci->link);
-	init_completion(&mci->complete);
 	call_rcu(&mci->rcu, complete_mc_list_del);
-	wait_for_completion(&mci->complete);
+	rcu_barrier();
 }
 
 /**
diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c
index 30b585b1d60..efb5d565078 100644
--- a/drivers/edac/edac_pci.c
+++ b/drivers/edac/edac_pci.c
@@ -174,7 +174,6 @@ static void complete_edac_pci_list_del(struct rcu_head *head)
 
 	pci = container_of(head, struct edac_pci_ctl_info, rcu);
 	INIT_LIST_HEAD(&pci->link);
-	complete(&pci->complete);
 }
 
 /*
@@ -185,9 +184,8 @@ static void complete_edac_pci_list_del(struct rcu_head *head)
 static void del_edac_pci_from_global_list(struct edac_pci_ctl_info *pci)
 {
 	list_del_rcu(&pci->link);
-	init_completion(&pci->complete);
 	call_rcu(&pci->rcu, complete_edac_pci_list_del);
-	wait_for_completion(&pci->complete);
+	rcu_barrier();
 }
 
 #if 0
diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c
new file mode 100644
index 00000000000..fde4db91c4d
--- /dev/null
+++ b/drivers/edac/i3200_edac.c
@@ -0,0 +1,527 @@
+/*
+ * Intel 3200/3210 Memory Controller kernel module
+ * Copyright (C) 2008-2009 Akamai Technologies, Inc.
+ * Portions by Hitoshi Mitake <h.mitake@gmail.com>.
+ *
+ * This file may be distributed under the terms of the
+ * GNU General Public License.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/slab.h>
+#include <linux/edac.h>
+#include <linux/io.h>
+#include "edac_core.h"
+
+#define I3200_REVISION        "1.1"
+
+#define EDAC_MOD_STR        "i3200_edac"
+
+#define PCI_DEVICE_ID_INTEL_3200_HB    0x29f0
+
+#define I3200_RANKS		8
+#define I3200_RANKS_PER_CHANNEL	4
+#define I3200_CHANNELS		2
+
+/* Intel 3200 register addresses - device 0 function 0 - DRAM Controller */
+
+#define I3200_MCHBAR_LOW	0x48	/* MCH Memory Mapped Register BAR */
+#define I3200_MCHBAR_HIGH	0x4c
+#define I3200_MCHBAR_MASK	0xfffffc000ULL	/* bits 35:14 */
+#define I3200_MMR_WINDOW_SIZE	16384
+
+#define I3200_TOM		0xa0	/* Top of Memory (16b)
+		 *
+		 * 15:10 reserved
+		 *  9:0  total populated physical memory
+		 */
+#define I3200_TOM_MASK		0x3ff	/* bits 9:0 */
+#define I3200_TOM_SHIFT		26	/* 64MiB grain */
+
+#define I3200_ERRSTS		0xc8	/* Error Status Register (16b)
+		 *
+		 * 15    reserved
+		 * 14    Isochronous TBWRR Run Behind FIFO Full
+		 *       (ITCV)
+		 * 13    Isochronous TBWRR Run Behind FIFO Put
+		 *       (ITSTV)
+		 * 12    reserved
+		 * 11    MCH Thermal Sensor Event
+		 *       for SMI/SCI/SERR (GTSE)
+		 * 10    reserved
+		 *  9    LOCK to non-DRAM Memory Flag (LCKF)
+		 *  8    reserved
+		 *  7    DRAM Throttle Flag (DTF)
+		 *  6:2  reserved
+		 *  1    Multi-bit DRAM ECC Error Flag (DMERR)
+		 *  0    Single-bit DRAM ECC Error Flag (DSERR)
+		 */
+#define I3200_ERRSTS_UE		0x0002
+#define I3200_ERRSTS_CE		0x0001
+#define I3200_ERRSTS_BITS	(I3200_ERRSTS_UE | I3200_ERRSTS_CE)
+
+
+/* Intel  MMIO register space - device 0 function 0 - MMR space */
+
+#define I3200_C0DRB	0x200	/* Channel 0 DRAM Rank Boundary (16b x 4)
+		 *
+		 * 15:10 reserved
+		 *  9:0  Channel 0 DRAM Rank Boundary Address
+		 */
+#define I3200_C1DRB	0x600	/* Channel 1 DRAM Rank Boundary (16b x 4) */
+#define I3200_DRB_MASK	0x3ff	/* bits 9:0 */
+#define I3200_DRB_SHIFT	26	/* 64MiB grain */
+
+#define I3200_C0ECCERRLOG	0x280	/* Channel 0 ECC Error Log (64b)
+		 *
+		 * 63:48 Error Column Address (ERRCOL)
+		 * 47:32 Error Row Address (ERRROW)
+		 * 31:29 Error Bank Address (ERRBANK)
+		 * 28:27 Error Rank Address (ERRRANK)
+		 * 26:24 reserved
+		 * 23:16 Error Syndrome (ERRSYND)
+		 * 15: 2 reserved
+		 *    1  Multiple Bit Error Status (MERRSTS)
+		 *    0  Correctable Error Status (CERRSTS)
+		 */
+#define I3200_C1ECCERRLOG		0x680	/* Chan 1 ECC Error Log (64b) */
+#define I3200_ECCERRLOG_CE		0x1
+#define I3200_ECCERRLOG_UE		0x2
+#define I3200_ECCERRLOG_RANK_BITS	0x18000000
+#define I3200_ECCERRLOG_RANK_SHIFT	27
+#define I3200_ECCERRLOG_SYNDROME_BITS	0xff0000
+#define I3200_ECCERRLOG_SYNDROME_SHIFT	16
+#define I3200_CAPID0			0xe0	/* P.95 of spec for details */
+
+struct i3200_priv {
+	void __iomem *window;
+};
+
+static int nr_channels;
+
+static int how_many_channels(struct pci_dev *pdev)
+{
+	unsigned char capid0_8b; /* 8th byte of CAPID0 */
+
+	pci_read_config_byte(pdev, I3200_CAPID0 + 8, &capid0_8b);
+	if (capid0_8b & 0x20) { /* check DCD: Dual Channel Disable */
+		debugf0("In single channel mode.\n");
+		return 1;
+	} else {
+		debugf0("In dual channel mode.\n");
+		return 2;
+	}
+}
+
+static unsigned long eccerrlog_syndrome(u64 log)
+{
+	return (log & I3200_ECCERRLOG_SYNDROME_BITS) >>
+		I3200_ECCERRLOG_SYNDROME_SHIFT;
+}
+
+static int eccerrlog_row(int channel, u64 log)
+{
+	u64 rank = ((log & I3200_ECCERRLOG_RANK_BITS) >>
+		I3200_ECCERRLOG_RANK_SHIFT);
+	return rank | (channel * I3200_RANKS_PER_CHANNEL);
+}
+
+enum i3200_chips {
+	I3200 = 0,
+};
+
+struct i3200_dev_info {
+	const char *ctl_name;
+};
+
+struct i3200_error_info {
+	u16 errsts;
+	u16 errsts2;
+	u64 eccerrlog[I3200_CHANNELS];
+};
+
+static const struct i3200_dev_info i3200_devs[] = {
+	[I3200] = {
+		.ctl_name = "i3200"
+	},
+};
+
+static struct pci_dev *mci_pdev;
+static int i3200_registered = 1;
+
+
+static void i3200_clear_error_info(struct mem_ctl_info *mci)
+{
+	struct pci_dev *pdev;
+
+	pdev = to_pci_dev(mci->dev);
+
+	/*
+	 * Clear any error bits.
+	 * (Yes, we really clear bits by writing 1 to them.)
+	 */
+	pci_write_bits16(pdev, I3200_ERRSTS, I3200_ERRSTS_BITS,
+		I3200_ERRSTS_BITS);
+}
+
+static void i3200_get_and_clear_error_info(struct mem_ctl_info *mci,
+		struct i3200_error_info *info)
+{
+	struct pci_dev *pdev;
+	struct i3200_priv *priv = mci->pvt_info;
+	void __iomem *window = priv->window;
+
+	pdev = to_pci_dev(mci->dev);
+
+	/*
+	 * This is a mess because there is no atomic way to read all the
+	 * registers at once and the registers can transition from CE being
+	 * overwritten by UE.
+	 */
+	pci_read_config_word(pdev, I3200_ERRSTS, &info->errsts);
+	if (!(info->errsts & I3200_ERRSTS_BITS))
+		return;
+
+	info->eccerrlog[0] = readq(window + I3200_C0ECCERRLOG);
+	if (nr_channels == 2)
+		info->eccerrlog[1] = readq(window + I3200_C1ECCERRLOG);
+
+	pci_read_config_word(pdev, I3200_ERRSTS, &info->errsts2);
+
+	/*
+	 * If the error is the same for both reads then the first set
+	 * of reads is valid.  If there is a change then there is a CE
+	 * with no info and the second set of reads is valid and
+	 * should be UE info.
+	 */
+	if ((info->errsts ^ info->errsts2) & I3200_ERRSTS_BITS) {
+		info->eccerrlog[0] = readq(window + I3200_C0ECCERRLOG);
+		if (nr_channels == 2)
+			info->eccerrlog[1] = readq(window + I3200_C1ECCERRLOG);
+	}
+
+	i3200_clear_error_info(mci);
+}
+
+static void i3200_process_error_info(struct mem_ctl_info *mci,
+		struct i3200_error_info *info)
+{
+	int channel;
+	u64 log;
+
+	if (!(info->errsts & I3200_ERRSTS_BITS))
+		return;
+
+	if ((info->errsts ^ info->errsts2) & I3200_ERRSTS_BITS) {
+		edac_mc_handle_ce_no_info(mci, "UE overwrote CE");
+		info->errsts = info->errsts2;
+	}
+
+	for (channel = 0; channel < nr_channels; channel++) {
+		log = info->eccerrlog[channel];
+		if (log & I3200_ECCERRLOG_UE) {
+			edac_mc_handle_ue(mci, 0, 0,
+				eccerrlog_row(channel, log),
+				"i3200 UE");
+		} else if (log & I3200_ECCERRLOG_CE) {
+			edac_mc_handle_ce(mci, 0, 0,
+				eccerrlog_syndrome(log),
+				eccerrlog_row(channel, log), 0,
+				"i3200 CE");
+		}
+	}
+}
+
+static void i3200_check(struct mem_ctl_info *mci)
+{
+	struct i3200_error_info info;
+
+	debugf1("MC%d: %s()\n", mci->mc_idx, __func__);
+	i3200_get_and_clear_error_info(mci, &info);
+	i3200_process_error_info(mci, &info);
+}
+
+
+void __iomem *i3200_map_mchbar(struct pci_dev *pdev)
+{
+	union {
+		u64 mchbar;
+		struct {
+			u32 mchbar_low;
+			u32 mchbar_high;
+		};
+	} u;
+	void __iomem *window;
+
+	pci_read_config_dword(pdev, I3200_MCHBAR_LOW, &u.mchbar_low);
+	pci_read_config_dword(pdev, I3200_MCHBAR_HIGH, &u.mchbar_high);
+	u.mchbar &= I3200_MCHBAR_MASK;
+
+	if (u.mchbar != (resource_size_t)u.mchbar) {
+		printk(KERN_ERR
+			"i3200: mmio space beyond accessible range (0x%llx)\n",
+			(unsigned long long)u.mchbar);
+		return NULL;
+	}
+
+	window = ioremap_nocache(u.mchbar, I3200_MMR_WINDOW_SIZE);
+	if (!window)
+		printk(KERN_ERR "i3200: cannot map mmio space at 0x%llx\n",
+			(unsigned long long)u.mchbar);
+
+	return window;
+}
+
+
+static void i3200_get_drbs(void __iomem *window,
+	u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL])
+{
+	int i;
+
+	for (i = 0; i < I3200_RANKS_PER_CHANNEL; i++) {
+		drbs[0][i] = readw(window + I3200_C0DRB + 2*i) & I3200_DRB_MASK;
+		drbs[1][i] = readw(window + I3200_C1DRB + 2*i) & I3200_DRB_MASK;
+	}
+}
+
+static bool i3200_is_stacked(struct pci_dev *pdev,
+	u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL])
+{
+	u16 tom;
+
+	pci_read_config_word(pdev, I3200_TOM, &tom);
+	tom &= I3200_TOM_MASK;
+
+	return drbs[I3200_CHANNELS - 1][I3200_RANKS_PER_CHANNEL - 1] == tom;
+}
+
+static unsigned long drb_to_nr_pages(
+	u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL], bool stacked,
+	int channel, int rank)
+{
+	int n;
+
+	n = drbs[channel][rank];
+	if (rank > 0)
+		n -= drbs[channel][rank - 1];
+	if (stacked && (channel == 1) &&
+	drbs[channel][rank] == drbs[channel][I3200_RANKS_PER_CHANNEL - 1])
+		n -= drbs[0][I3200_RANKS_PER_CHANNEL - 1];
+
+	n <<= (I3200_DRB_SHIFT - PAGE_SHIFT);
+	return n;
+}
+
+static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
+{
+	int rc;
+	int i;
+	struct mem_ctl_info *mci = NULL;
+	unsigned long last_page;
+	u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL];
+	bool stacked;
+	void __iomem *window;
+	struct i3200_priv *priv;
+
+	debugf0("MC: %s()\n", __func__);
+
+	window = i3200_map_mchbar(pdev);
+	if (!window)
+		return -ENODEV;
+
+	i3200_get_drbs(window, drbs);
+	nr_channels = how_many_channels(pdev);
+
+	mci = edac_mc_alloc(sizeof(struct i3200_priv), I3200_RANKS,
+		nr_channels, 0);
+	if (!mci)
+		return -ENOMEM;
+
+	debugf3("MC: %s(): init mci\n", __func__);
+
+	mci->dev = &pdev->dev;
+	mci->mtype_cap = MEM_FLAG_DDR2;
+
+	mci->edac_ctl_cap = EDAC_FLAG_SECDED;
+	mci->edac_cap = EDAC_FLAG_SECDED;
+
+	mci->mod_name = EDAC_MOD_STR;
+	mci->mod_ver = I3200_REVISION;
+	mci->ctl_name = i3200_devs[dev_idx].ctl_name;
+	mci->dev_name = pci_name(pdev);
+	mci->edac_check = i3200_check;
+	mci->ctl_page_to_phys = NULL;
+	priv = mci->pvt_info;
+	priv->window = window;
+
+	stacked = i3200_is_stacked(pdev, drbs);
+
+	/*
+	 * The dram rank boundary (DRB) reg values are boundary addresses
+	 * for each DRAM rank with a granularity of 64MB.  DRB regs are
+	 * cumulative; the last one will contain the total memory
+	 * contained in all ranks.
+	 */
+	last_page = -1UL;
+	for (i = 0; i < mci->nr_csrows; i++) {
+		unsigned long nr_pages;
+		struct csrow_info *csrow = &mci->csrows[i];
+
+		nr_pages = drb_to_nr_pages(drbs, stacked,
+			i / I3200_RANKS_PER_CHANNEL,
+			i % I3200_RANKS_PER_CHANNEL);
+
+		if (nr_pages == 0) {
+			csrow->mtype = MEM_EMPTY;
+			continue;
+		}
+
+		csrow->first_page = last_page + 1;
+		last_page += nr_pages;
+		csrow->last_page = last_page;
+		csrow->nr_pages = nr_pages;
+
+		csrow->grain = nr_pages << PAGE_SHIFT;
+		csrow->mtype = MEM_DDR2;
+		csrow->dtype = DEV_UNKNOWN;
+		csrow->edac_mode = EDAC_UNKNOWN;
+	}
+
+	i3200_clear_error_info(mci);
+
+	rc = -ENODEV;
+	if (edac_mc_add_mc(mci)) {
+		debugf3("MC: %s(): failed edac_mc_add_mc()\n", __func__);
+		goto fail;
+	}
+
+	/* get this far and it's successful */
+	debugf3("MC: %s(): success\n", __func__);
+	return 0;
+
+fail:
+	iounmap(window);
+	if (mci)
+		edac_mc_free(mci);
+
+	return rc;
+}
+
+static int __devinit i3200_init_one(struct pci_dev *pdev,
+		const struct pci_device_id *ent)
+{
+	int rc;
+
+	debugf0("MC: %s()\n", __func__);
+
+	if (pci_enable_device(pdev) < 0)
+		return -EIO;
+
+	rc = i3200_probe1(pdev, ent->driver_data);
+	if (!mci_pdev)
+		mci_pdev = pci_dev_get(pdev);
+
+	return rc;
+}
+
+static void __devexit i3200_remove_one(struct pci_dev *pdev)
+{
+	struct mem_ctl_info *mci;
+	struct i3200_priv *priv;
+
+	debugf0("%s()\n", __func__);
+
+	mci = edac_mc_del_mc(&pdev->dev);
+	if (!mci)
+		return;
+
+	priv = mci->pvt_info;
+	iounmap(priv->window);
+
+	edac_mc_free(mci);
+}
+
+static const struct pci_device_id i3200_pci_tbl[] __devinitdata = {
+	{
+		PCI_VEND_DEV(INTEL, 3200_HB), PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		I3200},
+	{
+		0,
+	}            /* 0 terminated list. */
+};
+
+MODULE_DEVICE_TABLE(pci, i3200_pci_tbl);
+
+static struct pci_driver i3200_driver = {
+	.name = EDAC_MOD_STR,
+	.probe = i3200_init_one,
+	.remove = __devexit_p(i3200_remove_one),
+	.id_table = i3200_pci_tbl,
+};
+
+static int __init i3200_init(void)
+{
+	int pci_rc;
+
+	debugf3("MC: %s()\n", __func__);
+
+	/* Ensure that the OPSTATE is set correctly for POLL or NMI */
+	opstate_init();
+
+	pci_rc = pci_register_driver(&i3200_driver);
+	if (pci_rc < 0)
+		goto fail0;
+
+	if (!mci_pdev) {
+		i3200_registered = 0;
+		mci_pdev = pci_get_device(PCI_VENDOR_ID_INTEL,
+				PCI_DEVICE_ID_INTEL_3200_HB, NULL);
+		if (!mci_pdev) {
+			debugf0("i3200 pci_get_device fail\n");
+			pci_rc = -ENODEV;
+			goto fail1;
+		}
+
+		pci_rc = i3200_init_one(mci_pdev, i3200_pci_tbl);
+		if (pci_rc < 0) {
+			debugf0("i3200 init fail\n");
+			pci_rc = -ENODEV;
+			goto fail1;
+		}
+	}
+
+	return 0;
+
+fail1:
+	pci_unregister_driver(&i3200_driver);
+
+fail0:
+	if (mci_pdev)
+		pci_dev_put(mci_pdev);
+
+	return pci_rc;
+}
+
+static void __exit i3200_exit(void)
+{
+	debugf3("MC: %s()\n", __func__);
+
+	pci_unregister_driver(&i3200_driver);
+	if (!i3200_registered) {
+		i3200_remove_one(mci_pdev);
+		pci_dev_put(mci_pdev);
+	}
+}
+
+module_init(i3200_init);
+module_exit(i3200_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Akamai Technologies, Inc.");
+MODULE_DESCRIPTION("MC support for Intel 3200 memory hub controllers");
+
+module_param(edac_op_state, int, 0444);
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c
index 3f2ccfc6407..157f6504f25 100644
--- a/drivers/edac/mpc85xx_edac.c
+++ b/drivers/edac/mpc85xx_edac.c
@@ -41,7 +41,9 @@ static u32 orig_pci_err_en;
 #endif
 
 static u32 orig_l2_err_disable;
+#ifdef CONFIG_MPC85xx
 static u32 orig_hid1[2];
+#endif
 
 /************************ MC SYSFS parts ***********************************/
 
@@ -646,6 +648,7 @@ static struct of_device_id mpc85xx_l2_err_of_match[] = {
 	{ .compatible = "fsl,mpc8560-l2-cache-controller", },
 	{ .compatible = "fsl,mpc8568-l2-cache-controller", },
 	{ .compatible = "fsl,mpc8572-l2-cache-controller", },
+	{ .compatible = "fsl,p2020-l2-cache-controller", },
 	{},
 };
 
@@ -788,19 +791,20 @@ static void __devinit mpc85xx_init_csrows(struct mem_ctl_info *mci)
 		csrow = &mci->csrows[index];
 		cs_bnds = in_be32(pdata->mc_vbase + MPC85XX_MC_CS_BNDS_0 +
 				  (index * MPC85XX_MC_CS_BNDS_OFS));
-		start = (cs_bnds & 0xfff0000) << 4;
-		end = ((cs_bnds & 0xfff) << 20);
-		if (start)
-			start |= 0xfffff;
-		if (end)
-			end |= 0xfffff;
+
+		start = (cs_bnds & 0xffff0000) >> 16;
+		end   = (cs_bnds & 0x0000ffff);
 
 		if (start == end)
 			continue;	/* not populated */
 
+		start <<= (24 - PAGE_SHIFT);
+		end   <<= (24 - PAGE_SHIFT);
+		end    |= (1 << (24 - PAGE_SHIFT)) - 1;
+
 		csrow->first_page = start >> PAGE_SHIFT;
 		csrow->last_page = end >> PAGE_SHIFT;
-		csrow->nr_pages = csrow->last_page + 1 - csrow->first_page;
+		csrow->nr_pages = end + 1 - start;
 		csrow->grain = 8;
 		csrow->mtype = mtype;
 		csrow->dtype = DEV_UNKNOWN;
@@ -984,6 +988,8 @@ static struct of_device_id mpc85xx_mc_err_of_match[] = {
 	{ .compatible = "fsl,mpc8560-memory-controller", },
 	{ .compatible = "fsl,mpc8568-memory-controller", },
 	{ .compatible = "fsl,mpc8572-memory-controller", },
+	{ .compatible = "fsl,mpc8349-memory-controller", },
+	{ .compatible = "fsl,p2020-memory-controller", },
 	{},
 };
 
@@ -999,13 +1005,13 @@ static struct of_platform_driver mpc85xx_mc_err_driver = {
 		   },
 };
 
-
+#ifdef CONFIG_MPC85xx
 static void __init mpc85xx_mc_clear_rfxe(void *data)
 {
 	orig_hid1[smp_processor_id()] = mfspr(SPRN_HID1);
 	mtspr(SPRN_HID1, (orig_hid1[smp_processor_id()] & ~0x20000));
 }
-
+#endif
 
 static int __init mpc85xx_mc_init(void)
 {
@@ -1038,26 +1044,32 @@ static int __init mpc85xx_mc_init(void)
 		printk(KERN_WARNING EDAC_MOD_STR "PCI fails to register\n");
 #endif
 
+#ifdef CONFIG_MPC85xx
 	/*
 	 * need to clear HID1[RFXE] to disable machine check int
 	 * so we can catch it
 	 */
 	if (edac_op_state == EDAC_OPSTATE_INT)
 		on_each_cpu(mpc85xx_mc_clear_rfxe, NULL, 0);
+#endif
 
 	return 0;
 }
 
 module_init(mpc85xx_mc_init);
 
+#ifdef CONFIG_MPC85xx
 static void __exit mpc85xx_mc_restore_hid1(void *data)
 {
 	mtspr(SPRN_HID1, orig_hid1[smp_processor_id()]);
 }
+#endif
 
 static void __exit mpc85xx_mc_exit(void)
 {
+#ifdef CONFIG_MPC85xx
 	on_each_cpu(mpc85xx_mc_restore_hid1, NULL, 0);
+#endif
 #ifdef CONFIG_PCI
 	of_unregister_platform_driver(&mpc85xx_pci_err_driver);
 #endif
diff --git a/drivers/edac/mv64x60_edac.c b/drivers/edac/mv64x60_edac.c
index 5131aaae8e0..a6b9fec13a7 100644
--- a/drivers/edac/mv64x60_edac.c
+++ b/drivers/edac/mv64x60_edac.c
@@ -90,7 +90,7 @@ static int __init mv64x60_pci_fixup(struct platform_device *pdev)
 		return -ENOENT;
 	}
 
-	pci_serr = ioremap(r->start, r->end - r->start + 1);
+	pci_serr = ioremap(r->start, resource_size(r));
 	if (!pci_serr)
 		return -ENOMEM;
 
@@ -140,7 +140,7 @@ static int __devinit mv64x60_pci_err_probe(struct platform_device *pdev)
 
 	if (!devm_request_mem_region(&pdev->dev,
 				     r->start,
-				     r->end - r->start + 1,
+				     resource_size(r),
 				     pdata->name)) {
 		printk(KERN_ERR "%s: Error while requesting mem region\n",
 		       __func__);
@@ -150,7 +150,7 @@ static int __devinit mv64x60_pci_err_probe(struct platform_device *pdev)
 
 	pdata->pci_vbase = devm_ioremap(&pdev->dev,
 					r->start,
-					r->end - r->start + 1);
+					resource_size(r));
 	if (!pdata->pci_vbase) {
 		printk(KERN_ERR "%s: Unable to setup PCI err regs\n", __func__);
 		res = -ENOMEM;
@@ -306,7 +306,7 @@ static int __devinit mv64x60_sram_err_probe(struct platform_device *pdev)
 
 	if (!devm_request_mem_region(&pdev->dev,
 				     r->start,
-				     r->end - r->start + 1,
+				     resource_size(r),
 				     pdata->name)) {
 		printk(KERN_ERR "%s: Error while request mem region\n",
 		       __func__);
@@ -316,7 +316,7 @@ static int __devinit mv64x60_sram_err_probe(struct platform_device *pdev)
 
 	pdata->sram_vbase = devm_ioremap(&pdev->dev,
 					 r->start,
-					 r->end - r->start + 1);
+					 resource_size(r));
 	if (!pdata->sram_vbase) {
 		printk(KERN_ERR "%s: Unable to setup SRAM err regs\n",
 		       __func__);
@@ -474,7 +474,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev)
 
 	if (!devm_request_mem_region(&pdev->dev,
 				     r->start,
-				     r->end - r->start + 1,
+				     resource_size(r),
 				     pdata->name)) {
 		printk(KERN_ERR "%s: Error while requesting mem region\n",
 		       __func__);
@@ -484,7 +484,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev)
 
 	pdata->cpu_vbase[0] = devm_ioremap(&pdev->dev,
 					   r->start,
-					   r->end - r->start + 1);
+					   resource_size(r));
 	if (!pdata->cpu_vbase[0]) {
 		printk(KERN_ERR "%s: Unable to setup CPU err regs\n", __func__);
 		res = -ENOMEM;
@@ -501,7 +501,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev)
 
 	if (!devm_request_mem_region(&pdev->dev,
 				     r->start,
-				     r->end - r->start + 1,
+				     resource_size(r),
 				     pdata->name)) {
 		printk(KERN_ERR "%s: Error while requesting mem region\n",
 		       __func__);
@@ -511,7 +511,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev)
 
 	pdata->cpu_vbase[1] = devm_ioremap(&pdev->dev,
 					   r->start,
-					   r->end - r->start + 1);
+					   resource_size(r));
 	if (!pdata->cpu_vbase[1]) {
 		printk(KERN_ERR "%s: Unable to setup CPU err regs\n", __func__);
 		res = -ENOMEM;
@@ -726,7 +726,7 @@ static int __devinit mv64x60_mc_err_probe(struct platform_device *pdev)
 
 	if (!devm_request_mem_region(&pdev->dev,
 				     r->start,
-				     r->end - r->start + 1,
+				     resource_size(r),
 				     pdata->name)) {
 		printk(KERN_ERR "%s: Error while requesting mem region\n",
 		       __func__);
@@ -736,7 +736,7 @@ static int __devinit mv64x60_mc_err_probe(struct platform_device *pdev)
 
 	pdata->mc_vbase = devm_ioremap(&pdev->dev,
 				       r->start,
-				       r->end - r->start + 1);
+				       resource_size(r));
 	if (!pdata->mc_vbase) {
 		printk(KERN_ERR "%s: Unable to setup MC err regs\n", __func__);
 		res = -ENOMEM;
diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index e4d971c8b9d..f831ea15929 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -102,6 +102,7 @@ config DRM_I915
 	select BACKLIGHT_CLASS_DEVICE if ACPI
 	select INPUT if ACPI
 	select ACPI_VIDEO if ACPI
+	select ACPI_BUTTON if ACPI
 	help
 	  Choose this option if you have a system that has Intel 830M, 845G,
 	  852GM, 855GM 865G or 915G integrated graphics.  If M is selected, the
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 230c9ffdd5e..80391995bde 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -142,6 +142,19 @@ drm_gem_object_alloc(struct drm_device *dev, size_t size)
 	if (IS_ERR(obj->filp))
 		goto free;
 
+	/* Basically we want to disable the OOM killer and handle ENOMEM
+	 * ourselves by sacrificing pages from cached buffers.
+	 * XXX shmem_file_[gs]et_gfp_mask()
+	 */
+	mapping_set_gfp_mask(obj->filp->f_path.dentry->d_inode->i_mapping,
+			     GFP_HIGHUSER |
+			     __GFP_COLD |
+			     __GFP_FS |
+			     __GFP_RECLAIMABLE |
+			     __GFP_NORETRY |
+			     __GFP_NOWARN |
+			     __GFP_NOMEMALLOC);
+
 	kref_init(&obj->refcount);
 	kref_init(&obj->handlecount);
 	obj->size = size;
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 5269dfa5f62..fa7b9be096b 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -9,6 +9,7 @@ i915-y := i915_drv.o i915_dma.o i915_irq.o i915_mem.o \
 	  i915_gem.o \
 	  i915_gem_debug.o \
 	  i915_gem_tiling.o \
+	  i915_trace_points.o \
 	  intel_display.o \
 	  intel_crt.o \
 	  intel_lvds.o \
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 1e3bdcee863..f8ce9a3a420 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -96,11 +96,13 @@ static int i915_gem_object_list_info(struct seq_file *m, void *data)
 	{
 		struct drm_gem_object *obj = obj_priv->obj;
 
-		seq_printf(m, "    %p: %s %08x %08x %d",
+		seq_printf(m, "    %p: %s %8zd %08x %08x %d %s",
 			   obj,
 			   get_pin_flag(obj_priv),
+			   obj->size,
 			   obj->read_domains, obj->write_domain,
-			   obj_priv->last_rendering_seqno);
+			   obj_priv->last_rendering_seqno,
+			   obj_priv->dirty ? "dirty" : "");
 
 		if (obj->name)
 			seq_printf(m, " (name: %d)", obj->name);
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 5a49a1867b3..45d507ebd3f 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -33,6 +33,7 @@
 #include "intel_drv.h"
 #include "i915_drm.h"
 #include "i915_drv.h"
+#include "i915_trace.h"
 #include <linux/vgaarb.h>
 
 /* Really want an OS-independent resettable timer.  Would like to have
@@ -50,14 +51,18 @@ int i915_wait_ring(struct drm_device * dev, int n, const char *caller)
 	u32 last_head = I915_READ(PRB0_HEAD) & HEAD_ADDR;
 	int i;
 
+	trace_i915_ring_wait_begin (dev);
+
 	for (i = 0; i < 100000; i++) {
 		ring->head = I915_READ(PRB0_HEAD) & HEAD_ADDR;
 		acthd = I915_READ(acthd_reg);
 		ring->space = ring->head - (ring->tail + 8);
 		if (ring->space < 0)
 			ring->space += ring->Size;
-		if (ring->space >= n)
+		if (ring->space >= n) {
+			trace_i915_ring_wait_end (dev);
 			return 0;
+		}
 
 		if (dev->primary->master) {
 			struct drm_i915_master_private *master_priv = dev->primary->master->driver_priv;
@@ -77,6 +82,7 @@ int i915_wait_ring(struct drm_device * dev, int n, const char *caller)
 
 	}
 
+	trace_i915_ring_wait_end (dev);
 	return -EBUSY;
 }
 
@@ -922,7 +928,8 @@ static int i915_get_bridge_dev(struct drm_device *dev)
  * how much was set aside so we can use it for our own purposes.
  */
 static int i915_probe_agp(struct drm_device *dev, uint32_t *aperture_size,
-			  uint32_t *preallocated_size)
+			  uint32_t *preallocated_size,
+			  uint32_t *start)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u16 tmp = 0;
@@ -1009,10 +1016,159 @@ static int i915_probe_agp(struct drm_device *dev, uint32_t *aperture_size,
 		return -1;
 	}
 	*preallocated_size = stolen - overhead;
+	*start = overhead;
 
 	return 0;
 }
 
+#define PTE_ADDRESS_MASK		0xfffff000
+#define PTE_ADDRESS_MASK_HIGH		0x000000f0 /* i915+ */
+#define PTE_MAPPING_TYPE_UNCACHED	(0 << 1)
+#define PTE_MAPPING_TYPE_DCACHE		(1 << 1) /* i830 only */
+#define PTE_MAPPING_TYPE_CACHED		(3 << 1)
+#define PTE_MAPPING_TYPE_MASK		(3 << 1)
+#define PTE_VALID			(1 << 0)
+
+/**
+ * i915_gtt_to_phys - take a GTT address and turn it into a physical one
+ * @dev: drm device
+ * @gtt_addr: address to translate
+ *
+ * Some chip functions require allocations from stolen space but need the
+ * physical address of the memory in question.  We use this routine
+ * to get a physical address suitable for register programming from a given
+ * GTT address.
+ */
+static unsigned long i915_gtt_to_phys(struct drm_device *dev,
+				      unsigned long gtt_addr)
+{
+	unsigned long *gtt;
+	unsigned long entry, phys;
+	int gtt_bar = IS_I9XX(dev) ? 0 : 1;
+	int gtt_offset, gtt_size;
+
+	if (IS_I965G(dev)) {
+		if (IS_G4X(dev) || IS_IGDNG(dev)) {
+			gtt_offset = 2*1024*1024;
+			gtt_size = 2*1024*1024;
+		} else {
+			gtt_offset = 512*1024;
+			gtt_size = 512*1024;
+		}
+	} else {
+		gtt_bar = 3;
+		gtt_offset = 0;
+		gtt_size = pci_resource_len(dev->pdev, gtt_bar);
+	}
+
+	gtt = ioremap_wc(pci_resource_start(dev->pdev, gtt_bar) + gtt_offset,
+			 gtt_size);
+	if (!gtt) {
+		DRM_ERROR("ioremap of GTT failed\n");
+		return 0;
+	}
+
+	entry = *(volatile u32 *)(gtt + (gtt_addr / 1024));
+
+	DRM_DEBUG("GTT addr: 0x%08lx, PTE: 0x%08lx\n", gtt_addr, entry);
+
+	/* Mask out these reserved bits on this hardware. */
+	if (!IS_I9XX(dev) || IS_I915G(dev) || IS_I915GM(dev) ||
+	    IS_I945G(dev) || IS_I945GM(dev)) {
+		entry &= ~PTE_ADDRESS_MASK_HIGH;
+	}
+
+	/* If it's not a mapping type we know, then bail. */
+	if ((entry & PTE_MAPPING_TYPE_MASK) != PTE_MAPPING_TYPE_UNCACHED &&
+	    (entry & PTE_MAPPING_TYPE_MASK) != PTE_MAPPING_TYPE_CACHED)	{
+		iounmap(gtt);
+		return 0;
+	}
+
+	if (!(entry & PTE_VALID)) {
+		DRM_ERROR("bad GTT entry in stolen space\n");
+		iounmap(gtt);
+		return 0;
+	}
+
+	iounmap(gtt);
+
+	phys =(entry & PTE_ADDRESS_MASK) |
+		((uint64_t)(entry & PTE_ADDRESS_MASK_HIGH) << (32 - 4));
+
+	DRM_DEBUG("GTT addr: 0x%08lx, phys addr: 0x%08lx\n", gtt_addr, phys);
+
+	return phys;
+}
+
+static void i915_warn_stolen(struct drm_device *dev)
+{
+	DRM_ERROR("not enough stolen space for compressed buffer, disabling\n");
+	DRM_ERROR("hint: you may be able to increase stolen memory size in the BIOS to avoid this\n");
+}
+
+static void i915_setup_compression(struct drm_device *dev, int size)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_mm_node *compressed_fb, *compressed_llb;
+	unsigned long cfb_base, ll_base;
+
+	/* Leave 1M for line length buffer & misc. */
+	compressed_fb = drm_mm_search_free(&dev_priv->vram, size, 4096, 0);
+	if (!compressed_fb) {
+		i915_warn_stolen(dev);
+		return;
+	}
+
+	compressed_fb = drm_mm_get_block(compressed_fb, size, 4096);
+	if (!compressed_fb) {
+		i915_warn_stolen(dev);
+		return;
+	}
+
+	cfb_base = i915_gtt_to_phys(dev, compressed_fb->start);
+	if (!cfb_base) {
+		DRM_ERROR("failed to get stolen phys addr, disabling FBC\n");
+		drm_mm_put_block(compressed_fb);
+	}
+
+	if (!IS_GM45(dev)) {
+		compressed_llb = drm_mm_search_free(&dev_priv->vram, 4096,
+						    4096, 0);
+		if (!compressed_llb) {
+			i915_warn_stolen(dev);
+			return;
+		}
+
+		compressed_llb = drm_mm_get_block(compressed_llb, 4096, 4096);
+		if (!compressed_llb) {
+			i915_warn_stolen(dev);
+			return;
+		}
+
+		ll_base = i915_gtt_to_phys(dev, compressed_llb->start);
+		if (!ll_base) {
+			DRM_ERROR("failed to get stolen phys addr, disabling FBC\n");
+			drm_mm_put_block(compressed_fb);
+			drm_mm_put_block(compressed_llb);
+		}
+	}
+
+	dev_priv->cfb_size = size;
+
+	if (IS_GM45(dev)) {
+		g4x_disable_fbc(dev);
+		I915_WRITE(DPFC_CB_BASE, compressed_fb->start);
+	} else {
+		i8xx_disable_fbc(dev);
+		I915_WRITE(FBC_CFB_BASE, cfb_base);
+		I915_WRITE(FBC_LL_BASE, ll_base);
+	}
+
+	DRM_DEBUG("FBC base 0x%08lx, ll base 0x%08lx, size %dM\n", cfb_base,
+		  ll_base, size >> 20);
+}
+
 /* true = enable decode, false = disable decoder */
 static unsigned int i915_vga_set_decode(void *cookie, bool state)
 {
@@ -1027,6 +1183,7 @@ static unsigned int i915_vga_set_decode(void *cookie, bool state)
 }
 
 static int i915_load_modeset_init(struct drm_device *dev,
+				  unsigned long prealloc_start,
 				  unsigned long prealloc_size,
 				  unsigned long agp_size)
 {
@@ -1047,6 +1204,10 @@ static int i915_load_modeset_init(struct drm_device *dev,
 
 	/* Basic memrange allocator for stolen space (aka vram) */
 	drm_mm_init(&dev_priv->vram, 0, prealloc_size);
+	DRM_INFO("set up %ldM of stolen space\n", prealloc_size / (1024*1024));
+
+	/* We're off and running w/KMS */
+	dev_priv->mm.suspended = 0;
 
 	/* Let GEM Manage from end of prealloc space to end of aperture.
 	 *
@@ -1059,10 +1220,25 @@ static int i915_load_modeset_init(struct drm_device *dev,
 	 */
 	i915_gem_do_init(dev, prealloc_size, agp_size - 4096);
 
+	mutex_lock(&dev->struct_mutex);
 	ret = i915_gem_init_ringbuffer(dev);
+	mutex_unlock(&dev->struct_mutex);
 	if (ret)
 		goto out;
 
+	/* Try to set up FBC with a reasonable compressed buffer size */
+	if (IS_MOBILE(dev) && (IS_I9XX(dev) || IS_I965G(dev) || IS_GM45(dev)) &&
+	    i915_powersave) {
+		int cfb_size;
+
+		/* Try to get an 8M buffer... */
+		if (prealloc_size > (9*1024*1024))
+			cfb_size = 8*1024*1024;
+		else /* fall back to 7/8 of the stolen space */
+			cfb_size = prealloc_size * 7 / 8;
+		i915_setup_compression(dev, cfb_size);
+	}
+
 	/* Allow hardware batchbuffers unless told otherwise.
 	 */
 	dev_priv->allow_batchbuffer = 1;
@@ -1180,7 +1356,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	resource_size_t base, size;
 	int ret = 0, mmio_bar = IS_I9XX(dev) ? 0 : 1;
-	uint32_t agp_size, prealloc_size;
+	uint32_t agp_size, prealloc_size, prealloc_start;
 
 	/* i915 has 4 more counters */
 	dev->counters += 4;
@@ -1234,7 +1410,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 			 "performance may suffer.\n");
 	}
 
-	ret = i915_probe_agp(dev, &agp_size, &prealloc_size);
+	ret = i915_probe_agp(dev, &agp_size, &prealloc_size, &prealloc_start);
 	if (ret)
 		goto out_iomapfree;
 
@@ -1300,8 +1476,12 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 		return ret;
 	}
 
+	/* Start out suspended */
+	dev_priv->mm.suspended = 1;
+
 	if (drm_core_check_feature(dev, DRIVER_MODESET)) {
-		ret = i915_load_modeset_init(dev, prealloc_size, agp_size);
+		ret = i915_load_modeset_init(dev, prealloc_start,
+					     prealloc_size, agp_size);
 		if (ret < 0) {
 			DRM_ERROR("failed to init modeset\n");
 			goto out_workqueue_free;
@@ -1313,6 +1493,8 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 	if (!IS_IGDNG(dev))
 		intel_opregion_init(dev, 0);
 
+	setup_timer(&dev_priv->hangcheck_timer, i915_hangcheck_elapsed,
+		    (unsigned long) dev);
 	return 0;
 
 out_workqueue_free:
@@ -1333,6 +1515,7 @@ int i915_driver_unload(struct drm_device *dev)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	destroy_workqueue(dev_priv->wq);
+	del_timer_sync(&dev_priv->hangcheck_timer);
 
 	io_mapping_free(dev_priv->mm.gtt_mapping);
 	if (dev_priv->mm.gtt_mtrr >= 0) {
@@ -1472,6 +1655,7 @@ struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF(DRM_I915_GEM_GET_TILING, i915_gem_get_tiling, 0),
 	DRM_IOCTL_DEF(DRM_I915_GEM_GET_APERTURE, i915_gem_get_aperture_ioctl, 0),
 	DRM_IOCTL_DEF(DRM_I915_GET_PIPE_FROM_CRTC_ID, intel_get_pipe_from_crtc_id, 0),
+	DRM_IOCTL_DEF(DRM_I915_GEM_MADVISE, i915_gem_madvise_ioctl, 0),
 };
 
 int i915_max_ioctl = DRM_ARRAY_SIZE(i915_ioctls);
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index dbe568c9327..b93814c0d3e 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -89,6 +89,8 @@ static int i915_suspend(struct drm_device *dev, pm_message_t state)
 		pci_set_power_state(dev->pdev, PCI_D3hot);
 	}
 
+	dev_priv->suspended = 1;
+
 	return 0;
 }
 
@@ -97,8 +99,6 @@ static int i915_resume(struct drm_device *dev)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret = 0;
 
-	pci_set_power_state(dev->pdev, PCI_D0);
-	pci_restore_state(dev->pdev);
 	if (pci_enable_device(dev->pdev))
 		return -1;
 	pci_set_master(dev->pdev);
@@ -124,9 +124,135 @@ static int i915_resume(struct drm_device *dev)
 		drm_helper_resume_force_mode(dev);
 	}
 
+	dev_priv->suspended = 0;
+
 	return ret;
 }
 
+/**
+ * i965_reset - reset chip after a hang
+ * @dev: drm device to reset
+ * @flags: reset domains
+ *
+ * Reset the chip.  Useful if a hang is detected. Returns zero on successful
+ * reset or otherwise an error code.
+ *
+ * Procedure is fairly simple:
+ *   - reset the chip using the reset reg
+ *   - re-init context state
+ *   - re-init hardware status page
+ *   - re-init ring buffer
+ *   - re-init interrupt state
+ *   - re-init display
+ */
+int i965_reset(struct drm_device *dev, u8 flags)
+{
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	unsigned long timeout;
+	u8 gdrst;
+	/*
+	 * We really should only reset the display subsystem if we actually
+	 * need to
+	 */
+	bool need_display = true;
+
+	mutex_lock(&dev->struct_mutex);
+
+	/*
+	 * Clear request list
+	 */
+	i915_gem_retire_requests(dev);
+
+	if (need_display)
+		i915_save_display(dev);
+
+	if (IS_I965G(dev) || IS_G4X(dev)) {
+		/*
+		 * Set the domains we want to reset, then the reset bit (bit 0).
+		 * Clear the reset bit after a while and wait for hardware status
+		 * bit (bit 1) to be set
+		 */
+		pci_read_config_byte(dev->pdev, GDRST, &gdrst);
+		pci_write_config_byte(dev->pdev, GDRST, gdrst | flags | ((flags == GDRST_FULL) ? 0x1 : 0x0));
+		udelay(50);
+		pci_write_config_byte(dev->pdev, GDRST, gdrst & 0xfe);
+
+		/* ...we don't want to loop forever though, 500ms should be plenty */
+	       timeout = jiffies + msecs_to_jiffies(500);
+		do {
+			udelay(100);
+			pci_read_config_byte(dev->pdev, GDRST, &gdrst);
+		} while ((gdrst & 0x1) && time_after(timeout, jiffies));
+
+		if (gdrst & 0x1) {
+			WARN(true, "i915: Failed to reset chip\n");
+			mutex_unlock(&dev->struct_mutex);
+			return -EIO;
+		}
+	} else {
+		DRM_ERROR("Error occurred. Don't know how to reset this chip.\n");
+		return -ENODEV;
+	}
+
+	/* Ok, now get things going again... */
+
+	/*
+	 * Everything depends on having the GTT running, so we need to start
+	 * there.  Fortunately we don't need to do this unless we reset the
+	 * chip at a PCI level.
+	 *
+	 * Next we need to restore the context, but we don't use those
+	 * yet either...
+	 *
+	 * Ring buffer needs to be re-initialized in the KMS case, or if X
+	 * was running at the time of the reset (i.e. we weren't VT
+	 * switched away).
+	 */
+	if (drm_core_check_feature(dev, DRIVER_MODESET) ||
+	    !dev_priv->mm.suspended) {
+		drm_i915_ring_buffer_t *ring = &dev_priv->ring;
+		struct drm_gem_object *obj = ring->ring_obj;
+		struct drm_i915_gem_object *obj_priv = obj->driver_private;
+		dev_priv->mm.suspended = 0;
+
+		/* Stop the ring if it's running. */
+		I915_WRITE(PRB0_CTL, 0);
+		I915_WRITE(PRB0_TAIL, 0);
+		I915_WRITE(PRB0_HEAD, 0);
+
+		/* Initialize the ring. */
+		I915_WRITE(PRB0_START, obj_priv->gtt_offset);
+		I915_WRITE(PRB0_CTL,
+			   ((obj->size - 4096) & RING_NR_PAGES) |
+			   RING_NO_REPORT |
+			   RING_VALID);
+		if (!drm_core_check_feature(dev, DRIVER_MODESET))
+			i915_kernel_lost_context(dev);
+		else {
+			ring->head = I915_READ(PRB0_HEAD) & HEAD_ADDR;
+			ring->tail = I915_READ(PRB0_TAIL) & TAIL_ADDR;
+			ring->space = ring->head - (ring->tail + 8);
+			if (ring->space < 0)
+				ring->space += ring->Size;
+		}
+
+		mutex_unlock(&dev->struct_mutex);
+		drm_irq_uninstall(dev);
+		drm_irq_install(dev);
+		mutex_lock(&dev->struct_mutex);
+	}
+
+	/*
+	 * Display needs restore too...
+	 */
+	if (need_display)
+		i915_restore_display(dev);
+
+	mutex_unlock(&dev->struct_mutex);
+	return 0;
+}
+
+
 static int __devinit
 i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
@@ -234,6 +360,8 @@ static int __init i915_init(void)
 {
 	driver.num_ioctls = i915_max_ioctl;
 
+	i915_gem_shrinker_init();
+
 	/*
 	 * If CONFIG_DRM_I915_KMS is set, default to KMS unless
 	 * explicitly disabled with the module pararmeter.
@@ -260,6 +388,7 @@ static int __init i915_init(void)
 
 static void __exit i915_exit(void)
 {
+	i915_gem_shrinker_exit();
 	drm_exit(&driver);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a0632f8e76a..b24b2d145b7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -48,6 +48,11 @@ enum pipe {
 	PIPE_B,
 };
 
+enum plane {
+	PLANE_A = 0,
+	PLANE_B,
+};
+
 #define I915_NUM_PIPE	2
 
 /* Interface history:
@@ -148,6 +153,23 @@ struct drm_i915_error_state {
 	struct timeval time;
 };
 
+struct drm_i915_display_funcs {
+	void (*dpms)(struct drm_crtc *crtc, int mode);
+	bool (*fbc_enabled)(struct drm_crtc *crtc);
+	void (*enable_fbc)(struct drm_crtc *crtc, unsigned long interval);
+	void (*disable_fbc)(struct drm_device *dev);
+	int (*get_display_clock_speed)(struct drm_device *dev);
+	int (*get_fifo_size)(struct drm_device *dev, int plane);
+	void (*update_wm)(struct drm_device *dev, int planea_clock,
+			  int planeb_clock, int sr_hdisplay, int pixel_size);
+	/* clock updates for mode set */
+	/* cursor updates */
+	/* render clock increase/decrease */
+	/* display clock increase/decrease */
+	/* pll clock increase/decrease */
+	/* clock gating init */
+};
+
 typedef struct drm_i915_private {
 	struct drm_device *dev;
 
@@ -198,10 +220,21 @@ typedef struct drm_i915_private {
 	unsigned int sr01, adpa, ppcr, dvob, dvoc, lvds;
 	int vblank_pipe;
 
+	/* For hangcheck timer */
+#define DRM_I915_HANGCHECK_PERIOD 75 /* in jiffies */
+	struct timer_list hangcheck_timer;
+	int hangcheck_count;
+	uint32_t last_acthd;
+
 	bool cursor_needs_physical;
 
 	struct drm_mm vram;
 
+	unsigned long cfb_size;
+	unsigned long cfb_pitch;
+	int cfb_fence;
+	int cfb_plane;
+
 	int irq_enabled;
 
 	struct intel_opregion opregion;
@@ -222,6 +255,8 @@ typedef struct drm_i915_private {
 	unsigned int edp_support:1;
 	int lvds_ssc_freq;
 
+	struct notifier_block lid_notifier;
+
 	int crt_ddc_bus; /* -1 = unknown, else GPIO to use for CRT DDC */
 	struct drm_i915_fence_reg fence_regs[16]; /* assume 965 */
 	int fence_reg_start; /* 4 if userland hasn't ioctl'd us yet */
@@ -234,7 +269,11 @@ typedef struct drm_i915_private {
 	struct work_struct error_work;
 	struct workqueue_struct *wq;
 
+	/* Display functions */
+	struct drm_i915_display_funcs display;
+
 	/* Register state */
+	bool suspended;
 	u8 saveLBB;
 	u32 saveDSPACNTR;
 	u32 saveDSPBCNTR;
@@ -350,6 +389,15 @@ typedef struct drm_i915_private {
 		int gtt_mtrr;
 
 		/**
+		 * Membership on list of all loaded devices, used to evict
+		 * inactive buffers under memory pressure.
+		 *
+		 * Modifications should only be done whilst holding the
+		 * shrink_list_lock spinlock.
+		 */
+		struct list_head shrink_list;
+
+		/**
 		 * List of objects currently involved in rendering from the
 		 * ringbuffer.
 		 *
@@ -432,7 +480,7 @@ typedef struct drm_i915_private {
 		 * It prevents command submission from occuring and makes
 		 * every pending request fail
 		 */
-		int wedged;
+		atomic_t wedged;
 
 		/** Bit 6 swizzling required for X tiling */
 		uint32_t bit_6_swizzle_x;
@@ -491,10 +539,7 @@ struct drm_i915_gem_object {
 	 * This is the same as gtt_space->start
 	 */
 	uint32_t gtt_offset;
-	/**
-	 * Required alignment for the object
-	 */
-	uint32_t gtt_alignment;
+
 	/**
 	 * Fake offset for use by mmap(2)
 	 */
@@ -541,6 +586,11 @@ struct drm_i915_gem_object {
 	 * in an execbuffer object list.
 	 */
 	int in_execbuffer;
+
+	/**
+	 * Advice: are the backing pages purgeable?
+	 */
+	int madv;
 };
 
 /**
@@ -585,6 +635,8 @@ extern int i915_max_ioctl;
 extern unsigned int i915_fbpercrtc;
 extern unsigned int i915_powersave;
 
+extern void i915_save_display(struct drm_device *dev);
+extern void i915_restore_display(struct drm_device *dev);
 extern int i915_master_create(struct drm_device *dev, struct drm_master *master);
 extern void i915_master_destroy(struct drm_device *dev, struct drm_master *master);
 
@@ -604,8 +656,10 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
 extern int i915_emit_box(struct drm_device *dev,
 			 struct drm_clip_rect *boxes,
 			 int i, int DR1, int DR4);
+extern int i965_reset(struct drm_device *dev, u8 flags);
 
 /* i915_irq.c */
+void i915_hangcheck_elapsed(unsigned long data);
 extern int i915_irq_emit(struct drm_device *dev, void *data,
 			 struct drm_file *file_priv);
 extern int i915_irq_wait(struct drm_device *dev, void *data,
@@ -676,6 +730,8 @@ int i915_gem_busy_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 int i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
 			    struct drm_file *file_priv);
+int i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
+			   struct drm_file *file_priv);
 int i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
 			   struct drm_file *file_priv);
 int i915_gem_leavevt_ioctl(struct drm_device *dev, void *data,
@@ -695,6 +751,7 @@ int i915_gem_object_unbind(struct drm_gem_object *obj);
 void i915_gem_release_mmap(struct drm_gem_object *obj);
 void i915_gem_lastclose(struct drm_device *dev);
 uint32_t i915_get_gem_seqno(struct drm_device *dev);
+bool i915_seqno_passed(uint32_t seq1, uint32_t seq2);
 int i915_gem_object_get_fence_reg(struct drm_gem_object *obj);
 int i915_gem_object_put_fence_reg(struct drm_gem_object *obj);
 void i915_gem_retire_requests(struct drm_device *dev);
@@ -720,6 +777,9 @@ int i915_gem_object_get_pages(struct drm_gem_object *obj);
 void i915_gem_object_put_pages(struct drm_gem_object *obj);
 void i915_gem_release(struct drm_device * dev, struct drm_file *file_priv);
 
+void i915_gem_shrinker_init(void);
+void i915_gem_shrinker_exit(void);
+
 /* i915_gem_tiling.c */
 void i915_gem_detect_bit_6_swizzle(struct drm_device *dev);
 void i915_gem_object_do_bit_17_swizzle(struct drm_gem_object *obj);
@@ -767,6 +827,8 @@ static inline void opregion_enable_asle(struct drm_device *dev) { return; }
 extern void intel_modeset_init(struct drm_device *dev);
 extern void intel_modeset_cleanup(struct drm_device *dev);
 extern int intel_modeset_vga_set_state(struct drm_device *dev, bool state);
+extern void i8xx_disable_fbc(struct drm_device *dev);
+extern void g4x_disable_fbc(struct drm_device *dev);
 
 /**
  * Lock test for when it's just for synchronization of ring access.
@@ -864,6 +926,7 @@ extern int i915_wait_ring(struct drm_device * dev, int n, const char *caller);
 		       (dev)->pci_device == 0x2E12 || \
 		       (dev)->pci_device == 0x2E22 || \
 		       (dev)->pci_device == 0x2E32 || \
+		       (dev)->pci_device == 0x2E42 || \
 		       (dev)->pci_device == 0x0042 || \
 		       (dev)->pci_device == 0x0046)
 
@@ -876,6 +939,7 @@ extern int i915_wait_ring(struct drm_device * dev, int n, const char *caller);
 		     (dev)->pci_device == 0x2E12 || \
 		     (dev)->pci_device == 0x2E22 || \
 		     (dev)->pci_device == 0x2E32 || \
+		     (dev)->pci_device == 0x2E42 || \
 		     IS_GM45(dev))
 
 #define IS_IGDG(dev) ((dev)->pci_device == 0xa001)
@@ -909,12 +973,13 @@ extern int i915_wait_ring(struct drm_device * dev, int n, const char *caller);
 #define SUPPORTS_INTEGRATED_HDMI(dev)	(IS_G4X(dev) || IS_IGDNG(dev))
 #define SUPPORTS_INTEGRATED_DP(dev)	(IS_G4X(dev) || IS_IGDNG(dev))
 #define SUPPORTS_EDP(dev)		(IS_IGDNG_M(dev))
-#define I915_HAS_HOTPLUG(dev) (IS_I945G(dev) || IS_I945GM(dev) || IS_I965G(dev))
+#define I915_HAS_HOTPLUG(dev) (IS_I945G(dev) || IS_I945GM(dev) || IS_G33(dev) || IS_I965G(dev))
 /* dsparb controlled by hw only */
 #define DSPARB_HWCONTROL(dev) (IS_G4X(dev) || IS_IGDNG(dev))
 
 #define HAS_FW_BLC(dev) (IS_I9XX(dev) || IS_G4X(dev) || IS_IGDNG(dev))
 #define HAS_PIPE_CXSR(dev) (IS_G4X(dev) || IS_IGDNG(dev))
+#define I915_HAS_FBC(dev) (IS_MOBILE(dev) && (IS_I9XX(dev) || IS_I965G(dev)))
 
 #define PRIMARY_RINGBUFFER_SIZE         (128*1024)
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c67317112f4..40727d4c291 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -29,6 +29,7 @@
 #include "drm.h"
 #include "i915_drm.h"
 #include "i915_drv.h"
+#include "i915_trace.h"
 #include "intel_drv.h"
 #include <linux/swap.h>
 #include <linux/pci.h>
@@ -48,11 +49,15 @@ static int i915_gem_object_wait_rendering(struct drm_gem_object *obj);
 static int i915_gem_object_bind_to_gtt(struct drm_gem_object *obj,
 					   unsigned alignment);
 static void i915_gem_clear_fence_reg(struct drm_gem_object *obj);
-static int i915_gem_evict_something(struct drm_device *dev);
+static int i915_gem_evict_something(struct drm_device *dev, int min_size);
+static int i915_gem_evict_from_inactive_list(struct drm_device *dev);
 static int i915_gem_phys_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
 				struct drm_i915_gem_pwrite *args,
 				struct drm_file *file_priv);
 
+static LIST_HEAD(shrink_list);
+static DEFINE_SPINLOCK(shrink_list_lock);
+
 int i915_gem_do_init(struct drm_device *dev, unsigned long start,
 		     unsigned long end)
 {
@@ -316,6 +321,45 @@ fail_unlock:
 	return ret;
 }
 
+static inline gfp_t
+i915_gem_object_get_page_gfp_mask (struct drm_gem_object *obj)
+{
+	return mapping_gfp_mask(obj->filp->f_path.dentry->d_inode->i_mapping);
+}
+
+static inline void
+i915_gem_object_set_page_gfp_mask (struct drm_gem_object *obj, gfp_t gfp)
+{
+	mapping_set_gfp_mask(obj->filp->f_path.dentry->d_inode->i_mapping, gfp);
+}
+
+static int
+i915_gem_object_get_pages_or_evict(struct drm_gem_object *obj)
+{
+	int ret;
+
+	ret = i915_gem_object_get_pages(obj);
+
+	/* If we've insufficient memory to map in the pages, attempt
+	 * to make some space by throwing out some old buffers.
+	 */
+	if (ret == -ENOMEM) {
+		struct drm_device *dev = obj->dev;
+		gfp_t gfp;
+
+		ret = i915_gem_evict_something(dev, obj->size);
+		if (ret)
+			return ret;
+
+		gfp = i915_gem_object_get_page_gfp_mask(obj);
+		i915_gem_object_set_page_gfp_mask(obj, gfp & ~__GFP_NORETRY);
+		ret = i915_gem_object_get_pages(obj);
+		i915_gem_object_set_page_gfp_mask (obj, gfp);
+	}
+
+	return ret;
+}
+
 /**
  * This is the fallback shmem pread path, which allocates temporary storage
  * in kernel space to copy_to_user into outside of the struct_mutex, so we
@@ -367,8 +411,8 @@ i915_gem_shmem_pread_slow(struct drm_device *dev, struct drm_gem_object *obj,
 
 	mutex_lock(&dev->struct_mutex);
 
-	ret = i915_gem_object_get_pages(obj);
-	if (ret != 0)
+	ret = i915_gem_object_get_pages_or_evict(obj);
+	if (ret)
 		goto fail_unlock;
 
 	ret = i915_gem_object_set_cpu_read_domain_range(obj, args->offset,
@@ -842,8 +886,8 @@ i915_gem_shmem_pwrite_slow(struct drm_device *dev, struct drm_gem_object *obj,
 
 	mutex_lock(&dev->struct_mutex);
 
-	ret = i915_gem_object_get_pages(obj);
-	if (ret != 0)
+	ret = i915_gem_object_get_pages_or_evict(obj);
+	if (ret)
 		goto fail_unlock;
 
 	ret = i915_gem_object_set_to_cpu_domain(obj, 1);
@@ -1155,28 +1199,22 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	/* Now bind it into the GTT if needed */
 	mutex_lock(&dev->struct_mutex);
 	if (!obj_priv->gtt_space) {
-		ret = i915_gem_object_bind_to_gtt(obj, obj_priv->gtt_alignment);
-		if (ret) {
-			mutex_unlock(&dev->struct_mutex);
-			return VM_FAULT_SIGBUS;
-		}
-
-		ret = i915_gem_object_set_to_gtt_domain(obj, write);
-		if (ret) {
-			mutex_unlock(&dev->struct_mutex);
-			return VM_FAULT_SIGBUS;
-		}
+		ret = i915_gem_object_bind_to_gtt(obj, 0);
+		if (ret)
+			goto unlock;
 
 		list_add_tail(&obj_priv->list, &dev_priv->mm.inactive_list);
+
+		ret = i915_gem_object_set_to_gtt_domain(obj, write);
+		if (ret)
+			goto unlock;
 	}
 
 	/* Need a new fence register? */
 	if (obj_priv->tiling_mode != I915_TILING_NONE) {
 		ret = i915_gem_object_get_fence_reg(obj);
-		if (ret) {
-			mutex_unlock(&dev->struct_mutex);
-			return VM_FAULT_SIGBUS;
-		}
+		if (ret)
+			goto unlock;
 	}
 
 	pfn = ((dev->agp->base + obj_priv->gtt_offset) >> PAGE_SHIFT) +
@@ -1184,18 +1222,18 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	/* Finally, remap it using the new GTT offset */
 	ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
-
+unlock:
 	mutex_unlock(&dev->struct_mutex);
 
 	switch (ret) {
+	case 0:
+	case -ERESTARTSYS:
+		return VM_FAULT_NOPAGE;
 	case -ENOMEM:
 	case -EAGAIN:
 		return VM_FAULT_OOM;
-	case -EFAULT:
-	case -EINVAL:
-		return VM_FAULT_SIGBUS;
 	default:
-		return VM_FAULT_NOPAGE;
+		return VM_FAULT_SIGBUS;
 	}
 }
 
@@ -1388,6 +1426,14 @@ i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
 
 	obj_priv = obj->driver_private;
 
+	if (obj_priv->madv != I915_MADV_WILLNEED) {
+		DRM_ERROR("Attempting to mmap a purgeable buffer\n");
+		drm_gem_object_unreference(obj);
+		mutex_unlock(&dev->struct_mutex);
+		return -EINVAL;
+	}
+
+
 	if (!obj_priv->mmap_offset) {
 		ret = i915_gem_create_mmap_offset(obj);
 		if (ret) {
@@ -1399,22 +1445,12 @@ i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
 
 	args->offset = obj_priv->mmap_offset;
 
-	obj_priv->gtt_alignment = i915_gem_get_gtt_alignment(obj);
-
-	/* Make sure the alignment is correct for fence regs etc */
-	if (obj_priv->agp_mem &&
-	    (obj_priv->gtt_offset & (obj_priv->gtt_alignment - 1))) {
-		drm_gem_object_unreference(obj);
-		mutex_unlock(&dev->struct_mutex);
-		return -EINVAL;
-	}
-
 	/*
 	 * Pull it into the GTT so that we have a page list (makes the
 	 * initial fault faster and any subsequent flushing possible).
 	 */
 	if (!obj_priv->agp_mem) {
-		ret = i915_gem_object_bind_to_gtt(obj, obj_priv->gtt_alignment);
+		ret = i915_gem_object_bind_to_gtt(obj, 0);
 		if (ret) {
 			drm_gem_object_unreference(obj);
 			mutex_unlock(&dev->struct_mutex);
@@ -1437,6 +1473,7 @@ i915_gem_object_put_pages(struct drm_gem_object *obj)
 	int i;
 
 	BUG_ON(obj_priv->pages_refcount == 0);
+	BUG_ON(obj_priv->madv == __I915_MADV_PURGED);
 
 	if (--obj_priv->pages_refcount != 0)
 		return;
@@ -1444,13 +1481,21 @@ i915_gem_object_put_pages(struct drm_gem_object *obj)
 	if (obj_priv->tiling_mode != I915_TILING_NONE)
 		i915_gem_object_save_bit_17_swizzle(obj);
 
-	for (i = 0; i < page_count; i++)
-		if (obj_priv->pages[i] != NULL) {
-			if (obj_priv->dirty)
-				set_page_dirty(obj_priv->pages[i]);
+	if (obj_priv->madv == I915_MADV_DONTNEED)
+		obj_priv->dirty = 0;
+
+	for (i = 0; i < page_count; i++) {
+		if (obj_priv->pages[i] == NULL)
+			break;
+
+		if (obj_priv->dirty)
+			set_page_dirty(obj_priv->pages[i]);
+
+		if (obj_priv->madv == I915_MADV_WILLNEED)
 			mark_page_accessed(obj_priv->pages[i]);
-			page_cache_release(obj_priv->pages[i]);
-		}
+
+		page_cache_release(obj_priv->pages[i]);
+	}
 	obj_priv->dirty = 0;
 
 	drm_free_large(obj_priv->pages);
@@ -1489,6 +1534,26 @@ i915_gem_object_move_to_flushing(struct drm_gem_object *obj)
 	obj_priv->last_rendering_seqno = 0;
 }
 
+/* Immediately discard the backing storage */
+static void
+i915_gem_object_truncate(struct drm_gem_object *obj)
+{
+	struct drm_i915_gem_object *obj_priv = obj->driver_private;
+	struct inode *inode;
+
+	inode = obj->filp->f_path.dentry->d_inode;
+	if (inode->i_op->truncate)
+		inode->i_op->truncate (inode);
+
+	obj_priv->madv = __I915_MADV_PURGED;
+}
+
+static inline int
+i915_gem_object_is_purgeable(struct drm_i915_gem_object *obj_priv)
+{
+	return obj_priv->madv == I915_MADV_DONTNEED;
+}
+
 static void
 i915_gem_object_move_to_inactive(struct drm_gem_object *obj)
 {
@@ -1577,15 +1642,24 @@ i915_add_request(struct drm_device *dev, struct drm_file *file_priv,
 
 			if ((obj->write_domain & flush_domains) ==
 			    obj->write_domain) {
+				uint32_t old_write_domain = obj->write_domain;
+
 				obj->write_domain = 0;
 				i915_gem_object_move_to_active(obj, seqno);
+
+				trace_i915_gem_object_change_domain(obj,
+								    obj->read_domains,
+								    old_write_domain);
 			}
 		}
 
 	}
 
-	if (was_empty && !dev_priv->mm.suspended)
-		queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, HZ);
+	if (!dev_priv->mm.suspended) {
+		mod_timer(&dev_priv->hangcheck_timer, jiffies + DRM_I915_HANGCHECK_PERIOD);
+		if (was_empty)
+			queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, HZ);
+	}
 	return seqno;
 }
 
@@ -1623,6 +1697,8 @@ i915_gem_retire_request(struct drm_device *dev,
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 
+	trace_i915_gem_request_retire(dev, request->seqno);
+
 	/* Move any buffers on the active list that are no longer referenced
 	 * by the ringbuffer to the flushing/inactive lists as appropriate.
 	 */
@@ -1671,7 +1747,7 @@ out:
 /**
  * Returns true if seq1 is later than seq2.
  */
-static int
+bool
 i915_seqno_passed(uint32_t seq1, uint32_t seq2)
 {
 	return (int32_t)(seq1 - seq2) >= 0;
@@ -1709,7 +1785,7 @@ i915_gem_retire_requests(struct drm_device *dev)
 		retiring_seqno = request->seqno;
 
 		if (i915_seqno_passed(seqno, retiring_seqno) ||
-		    dev_priv->mm.wedged) {
+		    atomic_read(&dev_priv->mm.wedged)) {
 			i915_gem_retire_request(dev, request);
 
 			list_del(&request->list);
@@ -1751,6 +1827,9 @@ i915_wait_request(struct drm_device *dev, uint32_t seqno)
 
 	BUG_ON(seqno == 0);
 
+	if (atomic_read(&dev_priv->mm.wedged))
+		return -EIO;
+
 	if (!i915_seqno_passed(i915_get_gem_seqno(dev), seqno)) {
 		if (IS_IGDNG(dev))
 			ier = I915_READ(DEIER) | I915_READ(GTIER);
@@ -1763,16 +1842,20 @@ i915_wait_request(struct drm_device *dev, uint32_t seqno)
 			i915_driver_irq_postinstall(dev);
 		}
 
+		trace_i915_gem_request_wait_begin(dev, seqno);
+
 		dev_priv->mm.waiting_gem_seqno = seqno;
 		i915_user_irq_get(dev);
 		ret = wait_event_interruptible(dev_priv->irq_queue,
 					       i915_seqno_passed(i915_get_gem_seqno(dev),
 								 seqno) ||
-					       dev_priv->mm.wedged);
+					       atomic_read(&dev_priv->mm.wedged));
 		i915_user_irq_put(dev);
 		dev_priv->mm.waiting_gem_seqno = 0;
+
+		trace_i915_gem_request_wait_end(dev, seqno);
 	}
-	if (dev_priv->mm.wedged)
+	if (atomic_read(&dev_priv->mm.wedged))
 		ret = -EIO;
 
 	if (ret && ret != -ERESTARTSYS)
@@ -1803,6 +1886,8 @@ i915_gem_flush(struct drm_device *dev,
 	DRM_INFO("%s: invalidate %08x flush %08x\n", __func__,
 		  invalidate_domains, flush_domains);
 #endif
+	trace_i915_gem_request_flush(dev, dev_priv->mm.next_gem_seqno,
+				     invalidate_domains, flush_domains);
 
 	if (flush_domains & I915_GEM_DOMAIN_CPU)
 		drm_agp_chipset_flush(dev);
@@ -1915,6 +2000,12 @@ i915_gem_object_unbind(struct drm_gem_object *obj)
 		return -EINVAL;
 	}
 
+	/* blow away mappings if mapped through GTT */
+	i915_gem_release_mmap(obj);
+
+	if (obj_priv->fence_reg != I915_FENCE_REG_NONE)
+		i915_gem_clear_fence_reg(obj);
+
 	/* Move the object to the CPU domain to ensure that
 	 * any possible CPU writes while it's not in the GTT
 	 * are flushed when we go to remap it. This will
@@ -1928,21 +2019,16 @@ i915_gem_object_unbind(struct drm_gem_object *obj)
 		return ret;
 	}
 
+	BUG_ON(obj_priv->active);
+
 	if (obj_priv->agp_mem != NULL) {
 		drm_unbind_agp(obj_priv->agp_mem);
 		drm_free_agp(obj_priv->agp_mem, obj->size / PAGE_SIZE);
 		obj_priv->agp_mem = NULL;
 	}
 
-	BUG_ON(obj_priv->active);
-
-	/* blow away mappings if mapped through GTT */
-	i915_gem_release_mmap(obj);
-
-	if (obj_priv->fence_reg != I915_FENCE_REG_NONE)
-		i915_gem_clear_fence_reg(obj);
-
 	i915_gem_object_put_pages(obj);
+	BUG_ON(obj_priv->pages_refcount);
 
 	if (obj_priv->gtt_space) {
 		atomic_dec(&dev->gtt_count);
@@ -1956,40 +2042,113 @@ i915_gem_object_unbind(struct drm_gem_object *obj)
 	if (!list_empty(&obj_priv->list))
 		list_del_init(&obj_priv->list);
 
+	if (i915_gem_object_is_purgeable(obj_priv))
+		i915_gem_object_truncate(obj);
+
+	trace_i915_gem_object_unbind(obj);
+
 	return 0;
 }
 
+static struct drm_gem_object *
+i915_gem_find_inactive_object(struct drm_device *dev, int min_size)
+{
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	struct drm_i915_gem_object *obj_priv;
+	struct drm_gem_object *best = NULL;
+	struct drm_gem_object *first = NULL;
+
+	/* Try to find the smallest clean object */
+	list_for_each_entry(obj_priv, &dev_priv->mm.inactive_list, list) {
+		struct drm_gem_object *obj = obj_priv->obj;
+		if (obj->size >= min_size) {
+			if ((!obj_priv->dirty ||
+			     i915_gem_object_is_purgeable(obj_priv)) &&
+			    (!best || obj->size < best->size)) {
+				best = obj;
+				if (best->size == min_size)
+					return best;
+			}
+			if (!first)
+			    first = obj;
+		}
+	}
+
+	return best ? best : first;
+}
+
 static int
-i915_gem_evict_something(struct drm_device *dev)
+i915_gem_evict_everything(struct drm_device *dev)
+{
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	uint32_t seqno;
+	int ret;
+	bool lists_empty;
+
+	spin_lock(&dev_priv->mm.active_list_lock);
+	lists_empty = (list_empty(&dev_priv->mm.inactive_list) &&
+		       list_empty(&dev_priv->mm.flushing_list) &&
+		       list_empty(&dev_priv->mm.active_list));
+	spin_unlock(&dev_priv->mm.active_list_lock);
+
+	if (lists_empty)
+		return -ENOSPC;
+
+	/* Flush everything (on to the inactive lists) and evict */
+	i915_gem_flush(dev, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
+	seqno = i915_add_request(dev, NULL, I915_GEM_GPU_DOMAINS);
+	if (seqno == 0)
+		return -ENOMEM;
+
+	ret = i915_wait_request(dev, seqno);
+	if (ret)
+		return ret;
+
+	ret = i915_gem_evict_from_inactive_list(dev);
+	if (ret)
+		return ret;
+
+	spin_lock(&dev_priv->mm.active_list_lock);
+	lists_empty = (list_empty(&dev_priv->mm.inactive_list) &&
+		       list_empty(&dev_priv->mm.flushing_list) &&
+		       list_empty(&dev_priv->mm.active_list));
+	spin_unlock(&dev_priv->mm.active_list_lock);
+	BUG_ON(!lists_empty);
+
+	return 0;
+}
+
+static int
+i915_gem_evict_something(struct drm_device *dev, int min_size)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_gem_object *obj;
-	struct drm_i915_gem_object *obj_priv;
-	int ret = 0;
+	int ret;
 
 	for (;;) {
+		i915_gem_retire_requests(dev);
+
 		/* If there's an inactive buffer available now, grab it
 		 * and be done.
 		 */
-		if (!list_empty(&dev_priv->mm.inactive_list)) {
-			obj_priv = list_first_entry(&dev_priv->mm.inactive_list,
-						    struct drm_i915_gem_object,
-						    list);
-			obj = obj_priv->obj;
-			BUG_ON(obj_priv->pin_count != 0);
+		obj = i915_gem_find_inactive_object(dev, min_size);
+		if (obj) {
+			struct drm_i915_gem_object *obj_priv;
+
 #if WATCH_LRU
 			DRM_INFO("%s: evicting %p\n", __func__, obj);
 #endif
+			obj_priv = obj->driver_private;
+			BUG_ON(obj_priv->pin_count != 0);
 			BUG_ON(obj_priv->active);
 
 			/* Wait on the rendering and unbind the buffer. */
-			ret = i915_gem_object_unbind(obj);
-			break;
+			return i915_gem_object_unbind(obj);
 		}
 
 		/* If we didn't get anything, but the ring is still processing
-		 * things, wait for one of those things to finish and hopefully
-		 * leave us a buffer to evict.
+		 * things, wait for the next to finish and hopefully leave us
+		 * a buffer to evict.
 		 */
 		if (!list_empty(&dev_priv->mm.request_list)) {
 			struct drm_i915_gem_request *request;
@@ -2000,16 +2159,9 @@ i915_gem_evict_something(struct drm_device *dev)
 
 			ret = i915_wait_request(dev, request->seqno);
 			if (ret)
-				break;
+				return ret;
 
-			/* if waiting caused an object to become inactive,
-			 * then loop around and wait for it. Otherwise, we
-			 * assume that waiting freed and unbound something,
-			 * so there should now be some space in the GTT
-			 */
-			if (!list_empty(&dev_priv->mm.inactive_list))
-				continue;
-			break;
+			continue;
 		}
 
 		/* If we didn't have anything on the request list but there
@@ -2018,46 +2170,44 @@ i915_gem_evict_something(struct drm_device *dev)
 		 * will get moved to inactive.
 		 */
 		if (!list_empty(&dev_priv->mm.flushing_list)) {
-			obj_priv = list_first_entry(&dev_priv->mm.flushing_list,
-						    struct drm_i915_gem_object,
-						    list);
-			obj = obj_priv->obj;
+			struct drm_i915_gem_object *obj_priv;
 
-			i915_gem_flush(dev,
-				       obj->write_domain,
-				       obj->write_domain);
-			i915_add_request(dev, NULL, obj->write_domain);
+			/* Find an object that we can immediately reuse */
+			list_for_each_entry(obj_priv, &dev_priv->mm.flushing_list, list) {
+				obj = obj_priv->obj;
+				if (obj->size >= min_size)
+					break;
 
-			obj = NULL;
-			continue;
-		}
+				obj = NULL;
+			}
 
-		DRM_ERROR("inactive empty %d request empty %d "
-			  "flushing empty %d\n",
-			  list_empty(&dev_priv->mm.inactive_list),
-			  list_empty(&dev_priv->mm.request_list),
-			  list_empty(&dev_priv->mm.flushing_list));
-		/* If we didn't do any of the above, there's nothing to be done
-		 * and we just can't fit it in.
-		 */
-		return -ENOSPC;
-	}
-	return ret;
-}
+			if (obj != NULL) {
+				uint32_t seqno;
 
-static int
-i915_gem_evict_everything(struct drm_device *dev)
-{
-	int ret;
+				i915_gem_flush(dev,
+					       obj->write_domain,
+					       obj->write_domain);
+				seqno = i915_add_request(dev, NULL, obj->write_domain);
+				if (seqno == 0)
+					return -ENOMEM;
 
-	for (;;) {
-		ret = i915_gem_evict_something(dev);
-		if (ret != 0)
-			break;
+				ret = i915_wait_request(dev, seqno);
+				if (ret)
+					return ret;
+
+				continue;
+			}
+		}
+
+		/* If we didn't do any of the above, there's no single buffer
+		 * large enough to swap out for the new one, so just evict
+		 * everything and start again. (This should be rare.)
+		 */
+		if (!list_empty (&dev_priv->mm.inactive_list))
+			return i915_gem_evict_from_inactive_list(dev);
+		else
+			return i915_gem_evict_everything(dev);
 	}
-	if (ret == -ENOSPC)
-		return 0;
-	return ret;
 }
 
 int
@@ -2080,7 +2230,6 @@ i915_gem_object_get_pages(struct drm_gem_object *obj)
 	BUG_ON(obj_priv->pages != NULL);
 	obj_priv->pages = drm_calloc_large(page_count, sizeof(struct page *));
 	if (obj_priv->pages == NULL) {
-		DRM_ERROR("Faled to allocate page list\n");
 		obj_priv->pages_refcount--;
 		return -ENOMEM;
 	}
@@ -2091,7 +2240,6 @@ i915_gem_object_get_pages(struct drm_gem_object *obj)
 		page = read_mapping_page(mapping, i, NULL);
 		if (IS_ERR(page)) {
 			ret = PTR_ERR(page);
-			DRM_ERROR("read_mapping_page failed: %d\n", ret);
 			i915_gem_object_put_pages(obj);
 			return ret;
 		}
@@ -2328,6 +2476,8 @@ i915_gem_object_get_fence_reg(struct drm_gem_object *obj)
 	else
 		i830_write_fence_reg(reg);
 
+	trace_i915_gem_object_get_fence(obj, i, obj_priv->tiling_mode);
+
 	return 0;
 }
 
@@ -2410,10 +2560,17 @@ i915_gem_object_bind_to_gtt(struct drm_gem_object *obj, unsigned alignment)
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_gem_object *obj_priv = obj->driver_private;
 	struct drm_mm_node *free_space;
-	int page_count, ret;
+	bool retry_alloc = false;
+	int ret;
 
 	if (dev_priv->mm.suspended)
 		return -EBUSY;
+
+	if (obj_priv->madv != I915_MADV_WILLNEED) {
+		DRM_ERROR("Attempting to bind a purgeable object\n");
+		return -EINVAL;
+	}
+
 	if (alignment == 0)
 		alignment = i915_gem_get_gtt_alignment(obj);
 	if (alignment & (i915_gem_get_gtt_alignment(obj) - 1)) {
@@ -2433,30 +2590,16 @@ i915_gem_object_bind_to_gtt(struct drm_gem_object *obj, unsigned alignment)
 		}
 	}
 	if (obj_priv->gtt_space == NULL) {
-		bool lists_empty;
-
 		/* If the gtt is empty and we're still having trouble
 		 * fitting our object in, we're out of memory.
 		 */
 #if WATCH_LRU
 		DRM_INFO("%s: GTT full, evicting something\n", __func__);
 #endif
-		spin_lock(&dev_priv->mm.active_list_lock);
-		lists_empty = (list_empty(&dev_priv->mm.inactive_list) &&
-			       list_empty(&dev_priv->mm.flushing_list) &&
-			       list_empty(&dev_priv->mm.active_list));
-		spin_unlock(&dev_priv->mm.active_list_lock);
-		if (lists_empty) {
-			DRM_ERROR("GTT full, but LRU list empty\n");
-			return -ENOSPC;
-		}
-
-		ret = i915_gem_evict_something(dev);
-		if (ret != 0) {
-			if (ret != -ERESTARTSYS)
-				DRM_ERROR("Failed to evict a buffer %d\n", ret);
+		ret = i915_gem_evict_something(dev, obj->size);
+		if (ret)
 			return ret;
-		}
+
 		goto search_free;
 	}
 
@@ -2464,27 +2607,56 @@ i915_gem_object_bind_to_gtt(struct drm_gem_object *obj, unsigned alignment)
 	DRM_INFO("Binding object of size %zd at 0x%08x\n",
 		 obj->size, obj_priv->gtt_offset);
 #endif
+	if (retry_alloc) {
+		i915_gem_object_set_page_gfp_mask (obj,
+						   i915_gem_object_get_page_gfp_mask (obj) & ~__GFP_NORETRY);
+	}
 	ret = i915_gem_object_get_pages(obj);
+	if (retry_alloc) {
+		i915_gem_object_set_page_gfp_mask (obj,
+						   i915_gem_object_get_page_gfp_mask (obj) | __GFP_NORETRY);
+	}
 	if (ret) {
 		drm_mm_put_block(obj_priv->gtt_space);
 		obj_priv->gtt_space = NULL;
+
+		if (ret == -ENOMEM) {
+			/* first try to clear up some space from the GTT */
+			ret = i915_gem_evict_something(dev, obj->size);
+			if (ret) {
+				/* now try to shrink everyone else */
+				if (! retry_alloc) {
+				    retry_alloc = true;
+				    goto search_free;
+				}
+
+				return ret;
+			}
+
+			goto search_free;
+		}
+
 		return ret;
 	}
 
-	page_count = obj->size / PAGE_SIZE;
 	/* Create an AGP memory structure pointing at our pages, and bind it
 	 * into the GTT.
 	 */
 	obj_priv->agp_mem = drm_agp_bind_pages(dev,
 					       obj_priv->pages,
-					       page_count,
+					       obj->size >> PAGE_SHIFT,
 					       obj_priv->gtt_offset,
 					       obj_priv->agp_type);
 	if (obj_priv->agp_mem == NULL) {
 		i915_gem_object_put_pages(obj);
 		drm_mm_put_block(obj_priv->gtt_space);
 		obj_priv->gtt_space = NULL;
-		return -ENOMEM;
+
+		ret = i915_gem_evict_something(dev, obj->size);
+		if (ret)
+			return ret;
+
+		goto search_free;
 	}
 	atomic_inc(&dev->gtt_count);
 	atomic_add(obj->size, &dev->gtt_memory);
@@ -2496,6 +2668,8 @@ i915_gem_object_bind_to_gtt(struct drm_gem_object *obj, unsigned alignment)
 	BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
 	BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
 
+	trace_i915_gem_object_bind(obj, obj_priv->gtt_offset);
+
 	return 0;
 }
 
@@ -2511,15 +2685,7 @@ i915_gem_clflush_object(struct drm_gem_object *obj)
 	if (obj_priv->pages == NULL)
 		return;
 
-	/* XXX: The 865 in particular appears to be weird in how it handles
-	 * cache flushing.  We haven't figured it out, but the
-	 * clflush+agp_chipset_flush doesn't appear to successfully get the
-	 * data visible to the PGU, while wbinvd + agp_chipset_flush does.
-	 */
-	if (IS_I865G(obj->dev)) {
-		wbinvd();
-		return;
-	}
+	trace_i915_gem_object_clflush(obj);
 
 	drm_clflush_pages(obj_priv->pages, obj->size / PAGE_SIZE);
 }
@@ -2530,21 +2696,29 @@ i915_gem_object_flush_gpu_write_domain(struct drm_gem_object *obj)
 {
 	struct drm_device *dev = obj->dev;
 	uint32_t seqno;
+	uint32_t old_write_domain;
 
 	if ((obj->write_domain & I915_GEM_GPU_DOMAINS) == 0)
 		return;
 
 	/* Queue the GPU write cache flushing we need. */
+	old_write_domain = obj->write_domain;
 	i915_gem_flush(dev, 0, obj->write_domain);
 	seqno = i915_add_request(dev, NULL, obj->write_domain);
 	obj->write_domain = 0;
 	i915_gem_object_move_to_active(obj, seqno);
+
+	trace_i915_gem_object_change_domain(obj,
+					    obj->read_domains,
+					    old_write_domain);
 }
 
 /** Flushes the GTT write domain for the object if it's dirty. */
 static void
 i915_gem_object_flush_gtt_write_domain(struct drm_gem_object *obj)
 {
+	uint32_t old_write_domain;
+
 	if (obj->write_domain != I915_GEM_DOMAIN_GTT)
 		return;
 
@@ -2552,7 +2726,12 @@ i915_gem_object_flush_gtt_write_domain(struct drm_gem_object *obj)
 	 * to it immediately go to main memory as far as we know, so there's
 	 * no chipset flush.  It also doesn't land in render cache.
 	 */
+	old_write_domain = obj->write_domain;
 	obj->write_domain = 0;
+
+	trace_i915_gem_object_change_domain(obj,
+					    obj->read_domains,
+					    old_write_domain);
 }
 
 /** Flushes the CPU write domain for the object if it's dirty. */
@@ -2560,13 +2739,19 @@ static void
 i915_gem_object_flush_cpu_write_domain(struct drm_gem_object *obj)
 {
 	struct drm_device *dev = obj->dev;
+	uint32_t old_write_domain;
 
 	if (obj->write_domain != I915_GEM_DOMAIN_CPU)
 		return;
 
 	i915_gem_clflush_object(obj);
 	drm_agp_chipset_flush(dev);
+	old_write_domain = obj->write_domain;
 	obj->write_domain = 0;
+
+	trace_i915_gem_object_change_domain(obj,
+					    obj->read_domains,
+					    old_write_domain);
 }
 
 /**
@@ -2579,6 +2764,7 @@ int
 i915_gem_object_set_to_gtt_domain(struct drm_gem_object *obj, int write)
 {
 	struct drm_i915_gem_object *obj_priv = obj->driver_private;
+	uint32_t old_write_domain, old_read_domains;
 	int ret;
 
 	/* Not valid to be called on unbound objects. */
@@ -2591,6 +2777,9 @@ i915_gem_object_set_to_gtt_domain(struct drm_gem_object *obj, int write)
 	if (ret != 0)
 		return ret;
 
+	old_write_domain = obj->write_domain;
+	old_read_domains = obj->read_domains;
+
 	/* If we're writing through the GTT domain, then CPU and GPU caches
 	 * will need to be invalidated at next use.
 	 */
@@ -2609,6 +2798,10 @@ i915_gem_object_set_to_gtt_domain(struct drm_gem_object *obj, int write)
 		obj_priv->dirty = 1;
 	}
 
+	trace_i915_gem_object_change_domain(obj,
+					    old_read_domains,
+					    old_write_domain);
+
 	return 0;
 }
 
@@ -2621,6 +2814,7 @@ i915_gem_object_set_to_gtt_domain(struct drm_gem_object *obj, int write)
 static int
 i915_gem_object_set_to_cpu_domain(struct drm_gem_object *obj, int write)
 {
+	uint32_t old_write_domain, old_read_domains;
 	int ret;
 
 	i915_gem_object_flush_gpu_write_domain(obj);
@@ -2636,6 +2830,9 @@ i915_gem_object_set_to_cpu_domain(struct drm_gem_object *obj, int write)
 	 */
 	i915_gem_object_set_to_full_cpu_read_domain(obj);
 
+	old_write_domain = obj->write_domain;
+	old_read_domains = obj->read_domains;
+
 	/* Flush the CPU cache if it's still invalid. */
 	if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
 		i915_gem_clflush_object(obj);
@@ -2656,6 +2853,10 @@ i915_gem_object_set_to_cpu_domain(struct drm_gem_object *obj, int write)
 		obj->write_domain = I915_GEM_DOMAIN_CPU;
 	}
 
+	trace_i915_gem_object_change_domain(obj,
+					    old_read_domains,
+					    old_write_domain);
+
 	return 0;
 }
 
@@ -2777,6 +2978,7 @@ i915_gem_object_set_to_gpu_domain(struct drm_gem_object *obj)
 	struct drm_i915_gem_object	*obj_priv = obj->driver_private;
 	uint32_t			invalidate_domains = 0;
 	uint32_t			flush_domains = 0;
+	uint32_t			old_read_domains;
 
 	BUG_ON(obj->pending_read_domains & I915_GEM_DOMAIN_CPU);
 	BUG_ON(obj->pending_write_domain == I915_GEM_DOMAIN_CPU);
@@ -2823,6 +3025,8 @@ i915_gem_object_set_to_gpu_domain(struct drm_gem_object *obj)
 		i915_gem_clflush_object(obj);
 	}
 
+	old_read_domains = obj->read_domains;
+
 	/* The actual obj->write_domain will be updated with
 	 * pending_write_domain after we emit the accumulated flush for all
 	 * of our domain changes in execbuffers (which clears objects'
@@ -2841,6 +3045,10 @@ i915_gem_object_set_to_gpu_domain(struct drm_gem_object *obj)
 		 obj->read_domains, obj->write_domain,
 		 dev->invalidate_domains, dev->flush_domains);
 #endif
+
+	trace_i915_gem_object_change_domain(obj,
+					    old_read_domains,
+					    obj->write_domain);
 }
 
 /**
@@ -2893,6 +3101,7 @@ i915_gem_object_set_cpu_read_domain_range(struct drm_gem_object *obj,
 					  uint64_t offset, uint64_t size)
 {
 	struct drm_i915_gem_object *obj_priv = obj->driver_private;
+	uint32_t old_read_domains;
 	int i, ret;
 
 	if (offset == 0 && size == obj->size)
@@ -2939,8 +3148,13 @@ i915_gem_object_set_cpu_read_domain_range(struct drm_gem_object *obj,
 	 */
 	BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
 
+	old_read_domains = obj->read_domains;
 	obj->read_domains |= I915_GEM_DOMAIN_CPU;
 
+	trace_i915_gem_object_change_domain(obj,
+					    old_read_domains,
+					    obj->write_domain);
+
 	return 0;
 }
 
@@ -2984,6 +3198,21 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
 		}
 		target_obj_priv = target_obj->driver_private;
 
+#if WATCH_RELOC
+		DRM_INFO("%s: obj %p offset %08x target %d "
+			 "read %08x write %08x gtt %08x "
+			 "presumed %08x delta %08x\n",
+			 __func__,
+			 obj,
+			 (int) reloc->offset,
+			 (int) reloc->target_handle,
+			 (int) reloc->read_domains,
+			 (int) reloc->write_domain,
+			 (int) target_obj_priv->gtt_offset,
+			 (int) reloc->presumed_offset,
+			 reloc->delta);
+#endif
+
 		/* The target buffer should have appeared before us in the
 		 * exec_object list, so it should have a GTT space bound by now.
 		 */
@@ -2995,25 +3224,7 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
 			return -EINVAL;
 		}
 
-		if (reloc->offset > obj->size - 4) {
-			DRM_ERROR("Relocation beyond object bounds: "
-				  "obj %p target %d offset %d size %d.\n",
-				  obj, reloc->target_handle,
-				  (int) reloc->offset, (int) obj->size);
-			drm_gem_object_unreference(target_obj);
-			i915_gem_object_unpin(obj);
-			return -EINVAL;
-		}
-		if (reloc->offset & 3) {
-			DRM_ERROR("Relocation not 4-byte aligned: "
-				  "obj %p target %d offset %d.\n",
-				  obj, reloc->target_handle,
-				  (int) reloc->offset);
-			drm_gem_object_unreference(target_obj);
-			i915_gem_object_unpin(obj);
-			return -EINVAL;
-		}
-
+		/* Validate that the target is in a valid r/w GPU domain */
 		if (reloc->write_domain & I915_GEM_DOMAIN_CPU ||
 		    reloc->read_domains & I915_GEM_DOMAIN_CPU) {
 			DRM_ERROR("reloc with read/write CPU domains: "
@@ -3027,7 +3238,6 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
 			i915_gem_object_unpin(obj);
 			return -EINVAL;
 		}
-
 		if (reloc->write_domain && target_obj->pending_write_domain &&
 		    reloc->write_domain != target_obj->pending_write_domain) {
 			DRM_ERROR("Write domain conflict: "
@@ -3042,21 +3252,6 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
 			return -EINVAL;
 		}
 
-#if WATCH_RELOC
-		DRM_INFO("%s: obj %p offset %08x target %d "
-			 "read %08x write %08x gtt %08x "
-			 "presumed %08x delta %08x\n",
-			 __func__,
-			 obj,
-			 (int) reloc->offset,
-			 (int) reloc->target_handle,
-			 (int) reloc->read_domains,
-			 (int) reloc->write_domain,
-			 (int) target_obj_priv->gtt_offset,
-			 (int) reloc->presumed_offset,
-			 reloc->delta);
-#endif
-
 		target_obj->pending_read_domains |= reloc->read_domains;
 		target_obj->pending_write_domain |= reloc->write_domain;
 
@@ -3068,6 +3263,37 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
 			continue;
 		}
 
+		/* Check that the relocation address is valid... */
+		if (reloc->offset > obj->size - 4) {
+			DRM_ERROR("Relocation beyond object bounds: "
+				  "obj %p target %d offset %d size %d.\n",
+				  obj, reloc->target_handle,
+				  (int) reloc->offset, (int) obj->size);
+			drm_gem_object_unreference(target_obj);
+			i915_gem_object_unpin(obj);
+			return -EINVAL;
+		}
+		if (reloc->offset & 3) {
+			DRM_ERROR("Relocation not 4-byte aligned: "
+				  "obj %p target %d offset %d.\n",
+				  obj, reloc->target_handle,
+				  (int) reloc->offset);
+			drm_gem_object_unreference(target_obj);
+			i915_gem_object_unpin(obj);
+			return -EINVAL;
+		}
+
+		/* and points to somewhere within the target object. */
+		if (reloc->delta >= target_obj->size) {
+			DRM_ERROR("Relocation beyond target object bounds: "
+				  "obj %p target %d delta %d size %d.\n",
+				  obj, reloc->target_handle,
+				  (int) reloc->delta, (int) target_obj->size);
+			drm_gem_object_unreference(target_obj);
+			i915_gem_object_unpin(obj);
+			return -EINVAL;
+		}
+
 		ret = i915_gem_object_set_to_gtt_domain(obj, 1);
 		if (ret != 0) {
 			drm_gem_object_unreference(target_obj);
@@ -3126,6 +3352,8 @@ i915_dispatch_gem_execbuffer(struct drm_device *dev,
 	exec_start = (uint32_t) exec_offset + exec->batch_start_offset;
 	exec_len = (uint32_t) exec->batch_len;
 
+	trace_i915_gem_request_submit(dev, dev_priv->mm.next_gem_seqno);
+
 	count = nbox ? nbox : 1;
 
 	for (i = 0; i < count; i++) {
@@ -3363,7 +3591,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 
 	i915_verify_inactive(dev, __FILE__, __LINE__);
 
-	if (dev_priv->mm.wedged) {
+	if (atomic_read(&dev_priv->mm.wedged)) {
 		DRM_ERROR("Execbuf while wedged\n");
 		mutex_unlock(&dev->struct_mutex);
 		ret = -EIO;
@@ -3421,8 +3649,23 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 
 		/* error other than GTT full, or we've already tried again */
 		if (ret != -ENOSPC || pin_tries >= 1) {
-			if (ret != -ERESTARTSYS)
-				DRM_ERROR("Failed to pin buffers %d\n", ret);
+			if (ret != -ERESTARTSYS) {
+				unsigned long long total_size = 0;
+				for (i = 0; i < args->buffer_count; i++)
+					total_size += object_list[i]->size;
+				DRM_ERROR("Failed to pin buffer %d of %d, total %llu bytes: %d\n",
+					  pinned+1, args->buffer_count,
+					  total_size, ret);
+				DRM_ERROR("%d objects [%d pinned], "
+					  "%d object bytes [%d pinned], "
+					  "%d/%d gtt bytes\n",
+					  atomic_read(&dev->object_count),
+					  atomic_read(&dev->pin_count),
+					  atomic_read(&dev->object_memory),
+					  atomic_read(&dev->pin_memory),
+					  atomic_read(&dev->gtt_memory),
+					  dev->gtt_total);
+			}
 			goto err;
 		}
 
@@ -3433,7 +3676,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 
 		/* evict everyone we can from the aperture */
 		ret = i915_gem_evict_everything(dev);
-		if (ret)
+		if (ret && ret != -ENOSPC)
 			goto err;
 	}
 
@@ -3489,8 +3732,12 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 
 	for (i = 0; i < args->buffer_count; i++) {
 		struct drm_gem_object *obj = object_list[i];
+		uint32_t old_write_domain = obj->write_domain;
 
 		obj->write_domain = obj->pending_write_domain;
+		trace_i915_gem_object_change_domain(obj,
+						    obj->read_domains,
+						    old_write_domain);
 	}
 
 	i915_verify_inactive(dev, __FILE__, __LINE__);
@@ -3607,11 +3854,8 @@ i915_gem_object_pin(struct drm_gem_object *obj, uint32_t alignment)
 	i915_verify_inactive(dev, __FILE__, __LINE__);
 	if (obj_priv->gtt_space == NULL) {
 		ret = i915_gem_object_bind_to_gtt(obj, alignment);
-		if (ret != 0) {
-			if (ret != -EBUSY && ret != -ERESTARTSYS)
-				DRM_ERROR("Failure to bind: %d\n", ret);
+		if (ret)
 			return ret;
-		}
 	}
 	/*
 	 * Pre-965 chips need a fence register set up in order to
@@ -3691,6 +3935,13 @@ i915_gem_pin_ioctl(struct drm_device *dev, void *data,
 	}
 	obj_priv = obj->driver_private;
 
+	if (obj_priv->madv != I915_MADV_WILLNEED) {
+		DRM_ERROR("Attempting to pin a purgeable buffer\n");
+		drm_gem_object_unreference(obj);
+		mutex_unlock(&dev->struct_mutex);
+		return -EINVAL;
+	}
+
 	if (obj_priv->pin_filp != NULL && obj_priv->pin_filp != file_priv) {
 		DRM_ERROR("Already pinned in i915_gem_pin_ioctl(): %d\n",
 			  args->handle);
@@ -3803,6 +4054,56 @@ i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
     return i915_gem_ring_throttle(dev, file_priv);
 }
 
+int
+i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
+		       struct drm_file *file_priv)
+{
+	struct drm_i915_gem_madvise *args = data;
+	struct drm_gem_object *obj;
+	struct drm_i915_gem_object *obj_priv;
+
+	switch (args->madv) {
+	case I915_MADV_DONTNEED:
+	case I915_MADV_WILLNEED:
+	    break;
+	default:
+	    return -EINVAL;
+	}
+
+	obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+	if (obj == NULL) {
+		DRM_ERROR("Bad handle in i915_gem_madvise_ioctl(): %d\n",
+			  args->handle);
+		return -EBADF;
+	}
+
+	mutex_lock(&dev->struct_mutex);
+	obj_priv = obj->driver_private;
+
+	if (obj_priv->pin_count) {
+		drm_gem_object_unreference(obj);
+		mutex_unlock(&dev->struct_mutex);
+
+		DRM_ERROR("Attempted i915_gem_madvise_ioctl() on a pinned object\n");
+		return -EINVAL;
+	}
+
+	if (obj_priv->madv != __I915_MADV_PURGED)
+		obj_priv->madv = args->madv;
+
+	/* if the object is no longer bound, discard its backing storage */
+	if (i915_gem_object_is_purgeable(obj_priv) &&
+	    obj_priv->gtt_space == NULL)
+		i915_gem_object_truncate(obj);
+
+	args->retained = obj_priv->madv != __I915_MADV_PURGED;
+
+	drm_gem_object_unreference(obj);
+	mutex_unlock(&dev->struct_mutex);
+
+	return 0;
+}
+
 int i915_gem_init_object(struct drm_gem_object *obj)
 {
 	struct drm_i915_gem_object *obj_priv;
@@ -3827,6 +4128,9 @@ int i915_gem_init_object(struct drm_gem_object *obj)
 	obj_priv->fence_reg = I915_FENCE_REG_NONE;
 	INIT_LIST_HEAD(&obj_priv->list);
 	INIT_LIST_HEAD(&obj_priv->fence_list);
+	obj_priv->madv = I915_MADV_WILLNEED;
+
+	trace_i915_gem_object_create(obj);
 
 	return 0;
 }
@@ -3836,6 +4140,8 @@ void i915_gem_free_object(struct drm_gem_object *obj)
 	struct drm_device *dev = obj->dev;
 	struct drm_i915_gem_object *obj_priv = obj->driver_private;
 
+	trace_i915_gem_object_destroy(obj);
+
 	while (obj_priv->pin_count > 0)
 		i915_gem_object_unpin(obj);
 
@@ -3844,43 +4150,35 @@ void i915_gem_free_object(struct drm_gem_object *obj)
 
 	i915_gem_object_unbind(obj);
 
-	i915_gem_free_mmap_offset(obj);
+	if (obj_priv->mmap_offset)
+		i915_gem_free_mmap_offset(obj);
 
 	kfree(obj_priv->page_cpu_valid);
 	kfree(obj_priv->bit_17);
 	kfree(obj->driver_private);
 }
 
-/** Unbinds all objects that are on the given buffer list. */
+/** Unbinds all inactive objects. */
 static int
-i915_gem_evict_from_list(struct drm_device *dev, struct list_head *head)
+i915_gem_evict_from_inactive_list(struct drm_device *dev)
 {
-	struct drm_gem_object *obj;
-	struct drm_i915_gem_object *obj_priv;
-	int ret;
+	drm_i915_private_t *dev_priv = dev->dev_private;
 
-	while (!list_empty(head)) {
-		obj_priv = list_first_entry(head,
-					    struct drm_i915_gem_object,
-					    list);
-		obj = obj_priv->obj;
+	while (!list_empty(&dev_priv->mm.inactive_list)) {
+		struct drm_gem_object *obj;
+		int ret;
 
-		if (obj_priv->pin_count != 0) {
-			DRM_ERROR("Pinned object in unbind list\n");
-			mutex_unlock(&dev->struct_mutex);
-			return -EINVAL;
-		}
+		obj = list_first_entry(&dev_priv->mm.inactive_list,
+				       struct drm_i915_gem_object,
+				       list)->obj;
 
 		ret = i915_gem_object_unbind(obj);
 		if (ret != 0) {
-			DRM_ERROR("Error unbinding object in LeaveVT: %d\n",
-				  ret);
-			mutex_unlock(&dev->struct_mutex);
+			DRM_ERROR("Error unbinding object: %d\n", ret);
 			return ret;
 		}
 	}
 
-
 	return 0;
 }
 
@@ -3902,6 +4200,7 @@ i915_gem_idle(struct drm_device *dev)
 	 * We need to replace this with a semaphore, or something.
 	 */
 	dev_priv->mm.suspended = 1;
+	del_timer(&dev_priv->hangcheck_timer);
 
 	/* Cancel the retire work handler, wait for it to finish if running
 	 */
@@ -3931,7 +4230,7 @@ i915_gem_idle(struct drm_device *dev)
 		if (last_seqno == cur_seqno) {
 			if (stuck++ > 100) {
 				DRM_ERROR("hardware wedged\n");
-				dev_priv->mm.wedged = 1;
+				atomic_set(&dev_priv->mm.wedged, 1);
 				DRM_WAKEUP(&dev_priv->irq_queue);
 				break;
 			}
@@ -3944,7 +4243,7 @@ i915_gem_idle(struct drm_device *dev)
 	i915_gem_retire_requests(dev);
 
 	spin_lock(&dev_priv->mm.active_list_lock);
-	if (!dev_priv->mm.wedged) {
+	if (!atomic_read(&dev_priv->mm.wedged)) {
 		/* Active and flushing should now be empty as we've
 		 * waited for a sequence higher than any pending execbuffer
 		 */
@@ -3962,29 +4261,41 @@ i915_gem_idle(struct drm_device *dev)
 	 * the GPU domains and just stuff them onto inactive.
 	 */
 	while (!list_empty(&dev_priv->mm.active_list)) {
-		struct drm_i915_gem_object *obj_priv;
+		struct drm_gem_object *obj;
+		uint32_t old_write_domain;
 
-		obj_priv = list_first_entry(&dev_priv->mm.active_list,
-					    struct drm_i915_gem_object,
-					    list);
-		obj_priv->obj->write_domain &= ~I915_GEM_GPU_DOMAINS;
-		i915_gem_object_move_to_inactive(obj_priv->obj);
+		obj = list_first_entry(&dev_priv->mm.active_list,
+				       struct drm_i915_gem_object,
+				       list)->obj;
+		old_write_domain = obj->write_domain;
+		obj->write_domain &= ~I915_GEM_GPU_DOMAINS;
+		i915_gem_object_move_to_inactive(obj);
+
+		trace_i915_gem_object_change_domain(obj,
+						    obj->read_domains,
+						    old_write_domain);
 	}
 	spin_unlock(&dev_priv->mm.active_list_lock);
 
 	while (!list_empty(&dev_priv->mm.flushing_list)) {
-		struct drm_i915_gem_object *obj_priv;
+		struct drm_gem_object *obj;
+		uint32_t old_write_domain;
 
-		obj_priv = list_first_entry(&dev_priv->mm.flushing_list,
-					    struct drm_i915_gem_object,
-					    list);
-		obj_priv->obj->write_domain &= ~I915_GEM_GPU_DOMAINS;
-		i915_gem_object_move_to_inactive(obj_priv->obj);
+		obj = list_first_entry(&dev_priv->mm.flushing_list,
+				       struct drm_i915_gem_object,
+				       list)->obj;
+		old_write_domain = obj->write_domain;
+		obj->write_domain &= ~I915_GEM_GPU_DOMAINS;
+		i915_gem_object_move_to_inactive(obj);
+
+		trace_i915_gem_object_change_domain(obj,
+						    obj->read_domains,
+						    old_write_domain);
 	}
 
 
 	/* Move all inactive buffers out of the GTT. */
-	ret = i915_gem_evict_from_list(dev, &dev_priv->mm.inactive_list);
+	ret = i915_gem_evict_from_inactive_list(dev);
 	WARN_ON(!list_empty(&dev_priv->mm.inactive_list));
 	if (ret) {
 		mutex_unlock(&dev->struct_mutex);
@@ -4206,9 +4517,9 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
 	if (drm_core_check_feature(dev, DRIVER_MODESET))
 		return 0;
 
-	if (dev_priv->mm.wedged) {
+	if (atomic_read(&dev_priv->mm.wedged)) {
 		DRM_ERROR("Reenabling wedged hardware, good luck\n");
-		dev_priv->mm.wedged = 0;
+		atomic_set(&dev_priv->mm.wedged, 0);
 	}
 
 	mutex_lock(&dev->struct_mutex);
@@ -4274,6 +4585,10 @@ i915_gem_load(struct drm_device *dev)
 			  i915_gem_retire_work_handler);
 	dev_priv->mm.next_gem_seqno = 1;
 
+	spin_lock(&shrink_list_lock);
+	list_add(&dev_priv->mm.shrink_list, &shrink_list);
+	spin_unlock(&shrink_list_lock);
+
 	/* Old X drivers will take 0-2 for front, back, depth buffers */
 	dev_priv->fence_reg_start = 3;
 
@@ -4491,3 +4806,116 @@ void i915_gem_release(struct drm_device * dev, struct drm_file *file_priv)
 		list_del_init(i915_file_priv->mm.request_list.next);
 	mutex_unlock(&dev->struct_mutex);
 }
+
+static int
+i915_gem_shrink(int nr_to_scan, gfp_t gfp_mask)
+{
+	drm_i915_private_t *dev_priv, *next_dev;
+	struct drm_i915_gem_object *obj_priv, *next_obj;
+	int cnt = 0;
+	int would_deadlock = 1;
+
+	/* "fast-path" to count number of available objects */
+	if (nr_to_scan == 0) {
+		spin_lock(&shrink_list_lock);
+		list_for_each_entry(dev_priv, &shrink_list, mm.shrink_list) {
+			struct drm_device *dev = dev_priv->dev;
+
+			if (mutex_trylock(&dev->struct_mutex)) {
+				list_for_each_entry(obj_priv,
+						    &dev_priv->mm.inactive_list,
+						    list)
+					cnt++;
+				mutex_unlock(&dev->struct_mutex);
+			}
+		}
+		spin_unlock(&shrink_list_lock);
+
+		return (cnt / 100) * sysctl_vfs_cache_pressure;
+	}
+
+	spin_lock(&shrink_list_lock);
+
+	/* first scan for clean buffers */
+	list_for_each_entry_safe(dev_priv, next_dev,
+				 &shrink_list, mm.shrink_list) {
+		struct drm_device *dev = dev_priv->dev;
+
+		if (! mutex_trylock(&dev->struct_mutex))
+			continue;
+
+		spin_unlock(&shrink_list_lock);
+
+		i915_gem_retire_requests(dev);
+
+		list_for_each_entry_safe(obj_priv, next_obj,
+					 &dev_priv->mm.inactive_list,
+					 list) {
+			if (i915_gem_object_is_purgeable(obj_priv)) {
+				i915_gem_object_unbind(obj_priv->obj);
+				if (--nr_to_scan <= 0)
+					break;
+			}
+		}
+
+		spin_lock(&shrink_list_lock);
+		mutex_unlock(&dev->struct_mutex);
+
+		would_deadlock = 0;
+
+		if (nr_to_scan <= 0)
+			break;
+	}
+
+	/* second pass, evict/count anything still on the inactive list */
+	list_for_each_entry_safe(dev_priv, next_dev,
+				 &shrink_list, mm.shrink_list) {
+		struct drm_device *dev = dev_priv->dev;
+
+		if (! mutex_trylock(&dev->struct_mutex))
+			continue;
+
+		spin_unlock(&shrink_list_lock);
+
+		list_for_each_entry_safe(obj_priv, next_obj,
+					 &dev_priv->mm.inactive_list,
+					 list) {
+			if (nr_to_scan > 0) {
+				i915_gem_object_unbind(obj_priv->obj);
+				nr_to_scan--;
+			} else
+				cnt++;
+		}
+
+		spin_lock(&shrink_list_lock);
+		mutex_unlock(&dev->struct_mutex);
+
+		would_deadlock = 0;
+	}
+
+	spin_unlock(&shrink_list_lock);
+
+	if (would_deadlock)
+		return -1;
+	else if (cnt > 0)
+		return (cnt / 100) * sysctl_vfs_cache_pressure;
+	else
+		return 0;
+}
+
+static struct shrinker shrinker = {
+	.shrink = i915_gem_shrink,
+	.seeks = DEFAULT_SEEKS,
+};
+
+__init void
+i915_gem_shrinker_init(void)
+{
+    register_shrinker(&shrinker);
+}
+
+__exit void
+i915_gem_shrinker_exit(void)
+{
+    unregister_shrinker(&shrinker);
+}
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6c89f2ff249..4dfeec7cdd4 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -31,6 +31,7 @@
 #include "drm.h"
 #include "i915_drm.h"
 #include "i915_drv.h"
+#include "i915_trace.h"
 #include "intel_drv.h"
 
 #define MAX_NOPID ((u32)~0)
@@ -279,7 +280,9 @@ irqreturn_t igdng_irq_handler(struct drm_device *dev)
 		}
 
 		if (gt_iir & GT_USER_INTERRUPT) {
-			dev_priv->mm.irq_gem_seqno = i915_get_gem_seqno(dev);
+			u32 seqno = i915_get_gem_seqno(dev);
+			dev_priv->mm.irq_gem_seqno = seqno;
+			trace_i915_gem_request_complete(dev, seqno);
 			DRM_WAKEUP(&dev_priv->irq_queue);
 		}
 
@@ -302,12 +305,25 @@ static void i915_error_work_func(struct work_struct *work)
 	drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
 						    error_work);
 	struct drm_device *dev = dev_priv->dev;
-	char *event_string = "ERROR=1";
-	char *envp[] = { event_string, NULL };
+	char *error_event[] = { "ERROR=1", NULL };
+	char *reset_event[] = { "RESET=1", NULL };
+	char *reset_done_event[] = { "ERROR=0", NULL };
 
 	DRM_DEBUG("generating error event\n");
-
-	kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, envp);
+	kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event);
+
+	if (atomic_read(&dev_priv->mm.wedged)) {
+		if (IS_I965G(dev)) {
+			DRM_DEBUG("resetting chip\n");
+			kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_event);
+			if (!i965_reset(dev, GDRST_RENDER)) {
+				atomic_set(&dev_priv->mm.wedged, 0);
+				kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_done_event);
+			}
+		} else {
+			printk("reboot required\n");
+		}
+	}
 }
 
 /**
@@ -372,7 +388,7 @@ out:
  * so userspace knows something bad happened (should trigger collection
  * of a ring dump etc.).
  */
-static void i915_handle_error(struct drm_device *dev)
+static void i915_handle_error(struct drm_device *dev, bool wedged)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 eir = I915_READ(EIR);
@@ -482,6 +498,16 @@ static void i915_handle_error(struct drm_device *dev)
 		I915_WRITE(IIR, I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT);
 	}
 
+	if (wedged) {
+		atomic_set(&dev_priv->mm.wedged, 1);
+
+		/*
+		 * Wakeup waiting processes so they don't hang
+		 */
+		printk("i915: Waking up sleeping processes\n");
+		DRM_WAKEUP(&dev_priv->irq_queue);
+	}
+
 	queue_work(dev_priv->wq, &dev_priv->error_work);
 }
 
@@ -527,7 +553,7 @@ irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
 		pipeb_stats = I915_READ(PIPEBSTAT);
 
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev);
+			i915_handle_error(dev, false);
 
 		/*
 		 * Clear the PIPE(A|B)STAT regs before the IIR
@@ -599,8 +625,12 @@ irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
 		}
 
 		if (iir & I915_USER_INTERRUPT) {
-			dev_priv->mm.irq_gem_seqno = i915_get_gem_seqno(dev);
+			u32 seqno = i915_get_gem_seqno(dev);
+			dev_priv->mm.irq_gem_seqno = seqno;
+			trace_i915_gem_request_complete(dev, seqno);
 			DRM_WAKEUP(&dev_priv->irq_queue);
+			dev_priv->hangcheck_count = 0;
+			mod_timer(&dev_priv->hangcheck_timer, jiffies + DRM_I915_HANGCHECK_PERIOD);
 		}
 
 		if (pipea_stats & vblank_status) {
@@ -880,6 +910,52 @@ int i915_vblank_swap(struct drm_device *dev, void *data,
 	return -EINVAL;
 }
 
+struct drm_i915_gem_request *i915_get_tail_request(struct drm_device *dev) {
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	return list_entry(dev_priv->mm.request_list.prev, struct drm_i915_gem_request, list);
+}
+
+/**
+ * This is called when the chip hasn't reported back with completed
+ * batchbuffers in a long time. The first time this is called we simply record
+ * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
+ * again, we assume the chip is wedged and try to fix it.
+ */
+void i915_hangcheck_elapsed(unsigned long data)
+{
+	struct drm_device *dev = (struct drm_device *)data;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	uint32_t acthd;
+       
+	if (!IS_I965G(dev))
+		acthd = I915_READ(ACTHD);
+	else
+		acthd = I915_READ(ACTHD_I965);
+
+	/* If all work is done then ACTHD clearly hasn't advanced. */
+	if (list_empty(&dev_priv->mm.request_list) ||
+		       i915_seqno_passed(i915_get_gem_seqno(dev), i915_get_tail_request(dev)->seqno)) {
+		dev_priv->hangcheck_count = 0;
+		return;
+	}
+
+	if (dev_priv->last_acthd == acthd && dev_priv->hangcheck_count > 0) {
+		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
+		i915_handle_error(dev, true);
+		return;
+	} 
+
+	/* Reset timer case chip hangs without another request being added */
+	mod_timer(&dev_priv->hangcheck_timer, jiffies + DRM_I915_HANGCHECK_PERIOD);
+
+	if (acthd != dev_priv->last_acthd)
+		dev_priv->hangcheck_count = 0;
+	else
+		dev_priv->hangcheck_count++;
+
+	dev_priv->last_acthd = acthd;
+}
+
 /* drm_dma.h hooks
 */
 static void igdng_irq_preinstall(struct drm_device *dev)
diff --git a/drivers/gpu/drm/i915/i915_opregion.c b/drivers/gpu/drm/i915/i915_opregion.c
index e4b4e8898e3..2d5193556d3 100644
--- a/drivers/gpu/drm/i915/i915_opregion.c
+++ b/drivers/gpu/drm/i915/i915_opregion.c
@@ -148,6 +148,7 @@ static u32 asle_set_backlight(struct drm_device *dev, u32 bclp)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct opregion_asle *asle = dev_priv->opregion.asle;
 	u32 blc_pwm_ctl, blc_pwm_ctl2;
+	u32 max_backlight, level, shift;
 
 	if (!(bclp & ASLE_BCLP_VALID))
 		return ASLE_BACKLIGHT_FAIL;
@@ -157,14 +158,25 @@ static u32 asle_set_backlight(struct drm_device *dev, u32 bclp)
 		return ASLE_BACKLIGHT_FAIL;
 
 	blc_pwm_ctl = I915_READ(BLC_PWM_CTL);
-	blc_pwm_ctl &= ~BACKLIGHT_DUTY_CYCLE_MASK;
 	blc_pwm_ctl2 = I915_READ(BLC_PWM_CTL2);
 
-	if (blc_pwm_ctl2 & BLM_COMBINATION_MODE)
+	if (IS_I965G(dev) && (blc_pwm_ctl2 & BLM_COMBINATION_MODE))
 		pci_write_config_dword(dev->pdev, PCI_LBPC, bclp);
-	else
-		I915_WRITE(BLC_PWM_CTL, blc_pwm_ctl | ((bclp * 0x101)-1));
-
+	else {
+		if (IS_IGD(dev)) {
+			blc_pwm_ctl &= ~(BACKLIGHT_DUTY_CYCLE_MASK - 1);
+			max_backlight = (blc_pwm_ctl & BACKLIGHT_MODULATION_FREQ_MASK) >> 
+					BACKLIGHT_MODULATION_FREQ_SHIFT;
+			shift = BACKLIGHT_DUTY_CYCLE_SHIFT + 1;
+		} else {
+			blc_pwm_ctl &= ~BACKLIGHT_DUTY_CYCLE_MASK;
+			max_backlight = ((blc_pwm_ctl & BACKLIGHT_MODULATION_FREQ_MASK) >> 
+					BACKLIGHT_MODULATION_FREQ_SHIFT) * 2;
+			shift = BACKLIGHT_DUTY_CYCLE_SHIFT;
+		}
+		level = (bclp * max_backlight) / 255;
+		I915_WRITE(BLC_PWM_CTL, blc_pwm_ctl | (level << shift));
+	}
 	asle->cblv = (bclp*0x64)/0xff | ASLE_CBLV_VALID;
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 3f796355346..0466ddbeba3 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -86,6 +86,10 @@
 #define   I915_GC_RENDER_CLOCK_200_MHZ	(1 << 0)
 #define   I915_GC_RENDER_CLOCK_333_MHZ	(4 << 0)
 #define LBB	0xf4
+#define GDRST 0xc0
+#define  GDRST_FULL	(0<<2)
+#define  GDRST_RENDER	(1<<2)
+#define  GDRST_MEDIA	(3<<2)
 
 /* VGA stuff */
 
@@ -344,9 +348,37 @@
 #define   FBC_CTL_PLANEA	(0<<0)
 #define   FBC_CTL_PLANEB	(1<<0)
 #define FBC_FENCE_OFF		0x0321b
+#define FBC_TAG			0x03300
 
 #define FBC_LL_SIZE		(1536)
 
+/* Framebuffer compression for GM45+ */
+#define DPFC_CB_BASE		0x3200
+#define DPFC_CONTROL		0x3208
+#define   DPFC_CTL_EN		(1<<31)
+#define   DPFC_CTL_PLANEA	(0<<30)
+#define   DPFC_CTL_PLANEB	(1<<30)
+#define   DPFC_CTL_FENCE_EN	(1<<29)
+#define   DPFC_SR_EN		(1<<10)
+#define   DPFC_CTL_LIMIT_1X	(0<<6)
+#define   DPFC_CTL_LIMIT_2X	(1<<6)
+#define   DPFC_CTL_LIMIT_4X	(2<<6)
+#define DPFC_RECOMP_CTL		0x320c
+#define   DPFC_RECOMP_STALL_EN	(1<<27)
+#define   DPFC_RECOMP_STALL_WM_SHIFT (16)
+#define   DPFC_RECOMP_STALL_WM_MASK (0x07ff0000)
+#define   DPFC_RECOMP_TIMER_COUNT_SHIFT (0)
+#define   DPFC_RECOMP_TIMER_COUNT_MASK (0x0000003f)
+#define DPFC_STATUS		0x3210
+#define   DPFC_INVAL_SEG_SHIFT  (16)
+#define   DPFC_INVAL_SEG_MASK	(0x07ff0000)
+#define   DPFC_COMP_SEG_SHIFT	(0)
+#define   DPFC_COMP_SEG_MASK	(0x000003ff)
+#define DPFC_STATUS2		0x3214
+#define DPFC_FENCE_YOFF		0x3218
+#define DPFC_CHICKEN		0x3224
+#define   DPFC_HT_MODIFY	(1<<31)
+
 /*
  * GPIO regs
  */
@@ -2000,6 +2032,8 @@
 #define  PF_ENABLE              (1<<31)
 #define PFA_WIN_SZ		0x68074
 #define PFB_WIN_SZ		0x68874
+#define PFA_WIN_POS		0x68070
+#define PFB_WIN_POS		0x68870
 
 /* legacy palette */
 #define LGC_PALETTE_A           0x4a000
diff --git a/drivers/gpu/drm/i915/i915_suspend.c b/drivers/gpu/drm/i915/i915_suspend.c
index 20d4d19f556..bd6d8d91ca9 100644
--- a/drivers/gpu/drm/i915/i915_suspend.c
+++ b/drivers/gpu/drm/i915/i915_suspend.c
@@ -228,6 +228,7 @@ static void i915_save_modeset_reg(struct drm_device *dev)
 
 	if (drm_core_check_feature(dev, DRIVER_MODESET))
 		return;
+
 	/* Pipe & plane A info */
 	dev_priv->savePIPEACONF = I915_READ(PIPEACONF);
 	dev_priv->savePIPEASRC = I915_READ(PIPEASRC);
@@ -285,6 +286,7 @@ static void i915_save_modeset_reg(struct drm_device *dev)
 	dev_priv->savePIPEBSTAT = I915_READ(PIPEBSTAT);
 	return;
 }
+
 static void i915_restore_modeset_reg(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -379,19 +381,10 @@ static void i915_restore_modeset_reg(struct drm_device *dev)
 
 	return;
 }
-int i915_save_state(struct drm_device *dev)
+
+void i915_save_display(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	int i;
-
-	pci_read_config_byte(dev->pdev, LBB, &dev_priv->saveLBB);
-
-	/* Render Standby */
-	if (IS_I965G(dev) && IS_MOBILE(dev))
-		dev_priv->saveRENDERSTANDBY = I915_READ(MCHBAR_RENDER_STANDBY);
-
-	/* Hardware status page */
-	dev_priv->saveHWS = I915_READ(HWS_PGA);
 
 	/* Display arbitration control */
 	dev_priv->saveDSPARB = I915_READ(DSPARB);
@@ -399,6 +392,7 @@ int i915_save_state(struct drm_device *dev)
 	/* This is only meaningful in non-KMS mode */
 	/* Don't save them in KMS mode */
 	i915_save_modeset_reg(dev);
+
 	/* Cursor state */
 	dev_priv->saveCURACNTR = I915_READ(CURACNTR);
 	dev_priv->saveCURAPOS = I915_READ(CURAPOS);
@@ -448,81 +442,22 @@ int i915_save_state(struct drm_device *dev)
 	dev_priv->saveFBC_CONTROL2 = I915_READ(FBC_CONTROL2);
 	dev_priv->saveFBC_CONTROL = I915_READ(FBC_CONTROL);
 
-	/* Interrupt state */
-	dev_priv->saveIIR = I915_READ(IIR);
-	dev_priv->saveIER = I915_READ(IER);
-	dev_priv->saveIMR = I915_READ(IMR);
-
 	/* VGA state */
 	dev_priv->saveVGA0 = I915_READ(VGA0);
 	dev_priv->saveVGA1 = I915_READ(VGA1);
 	dev_priv->saveVGA_PD = I915_READ(VGA_PD);
 	dev_priv->saveVGACNTRL = I915_READ(VGACNTRL);
 
-	/* Clock gating state */
-	dev_priv->saveD_STATE = I915_READ(D_STATE);
-	dev_priv->saveDSPCLK_GATE_D = I915_READ(DSPCLK_GATE_D);
-
-	/* Cache mode state */
-	dev_priv->saveCACHE_MODE_0 = I915_READ(CACHE_MODE_0);
-
-	/* Memory Arbitration state */
-	dev_priv->saveMI_ARB_STATE = I915_READ(MI_ARB_STATE);
-
-	/* Scratch space */
-	for (i = 0; i < 16; i++) {
-		dev_priv->saveSWF0[i] = I915_READ(SWF00 + (i << 2));
-		dev_priv->saveSWF1[i] = I915_READ(SWF10 + (i << 2));
-	}
-	for (i = 0; i < 3; i++)
-		dev_priv->saveSWF2[i] = I915_READ(SWF30 + (i << 2));
-
-	/* Fences */
-	if (IS_I965G(dev)) {
-		for (i = 0; i < 16; i++)
-			dev_priv->saveFENCE[i] = I915_READ64(FENCE_REG_965_0 + (i * 8));
-	} else {
-		for (i = 0; i < 8; i++)
-			dev_priv->saveFENCE[i] = I915_READ(FENCE_REG_830_0 + (i * 4));
-
-		if (IS_I945G(dev) || IS_I945GM(dev) || IS_G33(dev))
-			for (i = 0; i < 8; i++)
-				dev_priv->saveFENCE[i+8] = I915_READ(FENCE_REG_945_8 + (i * 4));
-	}
 	i915_save_vga(dev);
-
-	return 0;
 }
 
-int i915_restore_state(struct drm_device *dev)
+void i915_restore_display(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	int i;
-
-	pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB);
-
-	/* Render Standby */
-	if (IS_I965G(dev) && IS_MOBILE(dev))
-		I915_WRITE(MCHBAR_RENDER_STANDBY, dev_priv->saveRENDERSTANDBY);
-
-	/* Hardware status page */
-	I915_WRITE(HWS_PGA, dev_priv->saveHWS);
 
 	/* Display arbitration */
 	I915_WRITE(DSPARB, dev_priv->saveDSPARB);
 
-	/* Fences */
-	if (IS_I965G(dev)) {
-		for (i = 0; i < 16; i++)
-			I915_WRITE64(FENCE_REG_965_0 + (i * 8), dev_priv->saveFENCE[i]);
-	} else {
-		for (i = 0; i < 8; i++)
-			I915_WRITE(FENCE_REG_830_0 + (i * 4), dev_priv->saveFENCE[i]);
-		if (IS_I945G(dev) || IS_I945GM(dev) || IS_G33(dev))
-			for (i = 0; i < 8; i++)
-				I915_WRITE(FENCE_REG_945_8 + (i * 4), dev_priv->saveFENCE[i+8]);
-	}
-	
 	/* Display port ratios (must be done before clock is set) */
 	if (SUPPORTS_INTEGRATED_DP(dev)) {
 		I915_WRITE(PIPEA_GMCH_DATA_M, dev_priv->savePIPEA_GMCH_DATA_M);
@@ -534,9 +469,11 @@ int i915_restore_state(struct drm_device *dev)
 		I915_WRITE(PIPEA_DP_LINK_N, dev_priv->savePIPEA_DP_LINK_N);
 		I915_WRITE(PIPEB_DP_LINK_N, dev_priv->savePIPEB_DP_LINK_N);
 	}
+
 	/* This is only meaningful in non-KMS mode */
 	/* Don't restore them in KMS mode */
 	i915_restore_modeset_reg(dev);
+
 	/* Cursor state */
 	I915_WRITE(CURAPOS, dev_priv->saveCURAPOS);
 	I915_WRITE(CURACNTR, dev_priv->saveCURACNTR);
@@ -586,6 +523,95 @@ int i915_restore_state(struct drm_device *dev)
 	I915_WRITE(VGA_PD, dev_priv->saveVGA_PD);
 	DRM_UDELAY(150);
 
+	i915_restore_vga(dev);
+}
+
+int i915_save_state(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int i;
+
+	pci_read_config_byte(dev->pdev, LBB, &dev_priv->saveLBB);
+
+	/* Render Standby */
+	if (IS_I965G(dev) && IS_MOBILE(dev))
+		dev_priv->saveRENDERSTANDBY = I915_READ(MCHBAR_RENDER_STANDBY);
+
+	/* Hardware status page */
+	dev_priv->saveHWS = I915_READ(HWS_PGA);
+
+	i915_save_display(dev);
+
+	/* Interrupt state */
+	dev_priv->saveIER = I915_READ(IER);
+	dev_priv->saveIMR = I915_READ(IMR);
+
+	/* Clock gating state */
+	dev_priv->saveD_STATE = I915_READ(D_STATE);
+	dev_priv->saveDSPCLK_GATE_D = I915_READ(DSPCLK_GATE_D); /* Not sure about this */
+
+	/* Cache mode state */
+	dev_priv->saveCACHE_MODE_0 = I915_READ(CACHE_MODE_0);
+
+	/* Memory Arbitration state */
+	dev_priv->saveMI_ARB_STATE = I915_READ(MI_ARB_STATE);
+
+	/* Scratch space */
+	for (i = 0; i < 16; i++) {
+		dev_priv->saveSWF0[i] = I915_READ(SWF00 + (i << 2));
+		dev_priv->saveSWF1[i] = I915_READ(SWF10 + (i << 2));
+	}
+	for (i = 0; i < 3; i++)
+		dev_priv->saveSWF2[i] = I915_READ(SWF30 + (i << 2));
+
+	/* Fences */
+	if (IS_I965G(dev)) {
+		for (i = 0; i < 16; i++)
+			dev_priv->saveFENCE[i] = I915_READ64(FENCE_REG_965_0 + (i * 8));
+	} else {
+		for (i = 0; i < 8; i++)
+			dev_priv->saveFENCE[i] = I915_READ(FENCE_REG_830_0 + (i * 4));
+
+		if (IS_I945G(dev) || IS_I945GM(dev) || IS_G33(dev))
+			for (i = 0; i < 8; i++)
+				dev_priv->saveFENCE[i+8] = I915_READ(FENCE_REG_945_8 + (i * 4));
+	}
+
+	return 0;
+}
+
+int i915_restore_state(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int i;
+
+	pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB);
+
+	/* Render Standby */
+	if (IS_I965G(dev) && IS_MOBILE(dev))
+		I915_WRITE(MCHBAR_RENDER_STANDBY, dev_priv->saveRENDERSTANDBY);
+
+	/* Hardware status page */
+	I915_WRITE(HWS_PGA, dev_priv->saveHWS);
+
+	/* Fences */
+	if (IS_I965G(dev)) {
+		for (i = 0; i < 16; i++)
+			I915_WRITE64(FENCE_REG_965_0 + (i * 8), dev_priv->saveFENCE[i]);
+	} else {
+		for (i = 0; i < 8; i++)
+			I915_WRITE(FENCE_REG_830_0 + (i * 4), dev_priv->saveFENCE[i]);
+		if (IS_I945G(dev) || IS_I945GM(dev) || IS_G33(dev))
+			for (i = 0; i < 8; i++)
+				I915_WRITE(FENCE_REG_945_8 + (i * 4), dev_priv->saveFENCE[i+8]);
+	}
+
+	i915_restore_display(dev);
+
+	/* Interrupt state */
+	I915_WRITE (IER, dev_priv->saveIER);
+	I915_WRITE (IMR,  dev_priv->saveIMR);
+
 	/* Clock gating state */
 	I915_WRITE (D_STATE, dev_priv->saveD_STATE);
 	I915_WRITE (DSPCLK_GATE_D, dev_priv->saveDSPCLK_GATE_D);
@@ -603,8 +629,6 @@ int i915_restore_state(struct drm_device *dev)
 	for (i = 0; i < 3; i++)
 		I915_WRITE(SWF30 + (i << 2), dev_priv->saveSWF2[i]);
 
-	i915_restore_vga(dev);
-
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
new file mode 100644
index 00000000000..5567a40816f
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -0,0 +1,315 @@
+#if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
+#define _I915_TRACE_H_
+
+#include <linux/stringify.h>
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#include <drm/drmP.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM i915
+#define TRACE_SYSTEM_STRING __stringify(TRACE_SYSTEM)
+#define TRACE_INCLUDE_FILE i915_trace
+
+/* object tracking */
+
+TRACE_EVENT(i915_gem_object_create,
+
+	    TP_PROTO(struct drm_gem_object *obj),
+
+	    TP_ARGS(obj),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_gem_object *, obj)
+			     __field(u32, size)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->obj = obj;
+			   __entry->size = obj->size;
+			   ),
+
+	    TP_printk("obj=%p, size=%u", __entry->obj, __entry->size)
+);
+
+TRACE_EVENT(i915_gem_object_bind,
+
+	    TP_PROTO(struct drm_gem_object *obj, u32 gtt_offset),
+
+	    TP_ARGS(obj, gtt_offset),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_gem_object *, obj)
+			     __field(u32, gtt_offset)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->obj = obj;
+			   __entry->gtt_offset = gtt_offset;
+			   ),
+
+	    TP_printk("obj=%p, gtt_offset=%08x",
+		      __entry->obj, __entry->gtt_offset)
+);
+
+TRACE_EVENT(i915_gem_object_clflush,
+
+	    TP_PROTO(struct drm_gem_object *obj),
+
+	    TP_ARGS(obj),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_gem_object *, obj)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->obj = obj;
+			   ),
+
+	    TP_printk("obj=%p", __entry->obj)
+);
+
+TRACE_EVENT(i915_gem_object_change_domain,
+
+	    TP_PROTO(struct drm_gem_object *obj, uint32_t old_read_domains, uint32_t old_write_domain),
+
+	    TP_ARGS(obj, old_read_domains, old_write_domain),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_gem_object *, obj)
+			     __field(u32, read_domains)
+			     __field(u32, write_domain)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->obj = obj;
+			   __entry->read_domains = obj->read_domains | (old_read_domains << 16);
+			   __entry->write_domain = obj->write_domain | (old_write_domain << 16);
+			   ),
+
+	    TP_printk("obj=%p, read=%04x, write=%04x",
+		      __entry->obj,
+		      __entry->read_domains, __entry->write_domain)
+);
+
+TRACE_EVENT(i915_gem_object_get_fence,
+
+	    TP_PROTO(struct drm_gem_object *obj, int fence, int tiling_mode),
+
+	    TP_ARGS(obj, fence, tiling_mode),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_gem_object *, obj)
+			     __field(int, fence)
+			     __field(int, tiling_mode)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->obj = obj;
+			   __entry->fence = fence;
+			   __entry->tiling_mode = tiling_mode;
+			   ),
+
+	    TP_printk("obj=%p, fence=%d, tiling=%d",
+		      __entry->obj, __entry->fence, __entry->tiling_mode)
+);
+
+TRACE_EVENT(i915_gem_object_unbind,
+
+	    TP_PROTO(struct drm_gem_object *obj),
+
+	    TP_ARGS(obj),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_gem_object *, obj)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->obj = obj;
+			   ),
+
+	    TP_printk("obj=%p", __entry->obj)
+);
+
+TRACE_EVENT(i915_gem_object_destroy,
+
+	    TP_PROTO(struct drm_gem_object *obj),
+
+	    TP_ARGS(obj),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_gem_object *, obj)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->obj = obj;
+			   ),
+
+	    TP_printk("obj=%p", __entry->obj)
+);
+
+/* batch tracing */
+
+TRACE_EVENT(i915_gem_request_submit,
+
+	    TP_PROTO(struct drm_device *dev, u32 seqno),
+
+	    TP_ARGS(dev, seqno),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_device *, dev)
+			     __field(u32, seqno)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev;
+			   __entry->seqno = seqno;
+			   ),
+
+	    TP_printk("dev=%p, seqno=%u", __entry->dev, __entry->seqno)
+);
+
+TRACE_EVENT(i915_gem_request_flush,
+
+	    TP_PROTO(struct drm_device *dev, u32 seqno,
+		     u32 flush_domains, u32 invalidate_domains),
+
+	    TP_ARGS(dev, seqno, flush_domains, invalidate_domains),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_device *, dev)
+			     __field(u32, seqno)
+			     __field(u32, flush_domains)
+			     __field(u32, invalidate_domains)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev;
+			   __entry->seqno = seqno;
+			   __entry->flush_domains = flush_domains;
+			   __entry->invalidate_domains = invalidate_domains;
+			   ),
+
+	    TP_printk("dev=%p, seqno=%u, flush=%04x, invalidate=%04x",
+		      __entry->dev, __entry->seqno,
+		      __entry->flush_domains, __entry->invalidate_domains)
+);
+
+
+TRACE_EVENT(i915_gem_request_complete,
+
+	    TP_PROTO(struct drm_device *dev, u32 seqno),
+
+	    TP_ARGS(dev, seqno),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_device *, dev)
+			     __field(u32, seqno)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev;
+			   __entry->seqno = seqno;
+			   ),
+
+	    TP_printk("dev=%p, seqno=%u", __entry->dev, __entry->seqno)
+);
+
+TRACE_EVENT(i915_gem_request_retire,
+
+	    TP_PROTO(struct drm_device *dev, u32 seqno),
+
+	    TP_ARGS(dev, seqno),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_device *, dev)
+			     __field(u32, seqno)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev;
+			   __entry->seqno = seqno;
+			   ),
+
+	    TP_printk("dev=%p, seqno=%u", __entry->dev, __entry->seqno)
+);
+
+TRACE_EVENT(i915_gem_request_wait_begin,
+
+	    TP_PROTO(struct drm_device *dev, u32 seqno),
+
+	    TP_ARGS(dev, seqno),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_device *, dev)
+			     __field(u32, seqno)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev;
+			   __entry->seqno = seqno;
+			   ),
+
+	    TP_printk("dev=%p, seqno=%u", __entry->dev, __entry->seqno)
+);
+
+TRACE_EVENT(i915_gem_request_wait_end,
+
+	    TP_PROTO(struct drm_device *dev, u32 seqno),
+
+	    TP_ARGS(dev, seqno),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_device *, dev)
+			     __field(u32, seqno)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev;
+			   __entry->seqno = seqno;
+			   ),
+
+	    TP_printk("dev=%p, seqno=%u", __entry->dev, __entry->seqno)
+);
+
+TRACE_EVENT(i915_ring_wait_begin,
+
+	    TP_PROTO(struct drm_device *dev),
+
+	    TP_ARGS(dev),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_device *, dev)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev;
+			   ),
+
+	    TP_printk("dev=%p", __entry->dev)
+);
+
+TRACE_EVENT(i915_ring_wait_end,
+
+	    TP_PROTO(struct drm_device *dev),
+
+	    TP_ARGS(dev),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_device *, dev)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev;
+			   ),
+
+	    TP_printk("dev=%p", __entry->dev)
+);
+
+#endif /* _I915_TRACE_H_ */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../drivers/gpu/drm/i915
+#include <trace/define_trace.h>
diff --git a/drivers/gpu/drm/i915/i915_trace_points.c b/drivers/gpu/drm/i915/i915_trace_points.c
new file mode 100644
index 00000000000..ead876eb6ea
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_trace_points.c
@@ -0,0 +1,11 @@
+/*
+ * Copyright © 2009 Intel Corporation
+ *
+ * Authors:
+ *    Chris Wilson <chris@chris-wilson.co.uk>
+ */
+
+#include "i915_drv.h"
+
+#define CREATE_TRACE_POINTS
+#include "i915_trace.h"
diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c
index 1e28c1652fd..4337414846b 100644
--- a/drivers/gpu/drm/i915/intel_bios.c
+++ b/drivers/gpu/drm/i915/intel_bios.c
@@ -217,6 +217,9 @@ parse_general_features(struct drm_i915_private *dev_priv,
 			if (IS_I85X(dev_priv->dev))
 				dev_priv->lvds_ssc_freq =
 					general->ssc_freq ? 66 : 48;
+			else if (IS_IGDNG(dev_priv->dev))
+				dev_priv->lvds_ssc_freq =
+					general->ssc_freq ? 100 : 120;
 			else
 				dev_priv->lvds_ssc_freq =
 					general->ssc_freq ? 100 : 96;
diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c
index 88814fa2dfd..212e22740fc 100644
--- a/drivers/gpu/drm/i915/intel_crt.c
+++ b/drivers/gpu/drm/i915/intel_crt.c
@@ -179,13 +179,10 @@ static bool intel_igdng_crt_detect_hotplug(struct drm_connector *connector)
 {
 	struct drm_device *dev = connector->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 adpa, temp;
+	u32 adpa;
 	bool ret;
 
-	temp = adpa = I915_READ(PCH_ADPA);
-
-	adpa &= ~ADPA_DAC_ENABLE;
-	I915_WRITE(PCH_ADPA, adpa);
+	adpa = I915_READ(PCH_ADPA);
 
 	adpa &= ~ADPA_CRT_HOTPLUG_MASK;
 
@@ -212,8 +209,6 @@ static bool intel_igdng_crt_detect_hotplug(struct drm_connector *connector)
 	else
 		ret = false;
 
-	/* restore origin register */
-	I915_WRITE(PCH_ADPA, temp);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 0227b165290..93ff6c03733 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -24,6 +24,8 @@
  *	Eric Anholt <eric@anholt.net>
  */
 
+#include <linux/module.h>
+#include <linux/input.h>
 #include <linux/i2c.h>
 #include <linux/kernel.h>
 #include "drmP.h"
@@ -875,7 +877,7 @@ intel_igdng_find_best_PLL(const intel_limit_t *limit, struct drm_crtc *crtc,
 					       refclk, best_clock);
 
 	if (intel_pipe_has_type(crtc, INTEL_OUTPUT_LVDS)) {
-		if ((I915_READ(LVDS) & LVDS_CLKB_POWER_MASK) ==
+		if ((I915_READ(PCH_LVDS) & LVDS_CLKB_POWER_MASK) ==
 		    LVDS_CLKB_POWER_UP)
 			clock.p2 = limit->p2.p2_fast;
 		else
@@ -952,6 +954,241 @@ intel_wait_for_vblank(struct drm_device *dev)
 	mdelay(20);
 }
 
+/* Parameters have changed, update FBC info */
+static void i8xx_enable_fbc(struct drm_crtc *crtc, unsigned long interval)
+{
+	struct drm_device *dev = crtc->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_framebuffer *fb = crtc->fb;
+	struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb);
+	struct drm_i915_gem_object *obj_priv = intel_fb->obj->driver_private;
+	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	int plane, i;
+	u32 fbc_ctl, fbc_ctl2;
+
+	dev_priv->cfb_pitch = dev_priv->cfb_size / FBC_LL_SIZE;
+
+	if (fb->pitch < dev_priv->cfb_pitch)
+		dev_priv->cfb_pitch = fb->pitch;
+
+	/* FBC_CTL wants 64B units */
+	dev_priv->cfb_pitch = (dev_priv->cfb_pitch / 64) - 1;
+	dev_priv->cfb_fence = obj_priv->fence_reg;
+	dev_priv->cfb_plane = intel_crtc->plane;
+	plane = dev_priv->cfb_plane == 0 ? FBC_CTL_PLANEA : FBC_CTL_PLANEB;
+
+	/* Clear old tags */
+	for (i = 0; i < (FBC_LL_SIZE / 32) + 1; i++)
+		I915_WRITE(FBC_TAG + (i * 4), 0);
+
+	/* Set it up... */
+	fbc_ctl2 = FBC_CTL_FENCE_DBL | FBC_CTL_IDLE_IMM | plane;
+	if (obj_priv->tiling_mode != I915_TILING_NONE)
+		fbc_ctl2 |= FBC_CTL_CPU_FENCE;
+	I915_WRITE(FBC_CONTROL2, fbc_ctl2);
+	I915_WRITE(FBC_FENCE_OFF, crtc->y);
+
+	/* enable it... */
+	fbc_ctl = FBC_CTL_EN | FBC_CTL_PERIODIC;
+	fbc_ctl |= (dev_priv->cfb_pitch & 0xff) << FBC_CTL_STRIDE_SHIFT;
+	fbc_ctl |= (interval & 0x2fff) << FBC_CTL_INTERVAL_SHIFT;
+	if (obj_priv->tiling_mode != I915_TILING_NONE)
+		fbc_ctl |= dev_priv->cfb_fence;
+	I915_WRITE(FBC_CONTROL, fbc_ctl);
+
+	DRM_DEBUG("enabled FBC, pitch %ld, yoff %d, plane %d, ",
+		  dev_priv->cfb_pitch, crtc->y, dev_priv->cfb_plane);
+}
+
+void i8xx_disable_fbc(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u32 fbc_ctl;
+
+	if (!I915_HAS_FBC(dev))
+		return;
+
+	/* Disable compression */
+	fbc_ctl = I915_READ(FBC_CONTROL);
+	fbc_ctl &= ~FBC_CTL_EN;
+	I915_WRITE(FBC_CONTROL, fbc_ctl);
+
+	/* Wait for compressing bit to clear */
+	while (I915_READ(FBC_STATUS) & FBC_STAT_COMPRESSING)
+		; /* nothing */
+
+	intel_wait_for_vblank(dev);
+
+	DRM_DEBUG("disabled FBC\n");
+}
+
+static bool i8xx_fbc_enabled(struct drm_crtc *crtc)
+{
+	struct drm_device *dev = crtc->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	return I915_READ(FBC_CONTROL) & FBC_CTL_EN;
+}
+
+static void g4x_enable_fbc(struct drm_crtc *crtc, unsigned long interval)
+{
+	struct drm_device *dev = crtc->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_framebuffer *fb = crtc->fb;
+	struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb);
+	struct drm_i915_gem_object *obj_priv = intel_fb->obj->driver_private;
+	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	int plane = (intel_crtc->plane == 0 ? DPFC_CTL_PLANEA :
+		     DPFC_CTL_PLANEB);
+	unsigned long stall_watermark = 200;
+	u32 dpfc_ctl;
+
+	dev_priv->cfb_pitch = (dev_priv->cfb_pitch / 64) - 1;
+	dev_priv->cfb_fence = obj_priv->fence_reg;
+	dev_priv->cfb_plane = intel_crtc->plane;
+
+	dpfc_ctl = plane | DPFC_SR_EN | DPFC_CTL_LIMIT_1X;
+	if (obj_priv->tiling_mode != I915_TILING_NONE) {
+		dpfc_ctl |= DPFC_CTL_FENCE_EN | dev_priv->cfb_fence;
+		I915_WRITE(DPFC_CHICKEN, DPFC_HT_MODIFY);
+	} else {
+		I915_WRITE(DPFC_CHICKEN, ~DPFC_HT_MODIFY);
+	}
+
+	I915_WRITE(DPFC_CONTROL, dpfc_ctl);
+	I915_WRITE(DPFC_RECOMP_CTL, DPFC_RECOMP_STALL_EN |
+		   (stall_watermark << DPFC_RECOMP_STALL_WM_SHIFT) |
+		   (interval << DPFC_RECOMP_TIMER_COUNT_SHIFT));
+	I915_WRITE(DPFC_FENCE_YOFF, crtc->y);
+
+	/* enable it... */
+	I915_WRITE(DPFC_CONTROL, I915_READ(DPFC_CONTROL) | DPFC_CTL_EN);
+
+	DRM_DEBUG("enabled fbc on plane %d\n", intel_crtc->plane);
+}
+
+void g4x_disable_fbc(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u32 dpfc_ctl;
+
+	/* Disable compression */
+	dpfc_ctl = I915_READ(DPFC_CONTROL);
+	dpfc_ctl &= ~DPFC_CTL_EN;
+	I915_WRITE(DPFC_CONTROL, dpfc_ctl);
+	intel_wait_for_vblank(dev);
+
+	DRM_DEBUG("disabled FBC\n");
+}
+
+static bool g4x_fbc_enabled(struct drm_crtc *crtc)
+{
+	struct drm_device *dev = crtc->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	return I915_READ(DPFC_CONTROL) & DPFC_CTL_EN;
+}
+
+/**
+ * intel_update_fbc - enable/disable FBC as needed
+ * @crtc: CRTC to point the compressor at
+ * @mode: mode in use
+ *
+ * Set up the framebuffer compression hardware at mode set time.  We
+ * enable it if possible:
+ *   - plane A only (on pre-965)
+ *   - no pixel mulitply/line duplication
+ *   - no alpha buffer discard
+ *   - no dual wide
+ *   - framebuffer <= 2048 in width, 1536 in height
+ *
+ * We can't assume that any compression will take place (worst case),
+ * so the compressed buffer has to be the same size as the uncompressed
+ * one.  It also must reside (along with the line length buffer) in
+ * stolen memory.
+ *
+ * We need to enable/disable FBC on a global basis.
+ */
+static void intel_update_fbc(struct drm_crtc *crtc,
+			     struct drm_display_mode *mode)
+{
+	struct drm_device *dev = crtc->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_framebuffer *fb = crtc->fb;
+	struct intel_framebuffer *intel_fb;
+	struct drm_i915_gem_object *obj_priv;
+	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	int plane = intel_crtc->plane;
+
+	if (!i915_powersave)
+		return;
+
+	if (!dev_priv->display.fbc_enabled ||
+	    !dev_priv->display.enable_fbc ||
+	    !dev_priv->display.disable_fbc)
+		return;
+
+	if (!crtc->fb)
+		return;
+
+	intel_fb = to_intel_framebuffer(fb);
+	obj_priv = intel_fb->obj->driver_private;
+
+	/*
+	 * If FBC is already on, we just have to verify that we can
+	 * keep it that way...
+	 * Need to disable if:
+	 *   - changing FBC params (stride, fence, mode)
+	 *   - new fb is too large to fit in compressed buffer
+	 *   - going to an unsupported config (interlace, pixel multiply, etc.)
+	 */
+	if (intel_fb->obj->size > dev_priv->cfb_size) {
+		DRM_DEBUG("framebuffer too large, disabling compression\n");
+		goto out_disable;
+	}
+	if ((mode->flags & DRM_MODE_FLAG_INTERLACE) ||
+	    (mode->flags & DRM_MODE_FLAG_DBLSCAN)) {
+		DRM_DEBUG("mode incompatible with compression, disabling\n");
+		goto out_disable;
+	}
+	if ((mode->hdisplay > 2048) ||
+	    (mode->vdisplay > 1536)) {
+		DRM_DEBUG("mode too large for compression, disabling\n");
+		goto out_disable;
+	}
+	if ((IS_I915GM(dev) || IS_I945GM(dev)) && plane != 0) {
+		DRM_DEBUG("plane not 0, disabling compression\n");
+		goto out_disable;
+	}
+	if (obj_priv->tiling_mode != I915_TILING_X) {
+		DRM_DEBUG("framebuffer not tiled, disabling compression\n");
+		goto out_disable;
+	}
+
+	if (dev_priv->display.fbc_enabled(crtc)) {
+		/* We can re-enable it in this case, but need to update pitch */
+		if (fb->pitch > dev_priv->cfb_pitch)
+			dev_priv->display.disable_fbc(dev);
+		if (obj_priv->fence_reg != dev_priv->cfb_fence)
+			dev_priv->display.disable_fbc(dev);
+		if (plane != dev_priv->cfb_plane)
+			dev_priv->display.disable_fbc(dev);
+	}
+
+	if (!dev_priv->display.fbc_enabled(crtc)) {
+		/* Now try to turn it back on if possible */
+		dev_priv->display.enable_fbc(crtc, 500);
+	}
+
+	return;
+
+out_disable:
+	DRM_DEBUG("unsupported config, disabling FBC\n");
+	/* Multiple disables should be harmless */
+	if (dev_priv->display.fbc_enabled(crtc))
+		dev_priv->display.disable_fbc(dev);
+}
+
 static int
 intel_pipe_set_base(struct drm_crtc *crtc, int x, int y,
 		    struct drm_framebuffer *old_fb)
@@ -964,12 +1201,13 @@ intel_pipe_set_base(struct drm_crtc *crtc, int x, int y,
 	struct drm_i915_gem_object *obj_priv;
 	struct drm_gem_object *obj;
 	int pipe = intel_crtc->pipe;
+	int plane = intel_crtc->plane;
 	unsigned long Start, Offset;
-	int dspbase = (pipe == 0 ? DSPAADDR : DSPBADDR);
-	int dspsurf = (pipe == 0 ? DSPASURF : DSPBSURF);
-	int dspstride = (pipe == 0) ? DSPASTRIDE : DSPBSTRIDE;
-	int dsptileoff = (pipe == 0 ? DSPATILEOFF : DSPBTILEOFF);
-	int dspcntr_reg = (pipe == 0) ? DSPACNTR : DSPBCNTR;
+	int dspbase = (plane == 0 ? DSPAADDR : DSPBADDR);
+	int dspsurf = (plane == 0 ? DSPASURF : DSPBSURF);
+	int dspstride = (plane == 0) ? DSPASTRIDE : DSPBSTRIDE;
+	int dsptileoff = (plane == 0 ? DSPATILEOFF : DSPBTILEOFF);
+	int dspcntr_reg = (plane == 0) ? DSPACNTR : DSPBCNTR;
 	u32 dspcntr, alignment;
 	int ret;
 
@@ -979,12 +1217,12 @@ intel_pipe_set_base(struct drm_crtc *crtc, int x, int y,
 		return 0;
 	}
 
-	switch (pipe) {
+	switch (plane) {
 	case 0:
 	case 1:
 		break;
 	default:
-		DRM_ERROR("Can't update pipe %d in SAREA\n", pipe);
+		DRM_ERROR("Can't update plane %d in SAREA\n", plane);
 		return -EINVAL;
 	}
 
@@ -1086,6 +1324,9 @@ intel_pipe_set_base(struct drm_crtc *crtc, int x, int y,
 		I915_READ(dspbase);
 	}
 
+	if ((IS_I965G(dev) || plane == 0))
+		intel_update_fbc(crtc, &crtc->mode);
+
 	intel_wait_for_vblank(dev);
 
 	if (old_fb) {
@@ -1217,6 +1458,7 @@ static void igdng_crtc_dpms(struct drm_crtc *crtc, int mode)
 	int transconf_reg = (pipe == 0) ? TRANSACONF : TRANSBCONF;
 	int pf_ctl_reg = (pipe == 0) ? PFA_CTL_1 : PFB_CTL_1;
 	int pf_win_size = (pipe == 0) ? PFA_WIN_SZ : PFB_WIN_SZ;
+	int pf_win_pos = (pipe == 0) ? PFA_WIN_POS : PFB_WIN_POS;
 	int cpu_htot_reg = (pipe == 0) ? HTOTAL_A : HTOTAL_B;
 	int cpu_hblank_reg = (pipe == 0) ? HBLANK_A : HBLANK_B;
 	int cpu_hsync_reg = (pipe == 0) ? HSYNC_A : HSYNC_B;
@@ -1268,6 +1510,19 @@ static void igdng_crtc_dpms(struct drm_crtc *crtc, int mode)
 			}
 		}
 
+		/* Enable panel fitting for LVDS */
+		if (intel_pipe_has_type(crtc, INTEL_OUTPUT_LVDS)) {
+			temp = I915_READ(pf_ctl_reg);
+			I915_WRITE(pf_ctl_reg, temp | PF_ENABLE);
+
+			/* currently full aspect */
+			I915_WRITE(pf_win_pos, 0);
+
+			I915_WRITE(pf_win_size,
+				   (dev_priv->panel_fixed_mode->hdisplay << 16) |
+				   (dev_priv->panel_fixed_mode->vdisplay));
+		}
+
 		/* Enable CPU pipe */
 		temp = I915_READ(pipeconf_reg);
 		if ((temp & PIPEACONF_ENABLE) == 0) {
@@ -1532,9 +1787,10 @@ static void i9xx_crtc_dpms(struct drm_crtc *crtc, int mode)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
 	int pipe = intel_crtc->pipe;
+	int plane = intel_crtc->plane;
 	int dpll_reg = (pipe == 0) ? DPLL_A : DPLL_B;
-	int dspcntr_reg = (pipe == 0) ? DSPACNTR : DSPBCNTR;
-	int dspbase_reg = (pipe == 0) ? DSPAADDR : DSPBADDR;
+	int dspcntr_reg = (plane == 0) ? DSPACNTR : DSPBCNTR;
+	int dspbase_reg = (plane == 0) ? DSPAADDR : DSPBADDR;
 	int pipeconf_reg = (pipe == 0) ? PIPEACONF : PIPEBCONF;
 	u32 temp;
 
@@ -1577,6 +1833,9 @@ static void i9xx_crtc_dpms(struct drm_crtc *crtc, int mode)
 
 		intel_crtc_load_lut(crtc);
 
+		if ((IS_I965G(dev) || plane == 0))
+			intel_update_fbc(crtc, &crtc->mode);
+
 		/* Give the overlay scaler a chance to enable if it's on this pipe */
 		//intel_crtc_dpms_video(crtc, true); TODO
 		intel_update_watermarks(dev);
@@ -1586,6 +1845,10 @@ static void i9xx_crtc_dpms(struct drm_crtc *crtc, int mode)
 		/* Give the overlay scaler a chance to disable if it's on this pipe */
 		//intel_crtc_dpms_video(crtc, FALSE); TODO
 
+		if (dev_priv->cfb_plane == plane &&
+		    dev_priv->display.disable_fbc)
+			dev_priv->display.disable_fbc(dev);
+
 		/* Disable the VGA plane that we never use */
 		i915_disable_vga(dev);
 
@@ -1634,15 +1897,13 @@ static void i9xx_crtc_dpms(struct drm_crtc *crtc, int mode)
 static void intel_crtc_dpms(struct drm_crtc *crtc, int mode)
 {
 	struct drm_device *dev = crtc->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_master_private *master_priv;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
 	int pipe = intel_crtc->pipe;
 	bool enabled;
 
-	if (IS_IGDNG(dev))
-		igdng_crtc_dpms(crtc, mode);
-	else
-		i9xx_crtc_dpms(crtc, mode);
+	dev_priv->display.dpms(crtc, mode);
 
 	intel_crtc->dpms_mode = mode;
 
@@ -1709,56 +1970,68 @@ static bool intel_crtc_mode_fixup(struct drm_crtc *crtc,
 	return true;
 }
 
+static int i945_get_display_clock_speed(struct drm_device *dev)
+{
+	return 400000;
+}
 
-/** Returns the core display clock speed for i830 - i945 */
-static int intel_get_core_clock_speed(struct drm_device *dev)
+static int i915_get_display_clock_speed(struct drm_device *dev)
 {
+	return 333000;
+}
 
-	/* Core clock values taken from the published datasheets.
-	 * The 830 may go up to 166 Mhz, which we should check.
-	 */
-	if (IS_I945G(dev))
-		return 400000;
-	else if (IS_I915G(dev))
-		return 333000;
-	else if (IS_I945GM(dev) || IS_845G(dev) || IS_IGDGM(dev))
-		return 200000;
-	else if (IS_I915GM(dev)) {
-		u16 gcfgc = 0;
+static int i9xx_misc_get_display_clock_speed(struct drm_device *dev)
+{
+	return 200000;
+}
 
-		pci_read_config_word(dev->pdev, GCFGC, &gcfgc);
+static int i915gm_get_display_clock_speed(struct drm_device *dev)
+{
+	u16 gcfgc = 0;
 
-		if (gcfgc & GC_LOW_FREQUENCY_ENABLE)
-			return 133000;
-		else {
-			switch (gcfgc & GC_DISPLAY_CLOCK_MASK) {
-			case GC_DISPLAY_CLOCK_333_MHZ:
-				return 333000;
-			default:
-			case GC_DISPLAY_CLOCK_190_200_MHZ:
-				return 190000;
-			}
-		}
-	} else if (IS_I865G(dev))
-		return 266000;
-	else if (IS_I855(dev)) {
-		u16 hpllcc = 0;
-		/* Assume that the hardware is in the high speed state.  This
-		 * should be the default.
-		 */
-		switch (hpllcc & GC_CLOCK_CONTROL_MASK) {
-		case GC_CLOCK_133_200:
-		case GC_CLOCK_100_200:
-			return 200000;
-		case GC_CLOCK_166_250:
-			return 250000;
-		case GC_CLOCK_100_133:
-			return 133000;
+	pci_read_config_word(dev->pdev, GCFGC, &gcfgc);
+
+	if (gcfgc & GC_LOW_FREQUENCY_ENABLE)
+		return 133000;
+	else {
+		switch (gcfgc & GC_DISPLAY_CLOCK_MASK) {
+		case GC_DISPLAY_CLOCK_333_MHZ:
+			return 333000;
+		default:
+		case GC_DISPLAY_CLOCK_190_200_MHZ:
+			return 190000;
 		}
-	} else /* 852, 830 */
+	}
+}
+
+static int i865_get_display_clock_speed(struct drm_device *dev)
+{
+	return 266000;
+}
+
+static int i855_get_display_clock_speed(struct drm_device *dev)
+{
+	u16 hpllcc = 0;
+	/* Assume that the hardware is in the high speed state.  This
+	 * should be the default.
+	 */
+	switch (hpllcc & GC_CLOCK_CONTROL_MASK) {
+	case GC_CLOCK_133_200:
+	case GC_CLOCK_100_200:
+		return 200000;
+	case GC_CLOCK_166_250:
+		return 250000;
+	case GC_CLOCK_100_133:
 		return 133000;
+	}
+
+	/* Shouldn't happen */
+	return 0;
+}
 
-	return 0; /* Silence gcc warning */
+static int i830_get_display_clock_speed(struct drm_device *dev)
+{
+	return 133000;
 }
 
 /**
@@ -1921,7 +2194,14 @@ static unsigned long intel_calculate_wm(unsigned long clock_in_khz,
 {
 	long entries_required, wm_size;
 
-	entries_required = (clock_in_khz * pixel_size * latency_ns) / 1000000;
+	/*
+	 * Note: we need to make sure we don't overflow for various clock &
+	 * latency values.
+	 * clocks go from a few thousand to several hundred thousand.
+	 * latency is usually a few thousand
+	 */
+	entries_required = ((clock_in_khz / 1000) * pixel_size * latency_ns) /
+		1000;
 	entries_required /= wm->cacheline_size;
 
 	DRM_DEBUG("FIFO entries required for mode: %d\n", entries_required);
@@ -1986,14 +2266,13 @@ static struct cxsr_latency *intel_get_cxsr_latency(int is_desktop, int fsb,
 	for (i = 0; i < ARRAY_SIZE(cxsr_latency_table); i++) {
 		latency = &cxsr_latency_table[i];
 		if (is_desktop == latency->is_desktop &&
-			fsb == latency->fsb_freq && mem == latency->mem_freq)
-			break;
+		    fsb == latency->fsb_freq && mem == latency->mem_freq)
+			return latency;
 	}
-	if (i >= ARRAY_SIZE(cxsr_latency_table)) {
-		DRM_DEBUG("Unknown FSB/MEM found, disable CxSR\n");
-		return NULL;
-	}
-	return latency;
+
+	DRM_DEBUG("Unknown FSB/MEM found, disable CxSR\n");
+
+	return NULL;
 }
 
 static void igd_disable_cxsr(struct drm_device *dev)
@@ -2084,32 +2363,36 @@ static void igd_enable_cxsr(struct drm_device *dev, unsigned long clock,
  */
 const static int latency_ns = 5000;
 
-static int intel_get_fifo_size(struct drm_device *dev, int plane)
+static int i9xx_get_fifo_size(struct drm_device *dev, int plane)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	uint32_t dsparb = I915_READ(DSPARB);
 	int size;
 
-	if (IS_I9XX(dev)) {
-		if (plane == 0)
-			size = dsparb & 0x7f;
-		else
-			size = ((dsparb >> DSPARB_CSTART_SHIFT) & 0x7f) -
-				(dsparb & 0x7f);
-	} else if (IS_I85X(dev)) {
-		if (plane == 0)
-			size = dsparb & 0x1ff;
-		else
-			size = ((dsparb >> DSPARB_BEND_SHIFT) & 0x1ff) -
-				(dsparb & 0x1ff);
-		size >>= 1; /* Convert to cachelines */
-	} else if (IS_845G(dev)) {
+	if (plane == 0)
 		size = dsparb & 0x7f;
-		size >>= 2; /* Convert to cachelines */
-	} else {
-		size = dsparb & 0x7f;
-		size >>= 1; /* Convert to cachelines */
-	}
+	else
+		size = ((dsparb >> DSPARB_CSTART_SHIFT) & 0x7f) -
+			(dsparb & 0x7f);
+
+	DRM_DEBUG("FIFO size - (0x%08x) %s: %d\n", dsparb, plane ? "B" : "A",
+		  size);
+
+	return size;
+}
+
+static int i85x_get_fifo_size(struct drm_device *dev, int plane)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint32_t dsparb = I915_READ(DSPARB);
+	int size;
+
+	if (plane == 0)
+		size = dsparb & 0x1ff;
+	else
+		size = ((dsparb >> DSPARB_BEND_SHIFT) & 0x1ff) -
+			(dsparb & 0x1ff);
+	size >>= 1; /* Convert to cachelines */
 
 	DRM_DEBUG("FIFO size - (0x%08x) %s: %d\n", dsparb, plane ? "B" : "A",
 		  size);
@@ -2117,7 +2400,38 @@ static int intel_get_fifo_size(struct drm_device *dev, int plane)
 	return size;
 }
 
-static void g4x_update_wm(struct drm_device *dev)
+static int i845_get_fifo_size(struct drm_device *dev, int plane)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint32_t dsparb = I915_READ(DSPARB);
+	int size;
+
+	size = dsparb & 0x7f;
+	size >>= 2; /* Convert to cachelines */
+
+	DRM_DEBUG("FIFO size - (0x%08x) %s: %d\n", dsparb, plane ? "B" : "A",
+		  size);
+
+	return size;
+}
+
+static int i830_get_fifo_size(struct drm_device *dev, int plane)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint32_t dsparb = I915_READ(DSPARB);
+	int size;
+
+	size = dsparb & 0x7f;
+	size >>= 1; /* Convert to cachelines */
+
+	DRM_DEBUG("FIFO size - (0x%08x) %s: %d\n", dsparb, plane ? "B" : "A",
+		  size);
+
+	return size;
+}
+
+static void g4x_update_wm(struct drm_device *dev, int unused, int unused2,
+			  int unused3, int unused4)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 fw_blc_self = I915_READ(FW_BLC_SELF);
@@ -2129,7 +2443,8 @@ static void g4x_update_wm(struct drm_device *dev)
 	I915_WRITE(FW_BLC_SELF, fw_blc_self);
 }
 
-static void i965_update_wm(struct drm_device *dev)
+static void i965_update_wm(struct drm_device *dev, int unused, int unused2,
+			   int unused3, int unused4)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
@@ -2165,8 +2480,8 @@ static void i9xx_update_wm(struct drm_device *dev, int planea_clock,
 	cacheline_size = planea_params.cacheline_size;
 
 	/* Update per-plane FIFO sizes */
-	planea_params.fifo_size = intel_get_fifo_size(dev, 0);
-	planeb_params.fifo_size = intel_get_fifo_size(dev, 1);
+	planea_params.fifo_size = dev_priv->display.get_fifo_size(dev, 0);
+	planeb_params.fifo_size = dev_priv->display.get_fifo_size(dev, 1);
 
 	planea_wm = intel_calculate_wm(planea_clock, &planea_params,
 				       pixel_size, latency_ns);
@@ -2213,14 +2528,14 @@ static void i9xx_update_wm(struct drm_device *dev, int planea_clock,
 	I915_WRITE(FW_BLC2, fwater_hi);
 }
 
-static void i830_update_wm(struct drm_device *dev, int planea_clock,
-			   int pixel_size)
+static void i830_update_wm(struct drm_device *dev, int planea_clock, int unused,
+			   int unused2, int pixel_size)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	uint32_t fwater_lo = I915_READ(FW_BLC) & ~0xfff;
 	int planea_wm;
 
-	i830_wm_info.fifo_size = intel_get_fifo_size(dev, 0);
+	i830_wm_info.fifo_size = dev_priv->display.get_fifo_size(dev, 0);
 
 	planea_wm = intel_calculate_wm(planea_clock, &i830_wm_info,
 				       pixel_size, latency_ns);
@@ -2264,6 +2579,7 @@ static void i830_update_wm(struct drm_device *dev, int planea_clock,
   */
 static void intel_update_watermarks(struct drm_device *dev)
 {
+	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_crtc *crtc;
 	struct intel_crtc *intel_crtc;
 	int sr_hdisplay = 0;
@@ -2302,15 +2618,8 @@ static void intel_update_watermarks(struct drm_device *dev)
 	else if (IS_IGD(dev))
 		igd_disable_cxsr(dev);
 
-	if (IS_G4X(dev))
-		g4x_update_wm(dev);
-	else if (IS_I965G(dev))
-		i965_update_wm(dev);
-	else if (IS_I9XX(dev) || IS_MOBILE(dev))
-		i9xx_update_wm(dev, planea_clock, planeb_clock, sr_hdisplay,
-			       pixel_size);
-	else
-		i830_update_wm(dev, planea_clock, pixel_size);
+	dev_priv->display.update_wm(dev, planea_clock, planeb_clock,
+				    sr_hdisplay, pixel_size);
 }
 
 static int intel_crtc_mode_set(struct drm_crtc *crtc,
@@ -2323,10 +2632,11 @@ static int intel_crtc_mode_set(struct drm_crtc *crtc,
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
 	int pipe = intel_crtc->pipe;
+	int plane = intel_crtc->plane;
 	int fp_reg = (pipe == 0) ? FPA0 : FPB0;
 	int dpll_reg = (pipe == 0) ? DPLL_A : DPLL_B;
 	int dpll_md_reg = (intel_crtc->pipe == 0) ? DPLL_A_MD : DPLL_B_MD;
-	int dspcntr_reg = (pipe == 0) ? DSPACNTR : DSPBCNTR;
+	int dspcntr_reg = (plane == 0) ? DSPACNTR : DSPBCNTR;
 	int pipeconf_reg = (pipe == 0) ? PIPEACONF : PIPEBCONF;
 	int htot_reg = (pipe == 0) ? HTOTAL_A : HTOTAL_B;
 	int hblank_reg = (pipe == 0) ? HBLANK_A : HBLANK_B;
@@ -2334,8 +2644,8 @@ static int intel_crtc_mode_set(struct drm_crtc *crtc,
 	int vtot_reg = (pipe == 0) ? VTOTAL_A : VTOTAL_B;
 	int vblank_reg = (pipe == 0) ? VBLANK_A : VBLANK_B;
 	int vsync_reg = (pipe == 0) ? VSYNC_A : VSYNC_B;
-	int dspsize_reg = (pipe == 0) ? DSPASIZE : DSPBSIZE;
-	int dsppos_reg = (pipe == 0) ? DSPAPOS : DSPBPOS;
+	int dspsize_reg = (plane == 0) ? DSPASIZE : DSPBSIZE;
+	int dsppos_reg = (plane == 0) ? DSPAPOS : DSPBPOS;
 	int pipesrc_reg = (pipe == 0) ? PIPEASRC : PIPEBSRC;
 	int refclk, num_outputs = 0;
 	intel_clock_t clock, reduced_clock;
@@ -2568,7 +2878,7 @@ static int intel_crtc_mode_set(struct drm_crtc *crtc,
 	   enable color space conversion */
 	if (!IS_IGDNG(dev)) {
 		if (pipe == 0)
-			dspcntr |= DISPPLANE_SEL_PIPE_A;
+			dspcntr &= ~DISPPLANE_SEL_PIPE_MASK;
 		else
 			dspcntr |= DISPPLANE_SEL_PIPE_B;
 	}
@@ -2580,7 +2890,8 @@ static int intel_crtc_mode_set(struct drm_crtc *crtc,
 		 * XXX: No double-wide on 915GM pipe B. Is that the only reason for the
 		 * pipe == 0 check?
 		 */
-		if (mode->clock > intel_get_core_clock_speed(dev) * 9 / 10)
+		if (mode->clock >
+		    dev_priv->display.get_display_clock_speed(dev) * 9 / 10)
 			pipeconf |= PIPEACONF_DOUBLE_WIDE;
 		else
 			pipeconf &= ~PIPEACONF_DOUBLE_WIDE;
@@ -2652,9 +2963,12 @@ static int intel_crtc_mode_set(struct drm_crtc *crtc,
 		udelay(150);
 
 		if (IS_I965G(dev) && !IS_IGDNG(dev)) {
-			sdvo_pixel_multiply = adjusted_mode->clock / mode->clock;
-			I915_WRITE(dpll_md_reg, (0 << DPLL_MD_UDI_DIVIDER_SHIFT) |
+			if (is_sdvo) {
+				sdvo_pixel_multiply = adjusted_mode->clock / mode->clock;
+				I915_WRITE(dpll_md_reg, (0 << DPLL_MD_UDI_DIVIDER_SHIFT) |
 					((sdvo_pixel_multiply - 1) << DPLL_MD_UDI_MULTIPLIER_SHIFT));
+			} else
+				I915_WRITE(dpll_md_reg, 0);
 		} else {
 			/* write it again -- the BIOS does, after all */
 			I915_WRITE(dpll_reg, dpll);
@@ -2734,6 +3048,9 @@ static int intel_crtc_mode_set(struct drm_crtc *crtc,
 	/* Flush the plane changes */
 	ret = intel_pipe_set_base(crtc, x, y, old_fb);
 
+	if ((IS_I965G(dev) || plane == 0))
+		intel_update_fbc(crtc, &crtc->mode);
+
 	intel_update_watermarks(dev);
 
 	drm_vblank_post_modeset(dev, pipe);
@@ -2778,6 +3095,7 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc,
 	struct drm_gem_object *bo;
 	struct drm_i915_gem_object *obj_priv;
 	int pipe = intel_crtc->pipe;
+	int plane = intel_crtc->plane;
 	uint32_t control = (pipe == 0) ? CURACNTR : CURBCNTR;
 	uint32_t base = (pipe == 0) ? CURABASE : CURBBASE;
 	uint32_t temp = I915_READ(control);
@@ -2863,6 +3181,10 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc,
 			i915_gem_object_unpin(intel_crtc->cursor_bo);
 		drm_gem_object_unreference(intel_crtc->cursor_bo);
 	}
+
+	if ((IS_I965G(dev) || plane == 0))
+		intel_update_fbc(crtc, &crtc->mode);
+
 	mutex_unlock(&dev->struct_mutex);
 
 	intel_crtc->cursor_addr = addr;
@@ -3544,6 +3866,14 @@ static void intel_crtc_init(struct drm_device *dev, int pipe)
 		intel_crtc->lut_b[i] = i;
 	}
 
+	/* Swap pipes & planes for FBC on pre-965 */
+	intel_crtc->pipe = pipe;
+	intel_crtc->plane = pipe;
+	if (IS_MOBILE(dev) && (IS_I9XX(dev) && !IS_I965G(dev))) {
+		DRM_DEBUG("swapping pipes & planes for FBC\n");
+		intel_crtc->plane = ((pipe == 0) ? 1 : 0);
+	}
+
 	intel_crtc->cursor_addr = 0;
 	intel_crtc->dpms_mode = DRM_MODE_DPMS_OFF;
 	drm_crtc_helper_add(&intel_crtc->base, &intel_helper_funcs);
@@ -3826,6 +4156,73 @@ void intel_init_clock_gating(struct drm_device *dev)
 	}
 }
 
+/* Set up chip specific display functions */
+static void intel_init_display(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	/* We always want a DPMS function */
+	if (IS_IGDNG(dev))
+		dev_priv->display.dpms = igdng_crtc_dpms;
+	else
+		dev_priv->display.dpms = i9xx_crtc_dpms;
+
+	/* Only mobile has FBC, leave pointers NULL for other chips */
+	if (IS_MOBILE(dev)) {
+		if (IS_GM45(dev)) {
+			dev_priv->display.fbc_enabled = g4x_fbc_enabled;
+			dev_priv->display.enable_fbc = g4x_enable_fbc;
+			dev_priv->display.disable_fbc = g4x_disable_fbc;
+		} else if (IS_I965GM(dev) || IS_I945GM(dev) || IS_I915GM(dev)) {
+			dev_priv->display.fbc_enabled = i8xx_fbc_enabled;
+			dev_priv->display.enable_fbc = i8xx_enable_fbc;
+			dev_priv->display.disable_fbc = i8xx_disable_fbc;
+		}
+		/* 855GM needs testing */
+	}
+
+	/* Returns the core display clock speed */
+	if (IS_I945G(dev))
+		dev_priv->display.get_display_clock_speed =
+			i945_get_display_clock_speed;
+	else if (IS_I915G(dev))
+		dev_priv->display.get_display_clock_speed =
+			i915_get_display_clock_speed;
+	else if (IS_I945GM(dev) || IS_845G(dev) || IS_IGDGM(dev))
+		dev_priv->display.get_display_clock_speed =
+			i9xx_misc_get_display_clock_speed;
+	else if (IS_I915GM(dev))
+		dev_priv->display.get_display_clock_speed =
+			i915gm_get_display_clock_speed;
+	else if (IS_I865G(dev))
+		dev_priv->display.get_display_clock_speed =
+			i865_get_display_clock_speed;
+	else if (IS_I855(dev))
+		dev_priv->display.get_display_clock_speed =
+			i855_get_display_clock_speed;
+	else /* 852, 830 */
+		dev_priv->display.get_display_clock_speed =
+			i830_get_display_clock_speed;
+
+	/* For FIFO watermark updates */
+	if (IS_G4X(dev))
+		dev_priv->display.update_wm = g4x_update_wm;
+	else if (IS_I965G(dev))
+		dev_priv->display.update_wm = i965_update_wm;
+	else if (IS_I9XX(dev) || IS_MOBILE(dev)) {
+		dev_priv->display.update_wm = i9xx_update_wm;
+		dev_priv->display.get_fifo_size = i9xx_get_fifo_size;
+	} else {
+		if (IS_I85X(dev))
+			dev_priv->display.get_fifo_size = i85x_get_fifo_size;
+		else if (IS_845G(dev))
+			dev_priv->display.get_fifo_size = i845_get_fifo_size;
+		else
+			dev_priv->display.get_fifo_size = i830_get_fifo_size;
+		dev_priv->display.update_wm = i830_update_wm;
+	}
+}
+
 void intel_modeset_init(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -3839,6 +4236,8 @@ void intel_modeset_init(struct drm_device *dev)
 
 	dev->mode_config.funcs = (void *)&intel_mode_funcs;
 
+	intel_init_display(dev);
+
 	if (IS_I965G(dev)) {
 		dev->mode_config.max_width = 8192;
 		dev->mode_config.max_height = 8192;
@@ -3904,6 +4303,9 @@ void intel_modeset_cleanup(struct drm_device *dev)
 
 	mutex_unlock(&dev->struct_mutex);
 
+	if (dev_priv->display.disable_fbc)
+		dev_priv->display.disable_fbc(dev);
+
 	drm_mode_config_cleanup(dev);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 3ebbbabfe59..8aa4b7f30da 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -28,6 +28,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-id.h>
 #include <linux/i2c-algo-bit.h>
+#include "i915_drv.h"
 #include "drm_crtc.h"
 
 #include "drm_crtc_helper.h"
@@ -111,8 +112,8 @@ struct intel_output {
 
 struct intel_crtc {
 	struct drm_crtc base;
-	int pipe;
-	int plane;
+	enum pipe pipe;
+	enum plane plane;
 	struct drm_gem_object *cursor_bo;
 	uint32_t cursor_addr;
 	u8 lut_r[256], lut_g[256], lut_b[256];
diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c
index dafc0da1c25..98ae3d73577 100644
--- a/drivers/gpu/drm/i915/intel_lvds.c
+++ b/drivers/gpu/drm/i915/intel_lvds.c
@@ -27,6 +27,7 @@
  *      Jesse Barnes <jesse.barnes@intel.com>
  */
 
+#include <acpi/button.h>
 #include <linux/dmi.h>
 #include <linux/i2c.h>
 #include "drmP.h"
@@ -295,6 +296,10 @@ static bool intel_lvds_mode_fixup(struct drm_encoder *encoder,
 		goto out;
 	}
 
+	/* full screen scale for now */
+	if (IS_IGDNG(dev))
+		goto out;
+
 	/* 965+ wants fuzzy fitting */
 	if (IS_I965G(dev))
 		pfit_control |= (intel_crtc->pipe << PFIT_PIPE_SHIFT) |
@@ -322,8 +327,10 @@ static bool intel_lvds_mode_fixup(struct drm_encoder *encoder,
 	 * to register description and PRM.
 	 * Change the value here to see the borders for debugging
 	 */
-	I915_WRITE(BCLRPAT_A, 0);
-	I915_WRITE(BCLRPAT_B, 0);
+	if (!IS_IGDNG(dev)) {
+		I915_WRITE(BCLRPAT_A, 0);
+		I915_WRITE(BCLRPAT_B, 0);
+	}
 
 	switch (lvds_priv->fitting_mode) {
 	case DRM_MODE_SCALE_CENTER:
@@ -572,7 +579,6 @@ static void intel_lvds_mode_set(struct drm_encoder *encoder,
 	 * settings.
 	 */
 
-	/* No panel fitting yet, fixme */
 	if (IS_IGDNG(dev))
 		return;
 
@@ -585,15 +591,33 @@ static void intel_lvds_mode_set(struct drm_encoder *encoder,
 	I915_WRITE(PFIT_CONTROL, lvds_priv->pfit_control);
 }
 
+/* Some lid devices report incorrect lid status, assume they're connected */
+static const struct dmi_system_id bad_lid_status[] = {
+	{
+		.ident = "Aspire One",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Aspire one"),
+		},
+	},
+	{ }
+};
+
 /**
  * Detect the LVDS connection.
  *
- * This always returns CONNECTOR_STATUS_CONNECTED.  This connector should only have
- * been set up if the LVDS was actually connected anyway.
+ * Since LVDS doesn't have hotlug, we use the lid as a proxy.  Open means
+ * connected and closed means disconnected.  We also send hotplug events as
+ * needed, using lid status notification from the input layer.
  */
 static enum drm_connector_status intel_lvds_detect(struct drm_connector *connector)
 {
-	return connector_status_connected;
+	enum drm_connector_status status = connector_status_connected;
+
+	if (!acpi_lid_open() && !dmi_check_system(bad_lid_status))
+		status = connector_status_disconnected;
+
+	return status;
 }
 
 /**
@@ -632,6 +656,24 @@ static int intel_lvds_get_modes(struct drm_connector *connector)
 	return 0;
 }
 
+static int intel_lid_notify(struct notifier_block *nb, unsigned long val,
+			    void *unused)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(nb, struct drm_i915_private, lid_notifier);
+	struct drm_device *dev = dev_priv->dev;
+
+	if (acpi_lid_open() && !dev_priv->suspended) {
+		mutex_lock(&dev->mode_config.mutex);
+		drm_helper_resume_force_mode(dev);
+		mutex_unlock(&dev->mode_config.mutex);
+	}
+
+	drm_sysfs_hotplug_event(dev_priv->dev);
+
+	return NOTIFY_OK;
+}
+
 /**
  * intel_lvds_destroy - unregister and free LVDS structures
  * @connector: connector to free
@@ -641,10 +683,14 @@ static int intel_lvds_get_modes(struct drm_connector *connector)
  */
 static void intel_lvds_destroy(struct drm_connector *connector)
 {
+	struct drm_device *dev = connector->dev;
 	struct intel_output *intel_output = to_intel_output(connector);
+	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (intel_output->ddc_bus)
 		intel_i2c_destroy(intel_output->ddc_bus);
+	if (dev_priv->lid_notifier.notifier_call)
+		acpi_lid_notifier_unregister(&dev_priv->lid_notifier);
 	drm_sysfs_connector_remove(connector);
 	drm_connector_cleanup(connector);
 	kfree(connector);
@@ -1011,6 +1057,11 @@ out:
 		pwm |= PWM_PCH_ENABLE;
 		I915_WRITE(BLC_PWM_PCH_CTL1, pwm);
 	}
+	dev_priv->lid_notifier.notifier_call = intel_lid_notify;
+	if (acpi_lid_notifier_register(&dev_priv->lid_notifier)) {
+		DRM_DEBUG("lid notifier registration failed\n");
+		dev_priv->lid_notifier.notifier_call = NULL;
+	}
 	drm_sysfs_connector_add(connector);
 	return;
 
diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c
index 0bf28efcf2c..083bec2e50f 100644
--- a/drivers/gpu/drm/i915/intel_sdvo.c
+++ b/drivers/gpu/drm/i915/intel_sdvo.c
@@ -135,6 +135,30 @@ struct intel_sdvo_priv {
 	struct intel_sdvo_dtd save_input_dtd_1, save_input_dtd_2;
 	struct intel_sdvo_dtd save_output_dtd[16];
 	u32 save_SDVOX;
+	/* add the property for the SDVO-TV */
+	struct drm_property *left_property;
+	struct drm_property *right_property;
+	struct drm_property *top_property;
+	struct drm_property *bottom_property;
+	struct drm_property *hpos_property;
+	struct drm_property *vpos_property;
+
+	/* add the property for the SDVO-TV/LVDS */
+	struct drm_property *brightness_property;
+	struct drm_property *contrast_property;
+	struct drm_property *saturation_property;
+	struct drm_property *hue_property;
+
+	/* Add variable to record current setting for the above property */
+	u32	left_margin, right_margin, top_margin, bottom_margin;
+	/* this is to get the range of margin.*/
+	u32	max_hscan,  max_vscan;
+	u32	max_hpos, cur_hpos;
+	u32	max_vpos, cur_vpos;
+	u32	cur_brightness, max_brightness;
+	u32	cur_contrast,	max_contrast;
+	u32	cur_saturation, max_saturation;
+	u32	cur_hue,	max_hue;
 };
 
 static bool
@@ -281,6 +305,31 @@ static const struct _sdvo_cmd_name {
     SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_SDTV_RESOLUTION_SUPPORT),
     SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_SCALED_HDTV_RESOLUTION_SUPPORT),
     SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_SUPPORTED_ENHANCEMENTS),
+    /* Add the op code for SDVO enhancements */
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_MAX_POSITION_H),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_POSITION_H),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_SET_POSITION_H),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_MAX_POSITION_V),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_POSITION_V),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_SET_POSITION_V),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_MAX_SATURATION),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_SATURATION),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_SET_SATURATION),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_MAX_HUE),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_HUE),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_SET_HUE),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_MAX_CONTRAST),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_CONTRAST),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_SET_CONTRAST),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_MAX_BRIGHTNESS),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_BRIGHTNESS),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_SET_BRIGHTNESS),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_MAX_OVERSCAN_H),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_OVERSCAN_H),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_SET_OVERSCAN_H),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_MAX_OVERSCAN_V),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_OVERSCAN_V),
+    SDVO_CMD_NAME_ENTRY(SDVO_CMD_SET_OVERSCAN_V),
     /* HDMI op code */
     SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_SUPP_ENCODE),
     SDVO_CMD_NAME_ENTRY(SDVO_CMD_GET_ENCODE),
@@ -981,7 +1030,7 @@ static void intel_sdvo_set_tv_format(struct intel_output *output)
 
 	status = intel_sdvo_read_response(output, NULL, 0);
 	if (status != SDVO_CMD_STATUS_SUCCESS)
-		DRM_DEBUG("%s: Failed to set TV format\n",
+		DRM_DEBUG_KMS("%s: Failed to set TV format\n",
 			  SDVO_NAME(sdvo_priv));
 }
 
@@ -1792,6 +1841,45 @@ static int intel_sdvo_get_modes(struct drm_connector *connector)
 	return 1;
 }
 
+static
+void intel_sdvo_destroy_enhance_property(struct drm_connector *connector)
+{
+	struct intel_output *intel_output = to_intel_output(connector);
+	struct intel_sdvo_priv *sdvo_priv = intel_output->dev_priv;
+	struct drm_device *dev = connector->dev;
+
+	if (sdvo_priv->is_tv) {
+		if (sdvo_priv->left_property)
+			drm_property_destroy(dev, sdvo_priv->left_property);
+		if (sdvo_priv->right_property)
+			drm_property_destroy(dev, sdvo_priv->right_property);
+		if (sdvo_priv->top_property)
+			drm_property_destroy(dev, sdvo_priv->top_property);
+		if (sdvo_priv->bottom_property)
+			drm_property_destroy(dev, sdvo_priv->bottom_property);
+		if (sdvo_priv->hpos_property)
+			drm_property_destroy(dev, sdvo_priv->hpos_property);
+		if (sdvo_priv->vpos_property)
+			drm_property_destroy(dev, sdvo_priv->vpos_property);
+	}
+	if (sdvo_priv->is_tv) {
+		if (sdvo_priv->saturation_property)
+			drm_property_destroy(dev,
+					sdvo_priv->saturation_property);
+		if (sdvo_priv->contrast_property)
+			drm_property_destroy(dev,
+					sdvo_priv->contrast_property);
+		if (sdvo_priv->hue_property)
+			drm_property_destroy(dev, sdvo_priv->hue_property);
+	}
+	if (sdvo_priv->is_tv || sdvo_priv->is_lvds) {
+		if (sdvo_priv->brightness_property)
+			drm_property_destroy(dev,
+					sdvo_priv->brightness_property);
+	}
+	return;
+}
+
 static void intel_sdvo_destroy(struct drm_connector *connector)
 {
 	struct intel_output *intel_output = to_intel_output(connector);
@@ -1812,6 +1900,9 @@ static void intel_sdvo_destroy(struct drm_connector *connector)
 		drm_property_destroy(connector->dev,
 				     sdvo_priv->tv_format_property);
 
+	if (sdvo_priv->is_tv || sdvo_priv->is_lvds)
+		intel_sdvo_destroy_enhance_property(connector);
+
 	drm_sysfs_connector_remove(connector);
 	drm_connector_cleanup(connector);
 
@@ -1829,6 +1920,8 @@ intel_sdvo_set_property(struct drm_connector *connector,
 	struct drm_crtc *crtc = encoder->crtc;
 	int ret = 0;
 	bool changed = false;
+	uint8_t cmd, status;
+	uint16_t temp_value;
 
 	ret = drm_connector_property_set_value(connector, property, val);
 	if (ret < 0)
@@ -1845,11 +1938,102 @@ intel_sdvo_set_property(struct drm_connector *connector,
 
 		sdvo_priv->tv_format_name = sdvo_priv->tv_format_supported[val];
 		changed = true;
-	} else {
-		ret = -EINVAL;
-		goto out;
 	}
 
+	if (sdvo_priv->is_tv || sdvo_priv->is_lvds) {
+		cmd = 0;
+		temp_value = val;
+		if (sdvo_priv->left_property == property) {
+			drm_connector_property_set_value(connector,
+				sdvo_priv->right_property, val);
+			if (sdvo_priv->left_margin == temp_value)
+				goto out;
+
+			sdvo_priv->left_margin = temp_value;
+			sdvo_priv->right_margin = temp_value;
+			temp_value = sdvo_priv->max_hscan -
+					sdvo_priv->left_margin;
+			cmd = SDVO_CMD_SET_OVERSCAN_H;
+		} else if (sdvo_priv->right_property == property) {
+			drm_connector_property_set_value(connector,
+				sdvo_priv->left_property, val);
+			if (sdvo_priv->right_margin == temp_value)
+				goto out;
+
+			sdvo_priv->left_margin = temp_value;
+			sdvo_priv->right_margin = temp_value;
+			temp_value = sdvo_priv->max_hscan -
+				sdvo_priv->left_margin;
+			cmd = SDVO_CMD_SET_OVERSCAN_H;
+		} else if (sdvo_priv->top_property == property) {
+			drm_connector_property_set_value(connector,
+				sdvo_priv->bottom_property, val);
+			if (sdvo_priv->top_margin == temp_value)
+				goto out;
+
+			sdvo_priv->top_margin = temp_value;
+			sdvo_priv->bottom_margin = temp_value;
+			temp_value = sdvo_priv->max_vscan -
+					sdvo_priv->top_margin;
+			cmd = SDVO_CMD_SET_OVERSCAN_V;
+		} else if (sdvo_priv->bottom_property == property) {
+			drm_connector_property_set_value(connector,
+				sdvo_priv->top_property, val);
+			if (sdvo_priv->bottom_margin == temp_value)
+				goto out;
+			sdvo_priv->top_margin = temp_value;
+			sdvo_priv->bottom_margin = temp_value;
+			temp_value = sdvo_priv->max_vscan -
+					sdvo_priv->top_margin;
+			cmd = SDVO_CMD_SET_OVERSCAN_V;
+		} else if (sdvo_priv->hpos_property == property) {
+			if (sdvo_priv->cur_hpos == temp_value)
+				goto out;
+
+			cmd = SDVO_CMD_SET_POSITION_H;
+			sdvo_priv->cur_hpos = temp_value;
+		} else if (sdvo_priv->vpos_property == property) {
+			if (sdvo_priv->cur_vpos == temp_value)
+				goto out;
+
+			cmd = SDVO_CMD_SET_POSITION_V;
+			sdvo_priv->cur_vpos = temp_value;
+		} else if (sdvo_priv->saturation_property == property) {
+			if (sdvo_priv->cur_saturation == temp_value)
+				goto out;
+
+			cmd = SDVO_CMD_SET_SATURATION;
+			sdvo_priv->cur_saturation = temp_value;
+		} else if (sdvo_priv->contrast_property == property) {
+			if (sdvo_priv->cur_contrast == temp_value)
+				goto out;
+
+			cmd = SDVO_CMD_SET_CONTRAST;
+			sdvo_priv->cur_contrast = temp_value;
+		} else if (sdvo_priv->hue_property == property) {
+			if (sdvo_priv->cur_hue == temp_value)
+				goto out;
+
+			cmd = SDVO_CMD_SET_HUE;
+			sdvo_priv->cur_hue = temp_value;
+		} else if (sdvo_priv->brightness_property == property) {
+			if (sdvo_priv->cur_brightness == temp_value)
+				goto out;
+
+			cmd = SDVO_CMD_SET_BRIGHTNESS;
+			sdvo_priv->cur_brightness = temp_value;
+		}
+		if (cmd) {
+			intel_sdvo_write_cmd(intel_output, cmd, &temp_value, 2);
+			status = intel_sdvo_read_response(intel_output,
+								NULL, 0);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO command \n");
+				return -EINVAL;
+			}
+			changed = true;
+		}
+	}
 	if (changed && crtc)
 		drm_crtc_helper_set_mode(crtc, &crtc->mode, crtc->x,
 				crtc->y, crtc->fb);
@@ -2090,6 +2274,8 @@ intel_sdvo_output_setup(struct intel_output *intel_output, uint16_t flags)
 		sdvo_priv->controlled_output = SDVO_OUTPUT_RGB1;
 		encoder->encoder_type = DRM_MODE_ENCODER_DAC;
 		connector->connector_type = DRM_MODE_CONNECTOR_VGA;
+		intel_output->clone_mask = (1 << INTEL_SDVO_NON_TV_CLONE_BIT) |
+					(1 << INTEL_ANALOG_CLONE_BIT);
 	} else if (flags & SDVO_OUTPUT_LVDS0) {
 
 		sdvo_priv->controlled_output = SDVO_OUTPUT_LVDS0;
@@ -2176,6 +2362,310 @@ static void intel_sdvo_tv_create_property(struct drm_connector *connector)
 
 }
 
+static void intel_sdvo_create_enhance_property(struct drm_connector *connector)
+{
+	struct intel_output *intel_output = to_intel_output(connector);
+	struct intel_sdvo_priv *sdvo_priv = intel_output->dev_priv;
+	struct intel_sdvo_enhancements_reply sdvo_data;
+	struct drm_device *dev = connector->dev;
+	uint8_t status;
+	uint16_t response, data_value[2];
+
+	intel_sdvo_write_cmd(intel_output, SDVO_CMD_GET_SUPPORTED_ENHANCEMENTS,
+						NULL, 0);
+	status = intel_sdvo_read_response(intel_output, &sdvo_data,
+					sizeof(sdvo_data));
+	if (status != SDVO_CMD_STATUS_SUCCESS) {
+		DRM_DEBUG_KMS(" incorrect response is returned\n");
+		return;
+	}
+	response = *((uint16_t *)&sdvo_data);
+	if (!response) {
+		DRM_DEBUG_KMS("No enhancement is supported\n");
+		return;
+	}
+	if (sdvo_priv->is_tv) {
+		/* when horizontal overscan is supported, Add the left/right
+		 * property
+		 */
+		if (sdvo_data.overscan_h) {
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_MAX_OVERSCAN_H, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&data_value, 4);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO max "
+						"h_overscan\n");
+				return;
+			}
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_OVERSCAN_H, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&response, 2);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO h_overscan\n");
+				return;
+			}
+			sdvo_priv->max_hscan = data_value[0];
+			sdvo_priv->left_margin = data_value[0] - response;
+			sdvo_priv->right_margin = sdvo_priv->left_margin;
+			sdvo_priv->left_property =
+				drm_property_create(dev, DRM_MODE_PROP_RANGE,
+						"left_margin", 2);
+			sdvo_priv->left_property->values[0] = 0;
+			sdvo_priv->left_property->values[1] = data_value[0];
+			drm_connector_attach_property(connector,
+						sdvo_priv->left_property,
+						sdvo_priv->left_margin);
+			sdvo_priv->right_property =
+				drm_property_create(dev, DRM_MODE_PROP_RANGE,
+						"right_margin", 2);
+			sdvo_priv->right_property->values[0] = 0;
+			sdvo_priv->right_property->values[1] = data_value[0];
+			drm_connector_attach_property(connector,
+						sdvo_priv->right_property,
+						sdvo_priv->right_margin);
+			DRM_DEBUG_KMS("h_overscan: max %d, "
+					"default %d, current %d\n",
+					data_value[0], data_value[1], response);
+		}
+		if (sdvo_data.overscan_v) {
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_MAX_OVERSCAN_V, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&data_value, 4);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO max "
+						"v_overscan\n");
+				return;
+			}
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_OVERSCAN_V, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&response, 2);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO v_overscan\n");
+				return;
+			}
+			sdvo_priv->max_vscan = data_value[0];
+			sdvo_priv->top_margin = data_value[0] - response;
+			sdvo_priv->bottom_margin = sdvo_priv->top_margin;
+			sdvo_priv->top_property =
+				drm_property_create(dev, DRM_MODE_PROP_RANGE,
+						"top_margin", 2);
+			sdvo_priv->top_property->values[0] = 0;
+			sdvo_priv->top_property->values[1] = data_value[0];
+			drm_connector_attach_property(connector,
+						sdvo_priv->top_property,
+						sdvo_priv->top_margin);
+			sdvo_priv->bottom_property =
+				drm_property_create(dev, DRM_MODE_PROP_RANGE,
+						"bottom_margin", 2);
+			sdvo_priv->bottom_property->values[0] = 0;
+			sdvo_priv->bottom_property->values[1] = data_value[0];
+			drm_connector_attach_property(connector,
+						sdvo_priv->bottom_property,
+						sdvo_priv->bottom_margin);
+			DRM_DEBUG_KMS("v_overscan: max %d, "
+					"default %d, current %d\n",
+					data_value[0], data_value[1], response);
+		}
+		if (sdvo_data.position_h) {
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_MAX_POSITION_H, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&data_value, 4);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO Max h_pos\n");
+				return;
+			}
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_POSITION_H, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&response, 2);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO get h_postion\n");
+				return;
+			}
+			sdvo_priv->max_hpos = data_value[0];
+			sdvo_priv->cur_hpos = response;
+			sdvo_priv->hpos_property =
+				drm_property_create(dev, DRM_MODE_PROP_RANGE,
+						"hpos", 2);
+			sdvo_priv->hpos_property->values[0] = 0;
+			sdvo_priv->hpos_property->values[1] = data_value[0];
+			drm_connector_attach_property(connector,
+						sdvo_priv->hpos_property,
+						sdvo_priv->cur_hpos);
+			DRM_DEBUG_KMS("h_position: max %d, "
+					"default %d, current %d\n",
+					data_value[0], data_value[1], response);
+		}
+		if (sdvo_data.position_v) {
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_MAX_POSITION_V, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&data_value, 4);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO Max v_pos\n");
+				return;
+			}
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_POSITION_V, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&response, 2);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO get v_postion\n");
+				return;
+			}
+			sdvo_priv->max_vpos = data_value[0];
+			sdvo_priv->cur_vpos = response;
+			sdvo_priv->vpos_property =
+				drm_property_create(dev, DRM_MODE_PROP_RANGE,
+						"vpos", 2);
+			sdvo_priv->vpos_property->values[0] = 0;
+			sdvo_priv->vpos_property->values[1] = data_value[0];
+			drm_connector_attach_property(connector,
+						sdvo_priv->vpos_property,
+						sdvo_priv->cur_vpos);
+			DRM_DEBUG_KMS("v_position: max %d, "
+					"default %d, current %d\n",
+					data_value[0], data_value[1], response);
+		}
+	}
+	if (sdvo_priv->is_tv) {
+		if (sdvo_data.saturation) {
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_MAX_SATURATION, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&data_value, 4);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO Max sat\n");
+				return;
+			}
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_SATURATION, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&response, 2);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO get sat\n");
+				return;
+			}
+			sdvo_priv->max_saturation = data_value[0];
+			sdvo_priv->cur_saturation = response;
+			sdvo_priv->saturation_property =
+				drm_property_create(dev, DRM_MODE_PROP_RANGE,
+						"saturation", 2);
+			sdvo_priv->saturation_property->values[0] = 0;
+			sdvo_priv->saturation_property->values[1] =
+							data_value[0];
+			drm_connector_attach_property(connector,
+						sdvo_priv->saturation_property,
+						sdvo_priv->cur_saturation);
+			DRM_DEBUG_KMS("saturation: max %d, "
+					"default %d, current %d\n",
+					data_value[0], data_value[1], response);
+		}
+		if (sdvo_data.contrast) {
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_MAX_CONTRAST, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&data_value, 4);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO Max contrast\n");
+				return;
+			}
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_CONTRAST, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&response, 2);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO get contrast\n");
+				return;
+			}
+			sdvo_priv->max_contrast = data_value[0];
+			sdvo_priv->cur_contrast = response;
+			sdvo_priv->contrast_property =
+				drm_property_create(dev, DRM_MODE_PROP_RANGE,
+						"contrast", 2);
+			sdvo_priv->contrast_property->values[0] = 0;
+			sdvo_priv->contrast_property->values[1] = data_value[0];
+			drm_connector_attach_property(connector,
+						sdvo_priv->contrast_property,
+						sdvo_priv->cur_contrast);
+			DRM_DEBUG_KMS("contrast: max %d, "
+					"default %d, current %d\n",
+					data_value[0], data_value[1], response);
+		}
+		if (sdvo_data.hue) {
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_MAX_HUE, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&data_value, 4);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO Max hue\n");
+				return;
+			}
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_HUE, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&response, 2);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO get hue\n");
+				return;
+			}
+			sdvo_priv->max_hue = data_value[0];
+			sdvo_priv->cur_hue = response;
+			sdvo_priv->hue_property =
+				drm_property_create(dev, DRM_MODE_PROP_RANGE,
+						"hue", 2);
+			sdvo_priv->hue_property->values[0] = 0;
+			sdvo_priv->hue_property->values[1] =
+							data_value[0];
+			drm_connector_attach_property(connector,
+						sdvo_priv->hue_property,
+						sdvo_priv->cur_hue);
+			DRM_DEBUG_KMS("hue: max %d, default %d, current %d\n",
+					data_value[0], data_value[1], response);
+		}
+	}
+	if (sdvo_priv->is_tv || sdvo_priv->is_lvds) {
+		if (sdvo_data.brightness) {
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_MAX_BRIGHTNESS, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&data_value, 4);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO Max bright\n");
+				return;
+			}
+			intel_sdvo_write_cmd(intel_output,
+				SDVO_CMD_GET_BRIGHTNESS, NULL, 0);
+			status = intel_sdvo_read_response(intel_output,
+				&response, 2);
+			if (status != SDVO_CMD_STATUS_SUCCESS) {
+				DRM_DEBUG_KMS("Incorrect SDVO get brigh\n");
+				return;
+			}
+			sdvo_priv->max_brightness = data_value[0];
+			sdvo_priv->cur_brightness = response;
+			sdvo_priv->brightness_property =
+				drm_property_create(dev, DRM_MODE_PROP_RANGE,
+						"brightness", 2);
+			sdvo_priv->brightness_property->values[0] = 0;
+			sdvo_priv->brightness_property->values[1] =
+							data_value[0];
+			drm_connector_attach_property(connector,
+						sdvo_priv->brightness_property,
+						sdvo_priv->cur_brightness);
+			DRM_DEBUG_KMS("brightness: max %d, "
+					"default %d, current %d\n",
+					data_value[0], data_value[1], response);
+		}
+	}
+	return;
+}
+
 bool intel_sdvo_init(struct drm_device *dev, int output_device)
 {
 	struct drm_connector *connector;
@@ -2264,6 +2754,10 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 	drm_mode_connector_attach_encoder(&intel_output->base, &intel_output->enc);
 	if (sdvo_priv->is_tv)
 		intel_sdvo_tv_create_property(connector);
+
+	if (sdvo_priv->is_tv || sdvo_priv->is_lvds)
+		intel_sdvo_create_enhance_property(connector);
+
 	drm_sysfs_connector_add(connector);
 
 	intel_sdvo_select_ddc_bus(sdvo_priv);
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index ed7711d11ae..6857560144b 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -325,34 +325,6 @@ config SENSORS_F75375S
 	  This driver can also be built as a module.  If so, the module
 	  will be called f75375s.
 
-config SENSORS_FSCHER
-	tristate "FSC Hermes (DEPRECATED)"
-	depends on X86 && I2C
-	help
-	  This driver is DEPRECATED please use the new merged fschmd
-	  ("FSC Poseidon, Scylla, Hermes, Heimdall and Heracles") driver
-	  instead.
-
-	  If you say yes here you get support for Fujitsu Siemens
-	  Computers Hermes sensor chips.
-
-	  This driver can also be built as a module.  If so, the module
-	  will be called fscher.
-
-config SENSORS_FSCPOS
-	tristate "FSC Poseidon (DEPRECATED)"
-	depends on X86 && I2C
-	help
-	  This driver is DEPRECATED please use the new merged fschmd
-	  ("FSC Poseidon, Scylla, Hermes, Heimdall and Heracles") driver
-	  instead.
-
-	  If you say yes here you get support for Fujitsu Siemens
-	  Computers Poseidon sensor chips.
-
-	  This driver can also be built as a module.  If so, the module
-	  will be called fscpos.
-
 config SENSORS_FSCHMD
 	tristate "Fujitsu Siemens Computers sensor chips"
 	depends on X86 && I2C
@@ -401,12 +373,12 @@ config SENSORS_GL520SM
 	  will be called gl520sm.
 
 config SENSORS_CORETEMP
-	tristate "Intel Core (2) Duo/Solo temperature sensor"
+	tristate "Intel Core/Core2/Atom temperature sensor"
 	depends on X86 && EXPERIMENTAL
 	help
 	  If you say yes here you get support for the temperature
-	  sensor inside your CPU. Supported all are all known variants
-	  of Intel Core family.
+	  sensor inside your CPU. Most of the family 6 CPUs
+	  are supported. Check documentation/driver for details.
 
 config SENSORS_IBMAEM
 	tristate "IBM Active Energy Manager temperature/power sensors and control"
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index bcf73a9bb61..9f46cb019cc 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -42,9 +42,7 @@ obj-$(CONFIG_SENSORS_DS1621)	+= ds1621.o
 obj-$(CONFIG_SENSORS_F71805F)	+= f71805f.o
 obj-$(CONFIG_SENSORS_F71882FG)	+= f71882fg.o
 obj-$(CONFIG_SENSORS_F75375S)	+= f75375s.o
-obj-$(CONFIG_SENSORS_FSCHER)	+= fscher.o
 obj-$(CONFIG_SENSORS_FSCHMD)	+= fschmd.o
-obj-$(CONFIG_SENSORS_FSCPOS)	+= fscpos.o
 obj-$(CONFIG_SENSORS_G760A)	+= g760a.o
 obj-$(CONFIG_SENSORS_GL518SM)	+= gl518sm.o
 obj-$(CONFIG_SENSORS_GL520SM)	+= gl520sm.o
diff --git a/drivers/hwmon/adm1031.c b/drivers/hwmon/adm1031.c
index 789441830cd..56905955352 100644
--- a/drivers/hwmon/adm1031.c
+++ b/drivers/hwmon/adm1031.c
@@ -37,6 +37,7 @@
 #define ADM1031_REG_PWM			(0x22)
 #define ADM1031_REG_FAN_MIN(nr)		(0x10 + (nr))
 
+#define ADM1031_REG_TEMP_OFFSET(nr)	(0x0d + (nr))
 #define ADM1031_REG_TEMP_MAX(nr)	(0x14 + 4 * (nr))
 #define ADM1031_REG_TEMP_MIN(nr)	(0x15 + 4 * (nr))
 #define ADM1031_REG_TEMP_CRIT(nr)	(0x16 + 4 * (nr))
@@ -93,6 +94,7 @@ struct adm1031_data {
 	u8 auto_temp_min[3];
 	u8 auto_temp_off[3];
 	u8 auto_temp_max[3];
+	s8 temp_offset[3];
 	s8 temp_min[3];
 	s8 temp_max[3];
 	s8 temp_crit[3];
@@ -145,6 +147,10 @@ adm1031_write_value(struct i2c_client *client, u8 reg, unsigned int value)
 
 #define TEMP_FROM_REG_EXT(val, ext)	(TEMP_FROM_REG(val) + (ext) * 125)
 
+#define TEMP_OFFSET_TO_REG(val)		(TEMP_TO_REG(val) & 0x8f)
+#define TEMP_OFFSET_FROM_REG(val)	TEMP_FROM_REG((val) < 0 ? \
+						      (val) | 0x70 : (val))
+
 #define FAN_FROM_REG(reg, div)		((reg) ? (11250 * 60) / ((reg) * (div)) : 0)
 
 static int FAN_TO_REG(int reg, int div)
@@ -585,6 +591,14 @@ static ssize_t show_temp(struct device *dev,
 	    (((data->ext_temp[nr] >> ((nr - 1) * 3)) & 7));
 	return sprintf(buf, "%d\n", TEMP_FROM_REG_EXT(data->temp[nr], ext));
 }
+static ssize_t show_temp_offset(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	int nr = to_sensor_dev_attr(attr)->index;
+	struct adm1031_data *data = adm1031_update_device(dev);
+	return sprintf(buf, "%d\n",
+		       TEMP_OFFSET_FROM_REG(data->temp_offset[nr]));
+}
 static ssize_t show_temp_min(struct device *dev,
 			     struct device_attribute *attr, char *buf)
 {
@@ -606,6 +620,24 @@ static ssize_t show_temp_crit(struct device *dev,
 	struct adm1031_data *data = adm1031_update_device(dev);
 	return sprintf(buf, "%d\n", TEMP_FROM_REG(data->temp_crit[nr]));
 }
+static ssize_t set_temp_offset(struct device *dev,
+			       struct device_attribute *attr, const char *buf,
+			       size_t count)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct adm1031_data *data = i2c_get_clientdata(client);
+	int nr = to_sensor_dev_attr(attr)->index;
+	int val;
+
+	val = simple_strtol(buf, NULL, 10);
+	val = SENSORS_LIMIT(val, -15000, 15000);
+	mutex_lock(&data->update_lock);
+	data->temp_offset[nr] = TEMP_OFFSET_TO_REG(val);
+	adm1031_write_value(client, ADM1031_REG_TEMP_OFFSET(nr),
+			    data->temp_offset[nr]);
+	mutex_unlock(&data->update_lock);
+	return count;
+}
 static ssize_t set_temp_min(struct device *dev, struct device_attribute *attr,
 			    const char *buf, size_t count)
 {
@@ -661,6 +693,8 @@ static ssize_t set_temp_crit(struct device *dev, struct device_attribute *attr,
 #define temp_reg(offset)						\
 static SENSOR_DEVICE_ATTR(temp##offset##_input, S_IRUGO,		\
 		show_temp, NULL, offset - 1);				\
+static SENSOR_DEVICE_ATTR(temp##offset##_offset, S_IRUGO | S_IWUSR,	\
+		show_temp_offset, set_temp_offset, offset - 1);		\
 static SENSOR_DEVICE_ATTR(temp##offset##_min, S_IRUGO | S_IWUSR,	\
 		show_temp_min, set_temp_min, offset - 1);		\
 static SENSOR_DEVICE_ATTR(temp##offset##_max, S_IRUGO | S_IWUSR,	\
@@ -714,6 +748,7 @@ static struct attribute *adm1031_attributes[] = {
 	&sensor_dev_attr_pwm1.dev_attr.attr,
 	&sensor_dev_attr_auto_fan1_channel.dev_attr.attr,
 	&sensor_dev_attr_temp1_input.dev_attr.attr,
+	&sensor_dev_attr_temp1_offset.dev_attr.attr,
 	&sensor_dev_attr_temp1_min.dev_attr.attr,
 	&sensor_dev_attr_temp1_min_alarm.dev_attr.attr,
 	&sensor_dev_attr_temp1_max.dev_attr.attr,
@@ -721,6 +756,7 @@ static struct attribute *adm1031_attributes[] = {
 	&sensor_dev_attr_temp1_crit.dev_attr.attr,
 	&sensor_dev_attr_temp1_crit_alarm.dev_attr.attr,
 	&sensor_dev_attr_temp2_input.dev_attr.attr,
+	&sensor_dev_attr_temp2_offset.dev_attr.attr,
 	&sensor_dev_attr_temp2_min.dev_attr.attr,
 	&sensor_dev_attr_temp2_min_alarm.dev_attr.attr,
 	&sensor_dev_attr_temp2_max.dev_attr.attr,
@@ -757,6 +793,7 @@ static struct attribute *adm1031_attributes_opt[] = {
 	&sensor_dev_attr_pwm2.dev_attr.attr,
 	&sensor_dev_attr_auto_fan2_channel.dev_attr.attr,
 	&sensor_dev_attr_temp3_input.dev_attr.attr,
+	&sensor_dev_attr_temp3_offset.dev_attr.attr,
 	&sensor_dev_attr_temp3_min.dev_attr.attr,
 	&sensor_dev_attr_temp3_min_alarm.dev_attr.attr,
 	&sensor_dev_attr_temp3_max.dev_attr.attr,
@@ -937,6 +974,9 @@ static struct adm1031_data *adm1031_update_device(struct device *dev)
 			}
 			data->temp[chan] = newh;
 
+			data->temp_offset[chan] =
+			    adm1031_read_value(client,
+					       ADM1031_REG_TEMP_OFFSET(chan));
 			data->temp_min[chan] =
 			    adm1031_read_value(client,
 					       ADM1031_REG_TEMP_MIN(chan));
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 972cf4ba963..caef39cda8c 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -157,17 +157,26 @@ static int __devinit adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *
 	/* The 100C is default for both mobile and non mobile CPUs */
 
 	int tjmax = 100000;
-	int ismobile = 1;
+	int tjmax_ee = 85000;
+	int usemsr_ee = 1;
 	int err;
 	u32 eax, edx;
 
 	/* Early chips have no MSR for TjMax */
 
 	if ((c->x86_model == 0xf) && (c->x86_mask < 4)) {
-		ismobile = 0;
+		usemsr_ee = 0;
 	}
 
-	if ((c->x86_model > 0xe) && (ismobile)) {
+	/* Atoms seems to have TjMax at 90C */
+
+	if (c->x86_model == 0x1c) {
+		usemsr_ee = 0;
+		tjmax = 90000;
+	}
+
+	if ((c->x86_model > 0xe) && (usemsr_ee)) {
+		u8 platform_id;
 
 		/* Now we can detect the mobile CPU using Intel provided table
 		   http://softwarecommunity.intel.com/Wiki/Mobility/720.htm
@@ -179,13 +188,29 @@ static int __devinit adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *
 			dev_warn(dev,
 				 "Unable to access MSR 0x17, assuming desktop"
 				 " CPU\n");
-			ismobile = 0;
-		} else if (!(eax & 0x10000000)) {
-			ismobile = 0;
+			usemsr_ee = 0;
+		} else if (c->x86_model < 0x17 && !(eax & 0x10000000)) {
+			/* Trust bit 28 up to Penryn, I could not find any
+			   documentation on that; if you happen to know
+			   someone at Intel please ask */
+			usemsr_ee = 0;
+		} else {
+			/* Platform ID bits 52:50 (EDX starts at bit 32) */
+			platform_id = (edx >> 18) & 0x7;
+
+			/* Mobile Penryn CPU seems to be platform ID 7 or 5
+			  (guesswork) */
+			if ((c->x86_model == 0x17) &&
+			    ((platform_id == 5) || (platform_id == 7))) {
+				/* If MSR EE bit is set, set it to 90 degrees C,
+				   otherwise 105 degrees C */
+				tjmax_ee = 90000;
+				tjmax = 105000;
+			}
 		}
 	}
 
-	if (ismobile || c->x86_model == 0x1c) {
+	if (usemsr_ee) {
 
 		err = rdmsr_safe_on_cpu(id, 0xee, &eax, &edx);
 		if (err) {
@@ -193,9 +218,11 @@ static int __devinit adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *
 				 "Unable to access MSR 0xEE, for Tjmax, left"
 				 " at default");
 		} else if (eax & 0x40000000) {
-			tjmax = 85000;
+			tjmax = tjmax_ee;
 		}
-	} else {
+	/* if we dont use msr EE it means we are desktop CPU (with exeception
+	   of Atom) */
+	} else if (tjmax == 100000) {
 		dev_warn(dev, "Using relative temperature scale!\n");
 	}
 
@@ -248,9 +275,9 @@ static int __devinit coretemp_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, data);
 
 	/* read the still undocumented IA32_TEMPERATURE_TARGET it exists
-	   on older CPUs but not in this register */
+	   on older CPUs but not in this register, Atoms don't have it either */
 
-	if (c->x86_model > 0xe) {
+	if ((c->x86_model > 0xe) && (c->x86_model != 0x1c)) {
 		err = rdmsr_safe_on_cpu(data->id, 0x1a2, &eax, &edx);
 		if (err) {
 			dev_warn(&pdev->dev, "Unable to read"
@@ -413,11 +440,15 @@ static int __init coretemp_init(void)
 	for_each_online_cpu(i) {
 		struct cpuinfo_x86 *c = &cpu_data(i);
 
-		/* check if family 6, models 0xe, 0xf, 0x16, 0x17, 0x1A */
+		/* check if family 6, models 0xe (Pentium M DC),
+		  0xf (Core 2 DC 65nm), 0x16 (Core 2 SC 65nm),
+		  0x17 (Penryn 45nm), 0x1a (Nehalem), 0x1c (Atom),
+		  0x1e (Lynnfield) */
 		if ((c->cpuid_level < 0) || (c->x86 != 0x6) ||
 		    !((c->x86_model == 0xe) || (c->x86_model == 0xf) ||
 			(c->x86_model == 0x16) || (c->x86_model == 0x17) ||
-			(c->x86_model == 0x1A) || (c->x86_model == 0x1c))) {
+			(c->x86_model == 0x1a) || (c->x86_model == 0x1c) ||
+			(c->x86_model == 0x1e))) {
 
 			/* supported CPU not found, but report the unknown
 			   family 6 CPU */
diff --git a/drivers/hwmon/fscher.c b/drivers/hwmon/fscher.c
deleted file mode 100644
index 12c70e402cb..00000000000
--- a/drivers/hwmon/fscher.c
+++ /dev/null
@@ -1,680 +0,0 @@
-/*
- * fscher.c - Part of lm_sensors, Linux kernel modules for hardware
- * monitoring
- * Copyright (C) 2003, 2004 Reinhard Nissl <rnissl@gmx.de>
- * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* 
- *  fujitsu siemens hermes chip, 
- *  module based on fscpos.c 
- *  Copyright (C) 2000 Hermann Jung <hej@odn.de>
- *  Copyright (C) 1998, 1999 Frodo Looijaard <frodol@dds.nl>
- *  and Philip Edelbrock <phil@netroedge.com>
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/jiffies.h>
-#include <linux/i2c.h>
-#include <linux/hwmon.h>
-#include <linux/err.h>
-#include <linux/mutex.h>
-#include <linux/sysfs.h>
-
-/*
- * Addresses to scan
- */
-
-static const unsigned short normal_i2c[] = { 0x73, I2C_CLIENT_END };
-
-/*
- * Insmod parameters
- */
-
-I2C_CLIENT_INSMOD_1(fscher);
-
-/*
- * The FSCHER registers
- */
-
-/* chip identification */
-#define FSCHER_REG_IDENT_0		0x00
-#define FSCHER_REG_IDENT_1		0x01
-#define FSCHER_REG_IDENT_2		0x02
-#define FSCHER_REG_REVISION		0x03
-
-/* global control and status */
-#define FSCHER_REG_EVENT_STATE		0x04
-#define FSCHER_REG_CONTROL		0x05
-
-/* watchdog */
-#define FSCHER_REG_WDOG_PRESET		0x28
-#define FSCHER_REG_WDOG_STATE		0x23
-#define FSCHER_REG_WDOG_CONTROL		0x21
-
-/* fan 0 */
-#define FSCHER_REG_FAN0_MIN		0x55
-#define FSCHER_REG_FAN0_ACT		0x0e
-#define FSCHER_REG_FAN0_STATE		0x0d
-#define FSCHER_REG_FAN0_RIPPLE		0x0f
-
-/* fan 1 */
-#define FSCHER_REG_FAN1_MIN		0x65
-#define FSCHER_REG_FAN1_ACT		0x6b
-#define FSCHER_REG_FAN1_STATE		0x62
-#define FSCHER_REG_FAN1_RIPPLE		0x6f
-
-/* fan 2 */
-#define FSCHER_REG_FAN2_MIN		0xb5
-#define FSCHER_REG_FAN2_ACT		0xbb
-#define FSCHER_REG_FAN2_STATE		0xb2
-#define FSCHER_REG_FAN2_RIPPLE		0xbf
-
-/* voltage supervision */
-#define FSCHER_REG_VOLT_12		0x45
-#define FSCHER_REG_VOLT_5		0x42
-#define FSCHER_REG_VOLT_BATT		0x48
-
-/* temperature 0 */
-#define FSCHER_REG_TEMP0_ACT		0x64
-#define FSCHER_REG_TEMP0_STATE		0x71
-
-/* temperature 1 */
-#define FSCHER_REG_TEMP1_ACT		0x32
-#define FSCHER_REG_TEMP1_STATE		0x81
-
-/* temperature 2 */
-#define FSCHER_REG_TEMP2_ACT		0x35
-#define FSCHER_REG_TEMP2_STATE		0x91
-
-/*
- * Functions declaration
- */
-
-static int fscher_probe(struct i2c_client *client,
-			const struct i2c_device_id *id);
-static int fscher_detect(struct i2c_client *client, int kind,
-			 struct i2c_board_info *info);
-static int fscher_remove(struct i2c_client *client);
-static struct fscher_data *fscher_update_device(struct device *dev);
-static void fscher_init_client(struct i2c_client *client);
-
-static int fscher_read_value(struct i2c_client *client, u8 reg);
-static int fscher_write_value(struct i2c_client *client, u8 reg, u8 value);
-
-/*
- * Driver data (common to all clients)
- */
- 
-static const struct i2c_device_id fscher_id[] = {
-	{ "fscher", fscher },
-	{ }
-};
-
-static struct i2c_driver fscher_driver = {
-	.class		= I2C_CLASS_HWMON,
-	.driver = {
-		.name	= "fscher",
-	},
-	.probe		= fscher_probe,
-	.remove		= fscher_remove,
-	.id_table	= fscher_id,
-	.detect		= fscher_detect,
-	.address_data	= &addr_data,
-};
-
-/*
- * Client data (each client gets its own)
- */
-
-struct fscher_data {
-	struct device *hwmon_dev;
-	struct mutex update_lock;
-	char valid; /* zero until following fields are valid */
-	unsigned long last_updated; /* in jiffies */
-
-	/* register values */
-	u8 revision;		/* revision of chip */
-	u8 global_event;	/* global event status */
-	u8 global_control;	/* global control register */
-	u8 watchdog[3];		/* watchdog */
-	u8 volt[3];		/* 12, 5, battery voltage */ 
-	u8 temp_act[3];		/* temperature */
-	u8 temp_status[3];	/* status of sensor */
-	u8 fan_act[3];		/* fans revolutions per second */
-	u8 fan_status[3];	/* fan status */
-	u8 fan_min[3];		/* fan min value for rps */
-	u8 fan_ripple[3];	/* divider for rps */
-};
-
-/*
- * Sysfs stuff
- */
-
-#define sysfs_r(kind, sub, offset, reg) \
-static ssize_t show_##kind##sub (struct fscher_data *, char *, int); \
-static ssize_t show_##kind##offset##sub (struct device *, struct device_attribute *attr, char *); \
-static ssize_t show_##kind##offset##sub (struct device *dev, struct device_attribute *attr, char *buf) \
-{ \
-	struct fscher_data *data = fscher_update_device(dev); \
-	return show_##kind##sub(data, buf, (offset)); \
-}
-
-#define sysfs_w(kind, sub, offset, reg) \
-static ssize_t set_##kind##sub (struct i2c_client *, struct fscher_data *, const char *, size_t, int, int); \
-static ssize_t set_##kind##offset##sub (struct device *, struct device_attribute *attr, const char *, size_t); \
-static ssize_t set_##kind##offset##sub (struct device *dev, struct device_attribute *attr, const char *buf, size_t count) \
-{ \
-	struct i2c_client *client = to_i2c_client(dev); \
-	struct fscher_data *data = i2c_get_clientdata(client); \
-	return set_##kind##sub(client, data, buf, count, (offset), reg); \
-}
-
-#define sysfs_rw_n(kind, sub, offset, reg) \
-sysfs_r(kind, sub, offset, reg) \
-sysfs_w(kind, sub, offset, reg) \
-static DEVICE_ATTR(kind##offset##sub, S_IRUGO | S_IWUSR, show_##kind##offset##sub, set_##kind##offset##sub);
-
-#define sysfs_rw(kind, sub, reg) \
-sysfs_r(kind, sub, 0, reg) \
-sysfs_w(kind, sub, 0, reg) \
-static DEVICE_ATTR(kind##sub, S_IRUGO | S_IWUSR, show_##kind##0##sub, set_##kind##0##sub);
-
-#define sysfs_ro_n(kind, sub, offset, reg) \
-sysfs_r(kind, sub, offset, reg) \
-static DEVICE_ATTR(kind##offset##sub, S_IRUGO, show_##kind##offset##sub, NULL);
-
-#define sysfs_ro(kind, sub, reg) \
-sysfs_r(kind, sub, 0, reg) \
-static DEVICE_ATTR(kind, S_IRUGO, show_##kind##0##sub, NULL);
-
-#define sysfs_fan(offset, reg_status, reg_min, reg_ripple, reg_act) \
-sysfs_rw_n(pwm,        , offset, reg_min) \
-sysfs_rw_n(fan, _status, offset, reg_status) \
-sysfs_rw_n(fan, _div   , offset, reg_ripple) \
-sysfs_ro_n(fan, _input , offset, reg_act)
-
-#define sysfs_temp(offset, reg_status, reg_act) \
-sysfs_rw_n(temp, _status, offset, reg_status) \
-sysfs_ro_n(temp, _input , offset, reg_act)
-    
-#define sysfs_in(offset, reg_act) \
-sysfs_ro_n(in, _input, offset, reg_act)
-
-#define sysfs_revision(reg_revision) \
-sysfs_ro(revision, , reg_revision)
-
-#define sysfs_alarms(reg_events) \
-sysfs_ro(alarms, , reg_events)
-
-#define sysfs_control(reg_control) \
-sysfs_rw(control, , reg_control)
-
-#define sysfs_watchdog(reg_control, reg_status, reg_preset) \
-sysfs_rw(watchdog, _control, reg_control) \
-sysfs_rw(watchdog, _status , reg_status) \
-sysfs_rw(watchdog, _preset , reg_preset)
-
-sysfs_fan(1, FSCHER_REG_FAN0_STATE, FSCHER_REG_FAN0_MIN,
-	     FSCHER_REG_FAN0_RIPPLE, FSCHER_REG_FAN0_ACT)
-sysfs_fan(2, FSCHER_REG_FAN1_STATE, FSCHER_REG_FAN1_MIN,
-	     FSCHER_REG_FAN1_RIPPLE, FSCHER_REG_FAN1_ACT)
-sysfs_fan(3, FSCHER_REG_FAN2_STATE, FSCHER_REG_FAN2_MIN,
-	     FSCHER_REG_FAN2_RIPPLE, FSCHER_REG_FAN2_ACT)
-
-sysfs_temp(1, FSCHER_REG_TEMP0_STATE, FSCHER_REG_TEMP0_ACT)
-sysfs_temp(2, FSCHER_REG_TEMP1_STATE, FSCHER_REG_TEMP1_ACT)
-sysfs_temp(3, FSCHER_REG_TEMP2_STATE, FSCHER_REG_TEMP2_ACT)
-
-sysfs_in(0, FSCHER_REG_VOLT_12)
-sysfs_in(1, FSCHER_REG_VOLT_5)
-sysfs_in(2, FSCHER_REG_VOLT_BATT)
-
-sysfs_revision(FSCHER_REG_REVISION)
-sysfs_alarms(FSCHER_REG_EVENTS)
-sysfs_control(FSCHER_REG_CONTROL)
-sysfs_watchdog(FSCHER_REG_WDOG_CONTROL, FSCHER_REG_WDOG_STATE, FSCHER_REG_WDOG_PRESET)
-  
-static struct attribute *fscher_attributes[] = {
-	&dev_attr_revision.attr,
-	&dev_attr_alarms.attr,
-	&dev_attr_control.attr,
-
-	&dev_attr_watchdog_status.attr,
-	&dev_attr_watchdog_control.attr,
-	&dev_attr_watchdog_preset.attr,
-
-	&dev_attr_in0_input.attr,
-	&dev_attr_in1_input.attr,
-	&dev_attr_in2_input.attr,
-
-	&dev_attr_fan1_status.attr,
-	&dev_attr_fan1_div.attr,
-	&dev_attr_fan1_input.attr,
-	&dev_attr_pwm1.attr,
-	&dev_attr_fan2_status.attr,
-	&dev_attr_fan2_div.attr,
-	&dev_attr_fan2_input.attr,
-	&dev_attr_pwm2.attr,
-	&dev_attr_fan3_status.attr,
-	&dev_attr_fan3_div.attr,
-	&dev_attr_fan3_input.attr,
-	&dev_attr_pwm3.attr,
-
-	&dev_attr_temp1_status.attr,
-	&dev_attr_temp1_input.attr,
-	&dev_attr_temp2_status.attr,
-	&dev_attr_temp2_input.attr,
-	&dev_attr_temp3_status.attr,
-	&dev_attr_temp3_input.attr,
-	NULL
-};
-
-static const struct attribute_group fscher_group = {
-	.attrs = fscher_attributes,
-};
-
-/*
- * Real code
- */
-
-/* Return 0 if detection is successful, -ENODEV otherwise */
-static int fscher_detect(struct i2c_client *new_client, int kind,
-			 struct i2c_board_info *info)
-{
-	struct i2c_adapter *adapter = new_client->adapter;
-
-	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA))
-		return -ENODEV;
-
-	/* Do the remaining detection unless force or force_fscher parameter */
-	if (kind < 0) {
-		if ((i2c_smbus_read_byte_data(new_client,
-		     FSCHER_REG_IDENT_0) != 0x48)	/* 'H' */
-		 || (i2c_smbus_read_byte_data(new_client,
-		     FSCHER_REG_IDENT_1) != 0x45)	/* 'E' */
-		 || (i2c_smbus_read_byte_data(new_client,
-		     FSCHER_REG_IDENT_2) != 0x52))	/* 'R' */
-			return -ENODEV;
-	}
-
-	strlcpy(info->type, "fscher", I2C_NAME_SIZE);
-
-	return 0;
-}
-
-static int fscher_probe(struct i2c_client *new_client,
-			const struct i2c_device_id *id)
-{
-	struct fscher_data *data;
-	int err;
-
-	data = kzalloc(sizeof(struct fscher_data), GFP_KERNEL);
-	if (!data) {
-		err = -ENOMEM;
-		goto exit;
-	}
-
-	i2c_set_clientdata(new_client, data);
-	data->valid = 0;
-	mutex_init(&data->update_lock);
-
-	fscher_init_client(new_client);
-
-	/* Register sysfs hooks */
-	if ((err = sysfs_create_group(&new_client->dev.kobj, &fscher_group)))
-		goto exit_free;
-
-	data->hwmon_dev = hwmon_device_register(&new_client->dev);
-	if (IS_ERR(data->hwmon_dev)) {
-		err = PTR_ERR(data->hwmon_dev);
-		goto exit_remove_files;
-	}
-
-	return 0;
-
-exit_remove_files:
-	sysfs_remove_group(&new_client->dev.kobj, &fscher_group);
-exit_free:
-	kfree(data);
-exit:
-	return err;
-}
-
-static int fscher_remove(struct i2c_client *client)
-{
-	struct fscher_data *data = i2c_get_clientdata(client);
-
-	hwmon_device_unregister(data->hwmon_dev);
-	sysfs_remove_group(&client->dev.kobj, &fscher_group);
-
-	kfree(data);
-	return 0;
-}
-
-static int fscher_read_value(struct i2c_client *client, u8 reg)
-{
-	dev_dbg(&client->dev, "read reg 0x%02x\n", reg);
-
-	return i2c_smbus_read_byte_data(client, reg);
-}
-
-static int fscher_write_value(struct i2c_client *client, u8 reg, u8 value)
-{
-	dev_dbg(&client->dev, "write reg 0x%02x, val 0x%02x\n",
-		reg, value);
-
-	return i2c_smbus_write_byte_data(client, reg, value);
-}
-
-/* Called when we have found a new FSC Hermes. */
-static void fscher_init_client(struct i2c_client *client)
-{
-	struct fscher_data *data = i2c_get_clientdata(client);
-
-	/* Read revision from chip */
-	data->revision =  fscher_read_value(client, FSCHER_REG_REVISION);
-}
-
-static struct fscher_data *fscher_update_device(struct device *dev)
-{
-	struct i2c_client *client = to_i2c_client(dev);
-	struct fscher_data *data = i2c_get_clientdata(client);
-
-	mutex_lock(&data->update_lock);
-
-	if (time_after(jiffies, data->last_updated + 2 * HZ) || !data->valid) {
-
-		dev_dbg(&client->dev, "Starting fscher update\n");
-
-		data->temp_act[0] = fscher_read_value(client, FSCHER_REG_TEMP0_ACT);
-		data->temp_act[1] = fscher_read_value(client, FSCHER_REG_TEMP1_ACT);
-		data->temp_act[2] = fscher_read_value(client, FSCHER_REG_TEMP2_ACT);
-		data->temp_status[0] = fscher_read_value(client, FSCHER_REG_TEMP0_STATE);
-		data->temp_status[1] = fscher_read_value(client, FSCHER_REG_TEMP1_STATE);
-		data->temp_status[2] = fscher_read_value(client, FSCHER_REG_TEMP2_STATE);
-
-		data->volt[0] = fscher_read_value(client, FSCHER_REG_VOLT_12);
-		data->volt[1] = fscher_read_value(client, FSCHER_REG_VOLT_5);
-		data->volt[2] = fscher_read_value(client, FSCHER_REG_VOLT_BATT);
-
-		data->fan_act[0] = fscher_read_value(client, FSCHER_REG_FAN0_ACT);
-		data->fan_act[1] = fscher_read_value(client, FSCHER_REG_FAN1_ACT);
-		data->fan_act[2] = fscher_read_value(client, FSCHER_REG_FAN2_ACT);
-		data->fan_status[0] = fscher_read_value(client, FSCHER_REG_FAN0_STATE);
-		data->fan_status[1] = fscher_read_value(client, FSCHER_REG_FAN1_STATE);
-		data->fan_status[2] = fscher_read_value(client, FSCHER_REG_FAN2_STATE);
-		data->fan_min[0] = fscher_read_value(client, FSCHER_REG_FAN0_MIN);
-		data->fan_min[1] = fscher_read_value(client, FSCHER_REG_FAN1_MIN);
-		data->fan_min[2] = fscher_read_value(client, FSCHER_REG_FAN2_MIN);
-		data->fan_ripple[0] = fscher_read_value(client, FSCHER_REG_FAN0_RIPPLE);
-		data->fan_ripple[1] = fscher_read_value(client, FSCHER_REG_FAN1_RIPPLE);
-		data->fan_ripple[2] = fscher_read_value(client, FSCHER_REG_FAN2_RIPPLE);
-
-		data->watchdog[0] = fscher_read_value(client, FSCHER_REG_WDOG_PRESET);
-		data->watchdog[1] = fscher_read_value(client, FSCHER_REG_WDOG_STATE);
-		data->watchdog[2] = fscher_read_value(client, FSCHER_REG_WDOG_CONTROL);
-
-		data->global_event = fscher_read_value(client, FSCHER_REG_EVENT_STATE);
-		data->global_control = fscher_read_value(client,
-							FSCHER_REG_CONTROL);
-
-		data->last_updated = jiffies;
-		data->valid = 1;                 
-	}
-
-	mutex_unlock(&data->update_lock);
-
-	return data;
-}
-
-
-
-#define FAN_INDEX_FROM_NUM(nr)	((nr) - 1)
-
-static ssize_t set_fan_status(struct i2c_client *client, struct fscher_data *data,
-			      const char *buf, size_t count, int nr, int reg)
-{
-	/* bits 0..1, 3..7 reserved => mask with 0x04 */  
-	unsigned long v = simple_strtoul(buf, NULL, 10) & 0x04;
-	
-	mutex_lock(&data->update_lock);
-	data->fan_status[FAN_INDEX_FROM_NUM(nr)] &= ~v;
-	fscher_write_value(client, reg, v);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static ssize_t show_fan_status(struct fscher_data *data, char *buf, int nr)
-{
-	/* bits 0..1, 3..7 reserved => mask with 0x04 */  
-	return sprintf(buf, "%u\n", data->fan_status[FAN_INDEX_FROM_NUM(nr)] & 0x04);
-}
-
-static ssize_t set_pwm(struct i2c_client *client, struct fscher_data *data,
-		       const char *buf, size_t count, int nr, int reg)
-{
-	unsigned long v = simple_strtoul(buf, NULL, 10);
-
-	mutex_lock(&data->update_lock);
-	data->fan_min[FAN_INDEX_FROM_NUM(nr)] = v > 0xff ? 0xff : v;
-	fscher_write_value(client, reg, data->fan_min[FAN_INDEX_FROM_NUM(nr)]);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static ssize_t show_pwm(struct fscher_data *data, char *buf, int nr)
-{
-	return sprintf(buf, "%u\n", data->fan_min[FAN_INDEX_FROM_NUM(nr)]);
-}
-
-static ssize_t set_fan_div(struct i2c_client *client, struct fscher_data *data,
-			   const char *buf, size_t count, int nr, int reg)
-{
-	/* supported values: 2, 4, 8 */
-	unsigned long v = simple_strtoul(buf, NULL, 10);
-
-	switch (v) {
-	case 2: v = 1; break;
-	case 4: v = 2; break;
-	case 8: v = 3; break;
-	default:
-		dev_err(&client->dev, "fan_div value %ld not "
-			 "supported. Choose one of 2, 4 or 8!\n", v);
-		return -EINVAL;
-	}
-
-	mutex_lock(&data->update_lock);
-
-	/* bits 2..7 reserved => mask with 0x03 */
-	data->fan_ripple[FAN_INDEX_FROM_NUM(nr)] &= ~0x03;
-	data->fan_ripple[FAN_INDEX_FROM_NUM(nr)] |= v;
-
-	fscher_write_value(client, reg, data->fan_ripple[FAN_INDEX_FROM_NUM(nr)]);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static ssize_t show_fan_div(struct fscher_data *data, char *buf, int nr)
-{
-	/* bits 2..7 reserved => mask with 0x03 */  
-	return sprintf(buf, "%u\n", 1 << (data->fan_ripple[FAN_INDEX_FROM_NUM(nr)] & 0x03));
-}
-
-#define RPM_FROM_REG(val)	(val*60)
-
-static ssize_t show_fan_input (struct fscher_data *data, char *buf, int nr)
-{
-	return sprintf(buf, "%u\n", RPM_FROM_REG(data->fan_act[FAN_INDEX_FROM_NUM(nr)]));
-}
-
-
-
-#define TEMP_INDEX_FROM_NUM(nr)		((nr) - 1)
-
-static ssize_t set_temp_status(struct i2c_client *client, struct fscher_data *data,
-			       const char *buf, size_t count, int nr, int reg)
-{
-	/* bits 2..7 reserved, 0 read only => mask with 0x02 */  
-	unsigned long v = simple_strtoul(buf, NULL, 10) & 0x02;
-
-	mutex_lock(&data->update_lock);
-	data->temp_status[TEMP_INDEX_FROM_NUM(nr)] &= ~v;
-	fscher_write_value(client, reg, v);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static ssize_t show_temp_status(struct fscher_data *data, char *buf, int nr)
-{
-	/* bits 2..7 reserved => mask with 0x03 */
-	return sprintf(buf, "%u\n", data->temp_status[TEMP_INDEX_FROM_NUM(nr)] & 0x03);
-}
-
-#define TEMP_FROM_REG(val)	(((val) - 128) * 1000)
-
-static ssize_t show_temp_input(struct fscher_data *data, char *buf, int nr)
-{
-	return sprintf(buf, "%d\n", TEMP_FROM_REG(data->temp_act[TEMP_INDEX_FROM_NUM(nr)]));
-}
-
-/*
- * The final conversion is specified in sensors.conf, as it depends on
- * mainboard specific values. We export the registers contents as
- * pseudo-hundredths-of-Volts (range 0V - 2.55V). Not that it makes much
- * sense per se, but it minimizes the conversions count and keeps the
- * values within a usual range.
- */
-#define VOLT_FROM_REG(val)	((val) * 10)
-
-static ssize_t show_in_input(struct fscher_data *data, char *buf, int nr)
-{
-	return sprintf(buf, "%u\n", VOLT_FROM_REG(data->volt[nr]));
-}
-
-
-
-static ssize_t show_revision(struct fscher_data *data, char *buf, int nr)
-{
-	return sprintf(buf, "%u\n", data->revision);
-}
-
-
-
-static ssize_t show_alarms(struct fscher_data *data, char *buf, int nr)
-{
-	/* bits 2, 5..6 reserved => mask with 0x9b */
-	return sprintf(buf, "%u\n", data->global_event & 0x9b);
-}
-
-
-
-static ssize_t set_control(struct i2c_client *client, struct fscher_data *data,
-			   const char *buf, size_t count, int nr, int reg)
-{
-	/* bits 1..7 reserved => mask with 0x01 */  
-	unsigned long v = simple_strtoul(buf, NULL, 10) & 0x01;
-
-	mutex_lock(&data->update_lock);
-	data->global_control = v;
-	fscher_write_value(client, reg, v);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static ssize_t show_control(struct fscher_data *data, char *buf, int nr)
-{
-	/* bits 1..7 reserved => mask with 0x01 */
-	return sprintf(buf, "%u\n", data->global_control & 0x01);
-}
-
-
-
-static ssize_t set_watchdog_control(struct i2c_client *client, struct
-				    fscher_data *data, const char *buf, size_t count,
-				    int nr, int reg)
-{
-	/* bits 0..3 reserved => mask with 0xf0 */  
-	unsigned long v = simple_strtoul(buf, NULL, 10) & 0xf0;
-
-	mutex_lock(&data->update_lock);
-	data->watchdog[2] &= ~0xf0;
-	data->watchdog[2] |= v;
-	fscher_write_value(client, reg, data->watchdog[2]);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static ssize_t show_watchdog_control(struct fscher_data *data, char *buf, int nr)
-{
-	/* bits 0..3 reserved, bit 5 write only => mask with 0xd0 */
-	return sprintf(buf, "%u\n", data->watchdog[2] & 0xd0);
-}
-
-static ssize_t set_watchdog_status(struct i2c_client *client, struct fscher_data *data,
-				   const char *buf, size_t count, int nr, int reg)
-{
-	/* bits 0, 2..7 reserved => mask with 0x02 */  
-	unsigned long v = simple_strtoul(buf, NULL, 10) & 0x02;
-
-	mutex_lock(&data->update_lock);
-	data->watchdog[1] &= ~v;
-	fscher_write_value(client, reg, v);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static ssize_t show_watchdog_status(struct fscher_data *data, char *buf, int nr)
-{
-	/* bits 0, 2..7 reserved => mask with 0x02 */
-	return sprintf(buf, "%u\n", data->watchdog[1] & 0x02);
-}
-
-static ssize_t set_watchdog_preset(struct i2c_client *client, struct fscher_data *data,
-				   const char *buf, size_t count, int nr, int reg)
-{
-	unsigned long v = simple_strtoul(buf, NULL, 10) & 0xff;
-	
-	mutex_lock(&data->update_lock);
-	data->watchdog[0] = v;
-	fscher_write_value(client, reg, data->watchdog[0]);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static ssize_t show_watchdog_preset(struct fscher_data *data, char *buf, int nr)
-{
-	return sprintf(buf, "%u\n", data->watchdog[0]);
-}
-
-static int __init sensors_fscher_init(void)
-{
-	return i2c_add_driver(&fscher_driver);
-}
-
-static void __exit sensors_fscher_exit(void)
-{
-	i2c_del_driver(&fscher_driver);
-}
-
-MODULE_AUTHOR("Reinhard Nissl <rnissl@gmx.de>");
-MODULE_DESCRIPTION("FSC Hermes driver");
-MODULE_LICENSE("GPL");
-
-module_init(sensors_fscher_init);
-module_exit(sensors_fscher_exit);
diff --git a/drivers/hwmon/fscpos.c b/drivers/hwmon/fscpos.c
deleted file mode 100644
index 8a7bcf500b4..00000000000
--- a/drivers/hwmon/fscpos.c
+++ /dev/null
@@ -1,654 +0,0 @@
-/*
-	fscpos.c - Kernel module for hardware monitoring with FSC Poseidon chips
-	Copyright (C) 2004, 2005 Stefan Ott <stefan@desire.ch>
-
-	This program is free software; you can redistribute it and/or modify
-	it under the terms of the GNU General Public License as published by
-	the Free Software Foundation; either version 2 of the License, or
-	(at your option) any later version.
-
-	This program is distributed in the hope that it will be useful,
-	but WITHOUT ANY WARRANTY; without even the implied warranty of
-	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-	GNU General Public License for more details.
-
-	You should have received a copy of the GNU General Public License
-	along with this program; if not, write to the Free Software
-	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-
-/*
-	fujitsu siemens poseidon chip,
-	module based on the old fscpos module by Hermann Jung <hej@odn.de> and
-	the fscher module by Reinhard Nissl <rnissl@gmx.de>
-
-	original module based on lm80.c
-	Copyright (C) 1998, 1999 Frodo Looijaard <frodol@dds.nl>
-	and Philip Edelbrock <phil@netroedge.com>
-
-	Thanks to Jean Delvare for reviewing my code and suggesting a lot of
-	improvements.
-*/
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/jiffies.h>
-#include <linux/i2c.h>
-#include <linux/init.h>
-#include <linux/hwmon.h>
-#include <linux/err.h>
-#include <linux/mutex.h>
-#include <linux/sysfs.h>
-
-/*
- * Addresses to scan
- */
-static const unsigned short normal_i2c[] = { 0x73, I2C_CLIENT_END };
-
-/*
- * Insmod parameters
- */
-I2C_CLIENT_INSMOD_1(fscpos);
-
-/*
- * The FSCPOS registers
- */
-
-/* chip identification */
-#define FSCPOS_REG_IDENT_0		0x00
-#define FSCPOS_REG_IDENT_1		0x01
-#define FSCPOS_REG_IDENT_2		0x02
-#define FSCPOS_REG_REVISION		0x03
-
-/* global control and status */
-#define FSCPOS_REG_EVENT_STATE		0x04
-#define FSCPOS_REG_CONTROL		0x05
-
-/* watchdog */
-#define FSCPOS_REG_WDOG_PRESET		0x28
-#define FSCPOS_REG_WDOG_STATE		0x23
-#define FSCPOS_REG_WDOG_CONTROL		0x21
-
-/* voltages */
-#define FSCPOS_REG_VOLT_12		0x45
-#define FSCPOS_REG_VOLT_5		0x42
-#define FSCPOS_REG_VOLT_BATT		0x48
-
-/* fans - the chip does not support minimum speed for fan2 */
-static u8 FSCPOS_REG_PWM[] = { 0x55, 0x65 };
-static u8 FSCPOS_REG_FAN_ACT[] = { 0x0e, 0x6b, 0xab };
-static u8 FSCPOS_REG_FAN_STATE[] = { 0x0d, 0x62, 0xa2 };
-static u8 FSCPOS_REG_FAN_RIPPLE[] = { 0x0f, 0x6f, 0xaf };
-
-/* temperatures */
-static u8 FSCPOS_REG_TEMP_ACT[] = { 0x64, 0x32, 0x35 };
-static u8 FSCPOS_REG_TEMP_STATE[] = { 0x71, 0x81, 0x91 };
-
-/*
- * Functions declaration
- */
-static int fscpos_probe(struct i2c_client *client,
-			const struct i2c_device_id *id);
-static int fscpos_detect(struct i2c_client *client, int kind,
-			 struct i2c_board_info *info);
-static int fscpos_remove(struct i2c_client *client);
-
-static int fscpos_read_value(struct i2c_client *client, u8 reg);
-static int fscpos_write_value(struct i2c_client *client, u8 reg, u8 value);
-static struct fscpos_data *fscpos_update_device(struct device *dev);
-static void fscpos_init_client(struct i2c_client *client);
-
-static void reset_fan_alarm(struct i2c_client *client, int nr);
-
-/*
- * Driver data (common to all clients)
- */
-static const struct i2c_device_id fscpos_id[] = {
-	{ "fscpos", fscpos },
-	{ }
-};
-
-static struct i2c_driver fscpos_driver = {
-	.class		= I2C_CLASS_HWMON,
-	.driver = {
-		.name	= "fscpos",
-	},
-	.probe		= fscpos_probe,
-	.remove		= fscpos_remove,
-	.id_table	= fscpos_id,
-	.detect		= fscpos_detect,
-	.address_data	= &addr_data,
-};
-
-/*
- * Client data (each client gets its own)
- */
-struct fscpos_data {
-	struct device *hwmon_dev;
-	struct mutex update_lock;
-	char valid; 		/* 0 until following fields are valid */
-	unsigned long last_updated;	/* In jiffies */
-
-	/* register values */
-	u8 revision;		/* revision of chip */
-	u8 global_event;	/* global event status */
-	u8 global_control;	/* global control register */
-	u8 wdog_control;	/* watchdog control */
-	u8 wdog_state;		/* watchdog status */
-	u8 wdog_preset;		/* watchdog preset */
-	u8 volt[3];		/* 12, 5, battery current */
-	u8 temp_act[3];		/* temperature */
-	u8 temp_status[3];	/* status of sensor */
-	u8 fan_act[3];		/* fans revolutions per second */
-	u8 fan_status[3];	/* fan status */
-	u8 pwm[2];		/* fan min value for rps */
-	u8 fan_ripple[3];	/* divider for rps */
-};
-
-/* Temperature */
-#define TEMP_FROM_REG(val)	(((val) - 128) * 1000)
-
-static ssize_t show_temp_input(struct fscpos_data *data, char *buf, int nr)
-{
-	return sprintf(buf, "%d\n", TEMP_FROM_REG(data->temp_act[nr - 1]));
-}
-
-static ssize_t show_temp_status(struct fscpos_data *data, char *buf, int nr)
-{
-	/* bits 2..7 reserved => mask with 0x03 */
-	return sprintf(buf, "%u\n", data->temp_status[nr - 1] & 0x03);
-}
-
-static ssize_t show_temp_reset(struct fscpos_data *data, char *buf, int nr)
-{
-	return sprintf(buf, "1\n");
-}
-
-static ssize_t set_temp_reset(struct i2c_client *client, struct fscpos_data
-			*data, const char *buf,	size_t count, int nr, int reg)
-{
-	unsigned long v = simple_strtoul(buf, NULL, 10);
-	if (v != 1) {
-		dev_err(&client->dev, "temp_reset value %ld not supported. "
-					"Use 1 to reset the alarm!\n", v);
-		return -EINVAL;
-	}
-
-	dev_info(&client->dev, "You used the temp_reset feature which has not "
-				"been proplerly tested. Please report your "
-				"experience to the module author.\n");
-
-	/* Supported value: 2 (clears the status) */
-	fscpos_write_value(client, FSCPOS_REG_TEMP_STATE[nr - 1], 2);
-	return count;
-}
-
-/* Fans */
-#define RPM_FROM_REG(val)	((val) * 60)
-
-static ssize_t show_fan_status(struct fscpos_data *data, char *buf, int nr)
-{
-	/* bits 0..1, 3..7 reserved => mask with 0x04 */
-	return sprintf(buf, "%u\n", data->fan_status[nr - 1] & 0x04);
-}
-
-static ssize_t show_fan_input(struct fscpos_data *data, char *buf, int nr)
-{
-	return sprintf(buf, "%u\n", RPM_FROM_REG(data->fan_act[nr - 1]));
-}
-
-static ssize_t show_fan_ripple(struct fscpos_data *data, char *buf, int nr)
-{
-	/* bits 2..7 reserved => mask with 0x03 */
-	return sprintf(buf, "%u\n", data->fan_ripple[nr - 1] & 0x03);
-}
-
-static ssize_t set_fan_ripple(struct i2c_client *client, struct fscpos_data
-			*data, const char *buf,	size_t count, int nr, int reg)
-{
-	/* supported values: 2, 4, 8 */
-	unsigned long v = simple_strtoul(buf, NULL, 10);
-
-	switch (v) {
-		case 2: v = 1; break;
-		case 4: v = 2; break;
-		case 8: v = 3; break;
-	default:
-		dev_err(&client->dev, "fan_ripple value %ld not supported. "
-					"Must be one of 2, 4 or 8!\n", v);
-		return -EINVAL;
-	}
-	
-	mutex_lock(&data->update_lock);
-	/* bits 2..7 reserved => mask with 0x03 */
-	data->fan_ripple[nr - 1] &= ~0x03;
-	data->fan_ripple[nr - 1] |= v;
-	
-	fscpos_write_value(client, reg, data->fan_ripple[nr - 1]);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static ssize_t show_pwm(struct fscpos_data *data, char *buf, int nr)
-{
-	return sprintf(buf, "%u\n", data->pwm[nr - 1]);
-}
-
-static ssize_t set_pwm(struct i2c_client *client, struct fscpos_data *data,
-				const char *buf, size_t count, int nr, int reg)
-{
-	unsigned long v = simple_strtoul(buf, NULL, 10);
-
-	/* Range: 0..255 */
-	if (v < 0) v = 0;
-	if (v > 255) v = 255;
-
-	mutex_lock(&data->update_lock);
-	data->pwm[nr - 1] = v;
-	fscpos_write_value(client, reg, data->pwm[nr - 1]);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static void reset_fan_alarm(struct i2c_client *client, int nr)
-{
-	fscpos_write_value(client, FSCPOS_REG_FAN_STATE[nr], 4);
-}
-
-/* Volts */
-#define VOLT_FROM_REG(val, mult)	((val) * (mult) / 255)
-
-static ssize_t show_volt_12(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct fscpos_data *data = fscpos_update_device(dev);
-	return sprintf(buf, "%u\n", VOLT_FROM_REG(data->volt[0], 14200));
-}
-
-static ssize_t show_volt_5(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct fscpos_data *data = fscpos_update_device(dev);
-	return sprintf(buf, "%u\n", VOLT_FROM_REG(data->volt[1], 6600));
-}
-
-static ssize_t show_volt_batt(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct fscpos_data *data = fscpos_update_device(dev);
-	return sprintf(buf, "%u\n", VOLT_FROM_REG(data->volt[2], 3300));
-}
-
-/* Watchdog */
-static ssize_t show_wdog_control(struct fscpos_data *data, char *buf)
-{
-	/* bits 0..3 reserved, bit 6 write only => mask with 0xb0 */
-	return sprintf(buf, "%u\n", data->wdog_control & 0xb0);
-}
-
-static ssize_t set_wdog_control(struct i2c_client *client, struct fscpos_data
-				*data, const char *buf,	size_t count, int reg)
-{
-	/* bits 0..3 reserved => mask with 0xf0 */
-	unsigned long v = simple_strtoul(buf, NULL, 10) & 0xf0;
-
-	mutex_lock(&data->update_lock);
-	data->wdog_control &= ~0xf0;
-	data->wdog_control |= v;
-	fscpos_write_value(client, reg, data->wdog_control);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static ssize_t show_wdog_state(struct fscpos_data *data, char *buf)
-{
-	/* bits 0, 2..7 reserved => mask with 0x02 */
-	return sprintf(buf, "%u\n", data->wdog_state & 0x02);
-}
-
-static ssize_t set_wdog_state(struct i2c_client *client, struct fscpos_data
-				*data, const char *buf, size_t count, int reg)
-{
-	unsigned long v = simple_strtoul(buf, NULL, 10) & 0x02;
-
-	/* Valid values: 2 (clear) */
-	if (v != 2) {
-		dev_err(&client->dev, "wdog_state value %ld not supported. "
-					"Must be 2 to clear the state!\n", v);
-		return -EINVAL;
-	}
-
-	mutex_lock(&data->update_lock);
-	data->wdog_state &= ~v;
-	fscpos_write_value(client, reg, v);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-static ssize_t show_wdog_preset(struct fscpos_data *data, char *buf)
-{
-	return sprintf(buf, "%u\n", data->wdog_preset);
-}
-
-static ssize_t set_wdog_preset(struct i2c_client *client, struct fscpos_data
-				*data, const char *buf,	size_t count, int reg)
-{
-	unsigned long v = simple_strtoul(buf, NULL, 10) & 0xff;
-
-	mutex_lock(&data->update_lock);
-	data->wdog_preset = v;
-	fscpos_write_value(client, reg, data->wdog_preset);
-	mutex_unlock(&data->update_lock);
-	return count;
-}
-
-/* Event */
-static ssize_t show_event(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	/* bits 5..7 reserved => mask with 0x1f */
-	struct fscpos_data *data = fscpos_update_device(dev);
-	return sprintf(buf, "%u\n", data->global_event & 0x9b);
-}
-
-/*
- * Sysfs stuff
- */
-#define create_getter(kind, sub) \
-	static ssize_t sysfs_show_##kind##sub(struct device *dev, struct device_attribute *attr, char *buf) \
-	{ \
-		struct fscpos_data *data = fscpos_update_device(dev); \
-		return show_##kind##sub(data, buf); \
-	}
-
-#define create_getter_n(kind, offset, sub) \
-	static ssize_t sysfs_show_##kind##offset##sub(struct device *dev, struct device_attribute *attr, char\
-								 	*buf) \
-	{ \
-		struct fscpos_data *data = fscpos_update_device(dev); \
-		return show_##kind##sub(data, buf, offset); \
-	}
-
-#define create_setter(kind, sub, reg) \
-	static ssize_t sysfs_set_##kind##sub (struct device *dev, struct device_attribute *attr, const char \
-							*buf, size_t count) \
-	{ \
-		struct i2c_client *client = to_i2c_client(dev); \
-		struct fscpos_data *data = i2c_get_clientdata(client); \
-		return set_##kind##sub(client, data, buf, count, reg); \
-	}
-
-#define create_setter_n(kind, offset, sub, reg) \
-	static ssize_t sysfs_set_##kind##offset##sub (struct device *dev, struct device_attribute *attr, \
-					const char *buf, size_t count) \
-	{ \
-		struct i2c_client *client = to_i2c_client(dev); \
-		struct fscpos_data *data = i2c_get_clientdata(client); \
-		return set_##kind##sub(client, data, buf, count, offset, reg);\
-	}
-
-#define create_sysfs_device_ro(kind, sub, offset) \
-	static DEVICE_ATTR(kind##offset##sub, S_IRUGO, \
-					sysfs_show_##kind##offset##sub, NULL);
-
-#define create_sysfs_device_rw(kind, sub, offset) \
-	static DEVICE_ATTR(kind##offset##sub, S_IRUGO | S_IWUSR, \
-		sysfs_show_##kind##offset##sub, sysfs_set_##kind##offset##sub);
-
-#define sysfs_ro_n(kind, sub, offset) \
-	create_getter_n(kind, offset, sub); \
-	create_sysfs_device_ro(kind, sub, offset);
-
-#define sysfs_rw_n(kind, sub, offset, reg) \
-	create_getter_n(kind, offset, sub); \
-	create_setter_n(kind, offset, sub, reg); \
-	create_sysfs_device_rw(kind, sub, offset);
-
-#define sysfs_rw(kind, sub, reg) \
-	create_getter(kind, sub); \
-	create_setter(kind, sub, reg); \
-	create_sysfs_device_rw(kind, sub,);
-
-#define sysfs_fan_with_min(offset, reg_status, reg_ripple, reg_min) \
-	sysfs_fan(offset, reg_status, reg_ripple); \
-	sysfs_rw_n(pwm,, offset, reg_min);
-
-#define sysfs_fan(offset, reg_status, reg_ripple) \
-	sysfs_ro_n(fan, _input, offset); \
-	sysfs_ro_n(fan, _status, offset); \
-	sysfs_rw_n(fan, _ripple, offset, reg_ripple);
-
-#define sysfs_temp(offset, reg_status) \
-	sysfs_ro_n(temp, _input, offset); \
-	sysfs_ro_n(temp, _status, offset); \
-	sysfs_rw_n(temp, _reset, offset, reg_status);
-
-#define sysfs_watchdog(reg_wdog_preset, reg_wdog_state, reg_wdog_control) \
-	sysfs_rw(wdog, _control, reg_wdog_control); \
-	sysfs_rw(wdog, _preset, reg_wdog_preset); \
-	sysfs_rw(wdog, _state, reg_wdog_state);
-
-sysfs_fan_with_min(1, FSCPOS_REG_FAN_STATE[0], FSCPOS_REG_FAN_RIPPLE[0],
-							FSCPOS_REG_PWM[0]);
-sysfs_fan_with_min(2, FSCPOS_REG_FAN_STATE[1], FSCPOS_REG_FAN_RIPPLE[1],
-							FSCPOS_REG_PWM[1]);
-sysfs_fan(3, FSCPOS_REG_FAN_STATE[2], FSCPOS_REG_FAN_RIPPLE[2]);
-
-sysfs_temp(1, FSCPOS_REG_TEMP_STATE[0]);
-sysfs_temp(2, FSCPOS_REG_TEMP_STATE[1]);
-sysfs_temp(3, FSCPOS_REG_TEMP_STATE[2]);
-
-sysfs_watchdog(FSCPOS_REG_WDOG_PRESET, FSCPOS_REG_WDOG_STATE,
-						FSCPOS_REG_WDOG_CONTROL);
-
-static DEVICE_ATTR(event, S_IRUGO, show_event, NULL);
-static DEVICE_ATTR(in0_input, S_IRUGO, show_volt_12, NULL);
-static DEVICE_ATTR(in1_input, S_IRUGO, show_volt_5, NULL);
-static DEVICE_ATTR(in2_input, S_IRUGO, show_volt_batt, NULL);
-
-static struct attribute *fscpos_attributes[] = {
-	&dev_attr_event.attr,
-	&dev_attr_in0_input.attr,
-	&dev_attr_in1_input.attr,
-	&dev_attr_in2_input.attr,
-
-	&dev_attr_wdog_control.attr,
-	&dev_attr_wdog_preset.attr,
-	&dev_attr_wdog_state.attr,
-
-	&dev_attr_temp1_input.attr,
-	&dev_attr_temp1_status.attr,
-	&dev_attr_temp1_reset.attr,
-	&dev_attr_temp2_input.attr,
-	&dev_attr_temp2_status.attr,
-	&dev_attr_temp2_reset.attr,
-	&dev_attr_temp3_input.attr,
-	&dev_attr_temp3_status.attr,
-	&dev_attr_temp3_reset.attr,
-
-	&dev_attr_fan1_input.attr,
-	&dev_attr_fan1_status.attr,
-	&dev_attr_fan1_ripple.attr,
-	&dev_attr_pwm1.attr,
-	&dev_attr_fan2_input.attr,
-	&dev_attr_fan2_status.attr,
-	&dev_attr_fan2_ripple.attr,
-	&dev_attr_pwm2.attr,
-	&dev_attr_fan3_input.attr,
-	&dev_attr_fan3_status.attr,
-	&dev_attr_fan3_ripple.attr,
-	NULL
-};
-
-static const struct attribute_group fscpos_group = {
-	.attrs = fscpos_attributes,
-};
-
-/* Return 0 if detection is successful, -ENODEV otherwise */
-static int fscpos_detect(struct i2c_client *new_client, int kind,
-			 struct i2c_board_info *info)
-{
-	struct i2c_adapter *adapter = new_client->adapter;
-
-	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA))
-		return -ENODEV;
-
-	/* Do the remaining detection unless force or force_fscpos parameter */
-	if (kind < 0) {
-		if ((fscpos_read_value(new_client, FSCPOS_REG_IDENT_0)
-			!= 0x50) /* 'P' */
-		|| (fscpos_read_value(new_client, FSCPOS_REG_IDENT_1)
-			!= 0x45) /* 'E' */
-		|| (fscpos_read_value(new_client, FSCPOS_REG_IDENT_2)
-			!= 0x47))/* 'G' */
-			return -ENODEV;
-	}
-
-	strlcpy(info->type, "fscpos", I2C_NAME_SIZE);
-
-	return 0;
-}
-
-static int fscpos_probe(struct i2c_client *new_client,
-			const struct i2c_device_id *id)
-{
-	struct fscpos_data *data;
-	int err;
-
-	data = kzalloc(sizeof(struct fscpos_data), GFP_KERNEL);
-	if (!data) {
-		err = -ENOMEM;
-		goto exit;
-	}
-
-	i2c_set_clientdata(new_client, data);
-	data->valid = 0;
-	mutex_init(&data->update_lock);
-
-	/* Inizialize the fscpos chip */
-	fscpos_init_client(new_client);
-
-	/* Announce that the chip was found */
-	dev_info(&new_client->dev, "Found fscpos chip, rev %u\n", data->revision);
-
-	/* Register sysfs hooks */
-	if ((err = sysfs_create_group(&new_client->dev.kobj, &fscpos_group)))
-		goto exit_free;
-
-	data->hwmon_dev = hwmon_device_register(&new_client->dev);
-	if (IS_ERR(data->hwmon_dev)) {
-		err = PTR_ERR(data->hwmon_dev);
-		goto exit_remove_files;
-	}
-
-	return 0;
-
-exit_remove_files:
-	sysfs_remove_group(&new_client->dev.kobj, &fscpos_group);
-exit_free:
-	kfree(data);
-exit:
-	return err;
-}
-
-static int fscpos_remove(struct i2c_client *client)
-{
-	struct fscpos_data *data = i2c_get_clientdata(client);
-
-	hwmon_device_unregister(data->hwmon_dev);
-	sysfs_remove_group(&client->dev.kobj, &fscpos_group);
-
-	kfree(data);
-	return 0;
-}
-
-static int fscpos_read_value(struct i2c_client *client, u8 reg)
-{
-	dev_dbg(&client->dev, "Read reg 0x%02x\n", reg);
-	return i2c_smbus_read_byte_data(client, reg);
-}
-
-static int fscpos_write_value(struct i2c_client *client, u8 reg, u8 value)
-{
-	dev_dbg(&client->dev, "Write reg 0x%02x, val 0x%02x\n", reg, value);
-	return i2c_smbus_write_byte_data(client, reg, value);
-}
-
-/* Called when we have found a new FSCPOS chip */
-static void fscpos_init_client(struct i2c_client *client)
-{
-	struct fscpos_data *data = i2c_get_clientdata(client);
-
-	/* read revision from chip */
-	data->revision = fscpos_read_value(client, FSCPOS_REG_REVISION);
-}
-
-static struct fscpos_data *fscpos_update_device(struct device *dev)
-{
-	struct i2c_client *client = to_i2c_client(dev);
-	struct fscpos_data *data = i2c_get_clientdata(client);
-
-	mutex_lock(&data->update_lock);
-
-	if (time_after(jiffies, data->last_updated + 2 * HZ) || !data->valid) {
-		int i;
-
-		dev_dbg(&client->dev, "Starting fscpos update\n");
-
-		for (i = 0; i < 3; i++) {
-			data->temp_act[i] = fscpos_read_value(client,
-						FSCPOS_REG_TEMP_ACT[i]);
-			data->temp_status[i] = fscpos_read_value(client,
-						FSCPOS_REG_TEMP_STATE[i]);
-			data->fan_act[i] = fscpos_read_value(client,
-						FSCPOS_REG_FAN_ACT[i]);
-			data->fan_status[i] = fscpos_read_value(client,
-						FSCPOS_REG_FAN_STATE[i]);
-			data->fan_ripple[i] = fscpos_read_value(client,
-						FSCPOS_REG_FAN_RIPPLE[i]);
-			if (i < 2) {
-				/* fan2_min is not supported by the chip */
-				data->pwm[i] = fscpos_read_value(client,
-							FSCPOS_REG_PWM[i]);
-			}
-			/* reset fan status if speed is back to > 0 */
-			if (data->fan_status[i] != 0 && data->fan_act[i] > 0) {
-				reset_fan_alarm(client, i);
-			}
-		}
-
-		data->volt[0] = fscpos_read_value(client, FSCPOS_REG_VOLT_12);
-		data->volt[1] = fscpos_read_value(client, FSCPOS_REG_VOLT_5);
-		data->volt[2] = fscpos_read_value(client, FSCPOS_REG_VOLT_BATT);
-
-		data->wdog_preset = fscpos_read_value(client,
-							FSCPOS_REG_WDOG_PRESET);
-		data->wdog_state = fscpos_read_value(client,
-							FSCPOS_REG_WDOG_STATE);
-		data->wdog_control = fscpos_read_value(client,
-						FSCPOS_REG_WDOG_CONTROL);
-
-		data->global_event = fscpos_read_value(client,
-						FSCPOS_REG_EVENT_STATE);
-
-		data->last_updated = jiffies;
-		data->valid = 1;
-	}
-	mutex_unlock(&data->update_lock);
-	return data;
-}
-
-static int __init sm_fscpos_init(void)
-{
-	return i2c_add_driver(&fscpos_driver);
-}
-
-static void __exit sm_fscpos_exit(void)
-{
-	i2c_del_driver(&fscpos_driver);
-}
-
-MODULE_AUTHOR("Stefan Ott <stefan@desire.ch> based on work from Hermann Jung "
-				"<hej@odn.de>, Frodo Looijaard <frodol@dds.nl>"
-				" and Philip Edelbrock <phil@netroedge.com>");
-MODULE_DESCRIPTION("fujitsu siemens poseidon chip driver");
-MODULE_LICENSE("GPL");
-
-module_init(sm_fscpos_init);
-module_exit(sm_fscpos_exit);
diff --git a/drivers/hwmon/ltc4215.c b/drivers/hwmon/ltc4215.c
index 9386e2a3921..6c9a04136e0 100644
--- a/drivers/hwmon/ltc4215.c
+++ b/drivers/hwmon/ltc4215.c
@@ -259,7 +259,7 @@ static int ltc4215_probe(struct i2c_client *client,
 	mutex_init(&data->update_lock);
 
 	/* Initialize the LTC4215 chip */
-	/* TODO */
+	i2c_smbus_write_byte_data(client, LTC4215_FAULT, 0x00);
 
 	/* Register sysfs hooks */
 	ret = sysfs_create_group(&client->dev.kobj, &ltc4215_group);
diff --git a/drivers/hwmon/ltc4245.c b/drivers/hwmon/ltc4245.c
index 034b2c51584..e3896433361 100644
--- a/drivers/hwmon/ltc4245.c
+++ b/drivers/hwmon/ltc4245.c
@@ -382,7 +382,8 @@ static int ltc4245_probe(struct i2c_client *client,
 	mutex_init(&data->update_lock);
 
 	/* Initialize the LTC4245 chip */
-	/* TODO */
+	i2c_smbus_write_byte_data(client, LTC4245_FAULT1, 0x00);
+	i2c_smbus_write_byte_data(client, LTC4245_FAULT2, 0x00);
 
 	/* Register sysfs hooks */
 	ret = sysfs_create_group(&client->dev.kobj, &ltc4245_group);
diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c
index 949c97ff57e..1f20a042a4f 100644
--- a/drivers/idle/i7300_idle.c
+++ b/drivers/idle/i7300_idle.c
@@ -29,8 +29,8 @@
 
 #include <asm/idle.h>
 
-#include "../dma/ioatdma_hw.h"
-#include "../dma/ioatdma_registers.h"
+#include "../dma/ioat/hw.h"
+#include "../dma/ioat/registers.h"
 
 #define I7300_IDLE_DRIVER_VERSION	"1.55"
 #define I7300_PRINT			"i7300_idle:"
@@ -126,9 +126,9 @@ static void i7300_idle_ioat_stop(void)
 		udelay(10);
 
 		sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
-			IOAT_CHANSTS_DMA_TRANSFER_STATUS;
+			IOAT_CHANSTS_STATUS;
 
-		if (sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE)
+		if (sts != IOAT_CHANSTS_ACTIVE)
 			break;
 
 	}
@@ -160,9 +160,9 @@ static int __init i7300_idle_ioat_selftest(u8 *ctl,
 	udelay(1000);
 
 	chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
-			IOAT_CHANSTS_DMA_TRANSFER_STATUS;
+			IOAT_CHANSTS_STATUS;
 
-	if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE) {
+	if (chan_sts != IOAT_CHANSTS_DONE) {
 		/* Not complete, reset the channel */
 		writeb(IOAT_CHANCMD_RESET,
 		       ioat_chanbase + IOAT1_CHANCMD_OFFSET);
@@ -288,9 +288,9 @@ static void __exit i7300_idle_ioat_exit(void)
 		       ioat_chanbase + IOAT1_CHANCMD_OFFSET);
 
 		chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
-			IOAT_CHANSTS_DMA_TRANSFER_STATUS;
+			IOAT_CHANSTS_STATUS;
 
-		if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) {
+		if (chan_sts != IOAT_CHANSTS_ACTIVE) {
 			writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET);
 			break;
 		}
@@ -298,14 +298,14 @@ static void __exit i7300_idle_ioat_exit(void)
 	}
 
 	chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
-			IOAT_CHANSTS_DMA_TRANSFER_STATUS;
+			IOAT_CHANSTS_STATUS;
 
 	/*
 	 * We tried to reset multiple times. If IO A/T channel is still active
 	 * flag an error and return without cleanup. Memory leak is better
 	 * than random corruption in that extreme error situation.
 	 */
-	if (chan_sts == IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) {
+	if (chan_sts == IOAT_CHANSTS_ACTIVE) {
 		printk(KERN_ERR I7300_PRINT "Unable to stop IO A/T channels."
 			" Not freeing resources\n");
 		return;
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 556539d617a..e828aab7dac 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/types.h>
 #include <linux/input.h>
 #include <linux/module.h>
 #include <linux/random.h>
@@ -514,7 +515,7 @@ static void input_disconnect_device(struct input_dev *dev)
 	 * that there are no threads in the middle of input_open_device()
 	 */
 	mutex_lock(&dev->mutex);
-	dev->going_away = 1;
+	dev->going_away = true;
 	mutex_unlock(&dev->mutex);
 
 	spin_lock_irq(&dev->event_lock);
@@ -1259,10 +1260,71 @@ static int input_dev_uevent(struct device *device, struct kobj_uevent_env *env)
 	return 0;
 }
 
+#define INPUT_DO_TOGGLE(dev, type, bits, on)			\
+	do {							\
+		int i;						\
+		if (!test_bit(EV_##type, dev->evbit))		\
+			break;					\
+		for (i = 0; i < type##_MAX; i++) {		\
+			if (!test_bit(i, dev->bits##bit) ||	\
+			    !test_bit(i, dev->bits))		\
+				continue;			\
+			dev->event(dev, EV_##type, i, on);	\
+		}						\
+	} while (0)
+
+static void input_dev_reset(struct input_dev *dev, bool activate)
+{
+	if (!dev->event)
+		return;
+
+	INPUT_DO_TOGGLE(dev, LED, led, activate);
+	INPUT_DO_TOGGLE(dev, SND, snd, activate);
+
+	if (activate && test_bit(EV_REP, dev->evbit)) {
+		dev->event(dev, EV_REP, REP_PERIOD, dev->rep[REP_PERIOD]);
+		dev->event(dev, EV_REP, REP_DELAY, dev->rep[REP_DELAY]);
+	}
+}
+
+#ifdef CONFIG_PM
+static int input_dev_suspend(struct device *dev)
+{
+	struct input_dev *input_dev = to_input_dev(dev);
+
+	mutex_lock(&input_dev->mutex);
+	input_dev_reset(input_dev, false);
+	mutex_unlock(&input_dev->mutex);
+
+	return 0;
+}
+
+static int input_dev_resume(struct device *dev)
+{
+	struct input_dev *input_dev = to_input_dev(dev);
+
+	mutex_lock(&input_dev->mutex);
+	input_dev_reset(input_dev, true);
+	mutex_unlock(&input_dev->mutex);
+
+	return 0;
+}
+
+static const struct dev_pm_ops input_dev_pm_ops = {
+	.suspend	= input_dev_suspend,
+	.resume		= input_dev_resume,
+	.poweroff	= input_dev_suspend,
+	.restore	= input_dev_resume,
+};
+#endif /* CONFIG_PM */
+
 static struct device_type input_dev_type = {
 	.groups		= input_dev_attr_groups,
 	.release	= input_dev_release,
 	.uevent		= input_dev_uevent,
+#ifdef CONFIG_PM
+	.pm		= &input_dev_pm_ops,
+#endif
 };
 
 static char *input_devnode(struct device *dev, mode_t *mode)
diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
index 3525c19be42..ee98b1bc5d8 100644
--- a/drivers/input/keyboard/Kconfig
+++ b/drivers/input/keyboard/Kconfig
@@ -24,6 +24,16 @@ config KEYBOARD_AAED2000
 	  To compile this driver as a module, choose M here: the
 	  module will be called aaed2000_kbd.
 
+config KEYBOARD_ADP5588
+	tristate "ADP5588 I2C QWERTY Keypad and IO Expander"
+	depends on I2C
+	help
+	  Say Y here if you want to use a ADP5588 attached to your
+	  system I2C bus.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called adp5588-keys.
+
 config KEYBOARD_AMIGA
 	tristate "Amiga keyboard"
 	depends on AMIGA
@@ -104,6 +114,16 @@ config KEYBOARD_ATKBD_RDI_KEYCODES
 	  right-hand column will be interpreted as the key shown in the
 	  left-hand column.
 
+config QT2160
+	tristate "Atmel AT42QT2160 Touch Sensor Chip"
+	depends on I2C && EXPERIMENTAL
+	help
+	  If you say yes here you get support for Atmel AT42QT2160 Touch
+	  Sensor chip as a keyboard input.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called qt2160.
+
 config KEYBOARD_BFIN
 	tristate "Blackfin BF54x keypad support"
 	depends on (BF54x && !BF544)
@@ -251,6 +271,17 @@ config KEYBOARD_MAPLE
 	  To compile this driver as a module, choose M here: the
 	  module will be called maple_keyb.
 
+config KEYBOARD_MAX7359
+	tristate "Maxim MAX7359 Key Switch Controller"
+	depends on I2C
+	help
+	  If you say yes here you get support for the Maxim MAX7359 Key
+	  Switch Controller chip. This providers microprocessors with
+	  management of up to 64 key switches
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called max7359_keypad.
+
 config KEYBOARD_NEWTON
 	tristate "Newton keyboard"
 	select SERIO
@@ -260,6 +291,15 @@ config KEYBOARD_NEWTON
 	  To compile this driver as a module, choose M here: the
 	  module will be called newtonkbd.
 
+config KEYBOARD_OPENCORES
+	tristate "OpenCores Keyboard Controller"
+	help
+	  Say Y here if you want to use the OpenCores Keyboard Controller
+	  http://www.opencores.org/project,keyboardcontroller
+
+	  To compile this driver as a module, choose M here; the
+	  module will be called opencores-kbd.
+
 config KEYBOARD_PXA27x
 	tristate "PXA27x/PXA3xx keypad support"
 	depends on PXA27x || PXA3xx
diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile
index 8a7a22b3026..babad5e58b7 100644
--- a/drivers/input/keyboard/Makefile
+++ b/drivers/input/keyboard/Makefile
@@ -5,6 +5,7 @@
 # Each configuration option enables a list of files.
 
 obj-$(CONFIG_KEYBOARD_AAED2000)		+= aaed2000_kbd.o
+obj-$(CONFIG_KEYBOARD_ADP5588)		+= adp5588-keys.o
 obj-$(CONFIG_KEYBOARD_AMIGA)		+= amikbd.o
 obj-$(CONFIG_KEYBOARD_ATARI)		+= atakbd.o
 obj-$(CONFIG_KEYBOARD_ATKBD)		+= atkbd.o
@@ -21,10 +22,13 @@ obj-$(CONFIG_KEYBOARD_LM8323)		+= lm8323.o
 obj-$(CONFIG_KEYBOARD_LOCOMO)		+= locomokbd.o
 obj-$(CONFIG_KEYBOARD_MAPLE)		+= maple_keyb.o
 obj-$(CONFIG_KEYBOARD_MATRIX)		+= matrix_keypad.o
+obj-$(CONFIG_KEYBOARD_MAX7359)		+= max7359_keypad.o
 obj-$(CONFIG_KEYBOARD_NEWTON)		+= newtonkbd.o
 obj-$(CONFIG_KEYBOARD_OMAP)		+= omap-keypad.o
+obj-$(CONFIG_KEYBOARD_OPENCORES)	+= opencores-kbd.o
 obj-$(CONFIG_KEYBOARD_PXA27x)		+= pxa27x_keypad.o
 obj-$(CONFIG_KEYBOARD_PXA930_ROTARY)	+= pxa930_rotary.o
+obj-$(CONFIG_KEYBOARD_QT2160)		+= qt2160.o
 obj-$(CONFIG_KEYBOARD_SH_KEYSC)		+= sh_keysc.o
 obj-$(CONFIG_KEYBOARD_SPITZ)		+= spitzkbd.o
 obj-$(CONFIG_KEYBOARD_STOWAWAY)		+= stowaway.o
diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c
new file mode 100644
index 00000000000..d48c808d592
--- /dev/null
+++ b/drivers/input/keyboard/adp5588-keys.c
@@ -0,0 +1,361 @@
+/*
+ * File: drivers/input/keyboard/adp5588_keys.c
+ * Description:  keypad driver for ADP5588 I2C QWERTY Keypad and IO Expander
+ * Bugs: Enter bugs at http://blackfin.uclinux.org/
+ *
+ * Copyright (C) 2008-2009 Analog Devices Inc.
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/workqueue.h>
+#include <linux/errno.h>
+#include <linux/pm.h>
+#include <linux/platform_device.h>
+#include <linux/input.h>
+#include <linux/i2c.h>
+
+#include <linux/i2c/adp5588.h>
+
+ /* Configuration Register1 */
+#define AUTO_INC	(1 << 7)
+#define GPIEM_CFG	(1 << 6)
+#define OVR_FLOW_M	(1 << 5)
+#define INT_CFG		(1 << 4)
+#define OVR_FLOW_IEN	(1 << 3)
+#define K_LCK_IM	(1 << 2)
+#define GPI_IEN		(1 << 1)
+#define KE_IEN		(1 << 0)
+
+/* Interrupt Status Register */
+#define CMP2_INT	(1 << 5)
+#define CMP1_INT	(1 << 4)
+#define OVR_FLOW_INT	(1 << 3)
+#define K_LCK_INT	(1 << 2)
+#define GPI_INT		(1 << 1)
+#define KE_INT		(1 << 0)
+
+/* Key Lock and Event Counter Register */
+#define K_LCK_EN	(1 << 6)
+#define LCK21		0x30
+#define KEC		0xF
+
+/* Key Event Register xy */
+#define KEY_EV_PRESSED		(1 << 7)
+#define KEY_EV_MASK		(0x7F)
+
+#define KP_SEL(x)		(0xFFFF >> (16 - x))	/* 2^x-1 */
+
+#define KEYP_MAX_EVENT		10
+
+/*
+ * Early pre 4.0 Silicon required to delay readout by at least 25ms,
+ * since the Event Counter Register updated 25ms after the interrupt
+ * asserted.
+ */
+#define WA_DELAYED_READOUT_REVID(rev)		((rev) < 4)
+
+struct adp5588_kpad {
+	struct i2c_client *client;
+	struct input_dev *input;
+	struct delayed_work work;
+	unsigned long delay;
+	unsigned short keycode[ADP5588_KEYMAPSIZE];
+};
+
+static int adp5588_read(struct i2c_client *client, u8 reg)
+{
+	int ret = i2c_smbus_read_byte_data(client, reg);
+
+	if (ret < 0)
+		dev_err(&client->dev, "Read Error\n");
+
+	return ret;
+}
+
+static int adp5588_write(struct i2c_client *client, u8 reg, u8 val)
+{
+	return i2c_smbus_write_byte_data(client, reg, val);
+}
+
+static void adp5588_work(struct work_struct *work)
+{
+	struct adp5588_kpad *kpad = container_of(work,
+						struct adp5588_kpad, work.work);
+	struct i2c_client *client = kpad->client;
+	int i, key, status, ev_cnt;
+
+	status = adp5588_read(client, INT_STAT);
+
+	if (status & OVR_FLOW_INT)	/* Unlikely and should never happen */
+		dev_err(&client->dev, "Event Overflow Error\n");
+
+	if (status & KE_INT) {
+		ev_cnt = adp5588_read(client, KEY_LCK_EC_STAT) & KEC;
+		if (ev_cnt) {
+			for (i = 0; i < ev_cnt; i++) {
+				key = adp5588_read(client, Key_EVENTA + i);
+				input_report_key(kpad->input,
+					kpad->keycode[(key & KEY_EV_MASK) - 1],
+					key & KEY_EV_PRESSED);
+			}
+			input_sync(kpad->input);
+		}
+	}
+	adp5588_write(client, INT_STAT, status); /* Status is W1C */
+}
+
+static irqreturn_t adp5588_irq(int irq, void *handle)
+{
+	struct adp5588_kpad *kpad = handle;
+
+	/*
+	 * use keventd context to read the event fifo registers
+	 * Schedule readout at least 25ms after notification for
+	 * REVID < 4
+	 */
+
+	schedule_delayed_work(&kpad->work, kpad->delay);
+
+	return IRQ_HANDLED;
+}
+
+static int __devinit adp5588_setup(struct i2c_client *client)
+{
+	struct adp5588_kpad_platform_data *pdata = client->dev.platform_data;
+	int i, ret;
+
+	ret = adp5588_write(client, KP_GPIO1, KP_SEL(pdata->rows));
+	ret |= adp5588_write(client, KP_GPIO2, KP_SEL(pdata->cols) & 0xFF);
+	ret |= adp5588_write(client, KP_GPIO3, KP_SEL(pdata->cols) >> 8);
+
+	if (pdata->en_keylock) {
+		ret |= adp5588_write(client, UNLOCK1, pdata->unlock_key1);
+		ret |= adp5588_write(client, UNLOCK2, pdata->unlock_key2);
+		ret |= adp5588_write(client, KEY_LCK_EC_STAT, K_LCK_EN);
+	}
+
+	for (i = 0; i < KEYP_MAX_EVENT; i++)
+		ret |= adp5588_read(client, Key_EVENTA);
+
+	ret |= adp5588_write(client, INT_STAT, CMP2_INT | CMP1_INT |
+					OVR_FLOW_INT | K_LCK_INT |
+					GPI_INT | KE_INT); /* Status is W1C */
+
+	ret |= adp5588_write(client, CFG, INT_CFG | OVR_FLOW_IEN | KE_IEN);
+
+	if (ret < 0) {
+		dev_err(&client->dev, "Write Error\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int __devinit adp5588_probe(struct i2c_client *client,
+					const struct i2c_device_id *id)
+{
+	struct adp5588_kpad *kpad;
+	struct adp5588_kpad_platform_data *pdata = client->dev.platform_data;
+	struct input_dev *input;
+	unsigned int revid;
+	int ret, i;
+	int error;
+
+	if (!i2c_check_functionality(client->adapter,
+					I2C_FUNC_SMBUS_BYTE_DATA)) {
+		dev_err(&client->dev, "SMBUS Byte Data not Supported\n");
+		return -EIO;
+	}
+
+	if (!pdata) {
+		dev_err(&client->dev, "no platform data?\n");
+		return -EINVAL;
+	}
+
+	if (!pdata->rows || !pdata->cols || !pdata->keymap) {
+		dev_err(&client->dev, "no rows, cols or keymap from pdata\n");
+		return -EINVAL;
+	}
+
+	if (pdata->keymapsize != ADP5588_KEYMAPSIZE) {
+		dev_err(&client->dev, "invalid keymapsize\n");
+		return -EINVAL;
+	}
+
+	if (!client->irq) {
+		dev_err(&client->dev, "no IRQ?\n");
+		return -EINVAL;
+	}
+
+	kpad = kzalloc(sizeof(*kpad), GFP_KERNEL);
+	input = input_allocate_device();
+	if (!kpad || !input) {
+		error = -ENOMEM;
+		goto err_free_mem;
+	}
+
+	kpad->client = client;
+	kpad->input = input;
+	INIT_DELAYED_WORK(&kpad->work, adp5588_work);
+
+	ret = adp5588_read(client, DEV_ID);
+	if (ret < 0) {
+		error = ret;
+		goto err_free_mem;
+	}
+
+	revid = (u8) ret & ADP5588_DEVICE_ID_MASK;
+	if (WA_DELAYED_READOUT_REVID(revid))
+		kpad->delay = msecs_to_jiffies(30);
+
+	input->name = client->name;
+	input->phys = "adp5588-keys/input0";
+	input->dev.parent = &client->dev;
+
+	input_set_drvdata(input, kpad);
+
+	input->id.bustype = BUS_I2C;
+	input->id.vendor = 0x0001;
+	input->id.product = 0x0001;
+	input->id.version = revid;
+
+	input->keycodesize = sizeof(kpad->keycode[0]);
+	input->keycodemax = pdata->keymapsize;
+	input->keycode = kpad->keycode;
+
+	memcpy(kpad->keycode, pdata->keymap,
+		pdata->keymapsize * input->keycodesize);
+
+	/* setup input device */
+	__set_bit(EV_KEY, input->evbit);
+
+	if (pdata->repeat)
+		__set_bit(EV_REP, input->evbit);
+
+	for (i = 0; i < input->keycodemax; i++)
+		__set_bit(kpad->keycode[i] & KEY_MAX, input->keybit);
+	__clear_bit(KEY_RESERVED, input->keybit);
+
+	error = input_register_device(input);
+	if (error) {
+		dev_err(&client->dev, "unable to register input device\n");
+		goto err_free_mem;
+	}
+
+	error = request_irq(client->irq, adp5588_irq,
+			    IRQF_TRIGGER_FALLING | IRQF_DISABLED,
+			    client->dev.driver->name, kpad);
+	if (error) {
+		dev_err(&client->dev, "irq %d busy?\n", client->irq);
+		goto err_unreg_dev;
+	}
+
+	error = adp5588_setup(client);
+	if (error)
+		goto err_free_irq;
+
+	device_init_wakeup(&client->dev, 1);
+	i2c_set_clientdata(client, kpad);
+
+	dev_info(&client->dev, "Rev.%d keypad, irq %d\n", revid, client->irq);
+	return 0;
+
+ err_free_irq:
+	free_irq(client->irq, kpad);
+ err_unreg_dev:
+	input_unregister_device(input);
+	input = NULL;
+ err_free_mem:
+	input_free_device(input);
+	kfree(kpad);
+
+	return error;
+}
+
+static int __devexit adp5588_remove(struct i2c_client *client)
+{
+	struct adp5588_kpad *kpad = i2c_get_clientdata(client);
+
+	adp5588_write(client, CFG, 0);
+	free_irq(client->irq, kpad);
+	cancel_delayed_work_sync(&kpad->work);
+	input_unregister_device(kpad->input);
+	i2c_set_clientdata(client, NULL);
+	kfree(kpad);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int adp5588_suspend(struct device *dev)
+{
+	struct adp5588_kpad *kpad = dev_get_drvdata(dev);
+	struct i2c_client *client = kpad->client;
+
+	disable_irq(client->irq);
+	cancel_delayed_work_sync(&kpad->work);
+
+	if (device_may_wakeup(&client->dev))
+		enable_irq_wake(client->irq);
+
+	return 0;
+}
+
+static int adp5588_resume(struct device *dev)
+{
+	struct adp5588_kpad *kpad = dev_get_drvdata(dev);
+	struct i2c_client *client = kpad->client;
+
+	if (device_may_wakeup(&client->dev))
+		disable_irq_wake(client->irq);
+
+	enable_irq(client->irq);
+
+	return 0;
+}
+
+static struct dev_pm_ops adp5588_dev_pm_ops = {
+	.suspend = adp5588_suspend,
+	.resume  = adp5588_resume,
+};
+#endif
+
+static const struct i2c_device_id adp5588_id[] = {
+	{ KBUILD_MODNAME, 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, adp5588_id);
+
+static struct i2c_driver adp5588_driver = {
+	.driver = {
+		.name = KBUILD_MODNAME,
+#ifdef CONFIG_PM
+		.pm   = &adp5588_dev_pm_ops,
+#endif
+	},
+	.probe    = adp5588_probe,
+	.remove   = __devexit_p(adp5588_remove),
+	.id_table = adp5588_id,
+};
+
+static int __init adp5588_init(void)
+{
+	return i2c_add_driver(&adp5588_driver);
+}
+module_init(adp5588_init);
+
+static void __exit adp5588_exit(void)
+{
+	i2c_del_driver(&adp5588_driver);
+}
+module_exit(adp5588_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
+MODULE_DESCRIPTION("ADP5588 Keypad driver");
+MODULE_ALIAS("platform:adp5588-keys");
diff --git a/drivers/input/keyboard/atkbd.c b/drivers/input/keyboard/atkbd.c
index adb09e2ba39..4709e15af60 100644
--- a/drivers/input/keyboard/atkbd.c
+++ b/drivers/input/keyboard/atkbd.c
@@ -773,23 +773,6 @@ static int atkbd_select_set(struct atkbd *atkbd, int target_set, int allow_extra
 static int atkbd_activate(struct atkbd *atkbd)
 {
 	struct ps2dev *ps2dev = &atkbd->ps2dev;
-	unsigned char param[1];
-
-/*
- * Set the LEDs to a defined state.
- */
-
-	param[0] = 0;
-	if (ps2_command(ps2dev, param, ATKBD_CMD_SETLEDS))
-		return -1;
-
-/*
- * Set autorepeat to fastest possible.
- */
-
-	param[0] = 0;
-	if (ps2_command(ps2dev, param, ATKBD_CMD_SETREP))
-		return -1;
 
 /*
  * Enable the keyboard to receive keystrokes.
@@ -1158,14 +1141,6 @@ static int atkbd_reconnect(struct serio *serio)
 			return -1;
 
 		atkbd_activate(atkbd);
-
-/*
- * Restore repeat rate and LEDs (that were reset by atkbd_activate)
- * to pre-resume state
- */
-		if (!atkbd->softrepeat)
-			atkbd_set_repeat_rate(atkbd);
-		atkbd_set_leds(atkbd);
 	}
 
 	atkbd_enable(atkbd);
diff --git a/drivers/input/keyboard/max7359_keypad.c b/drivers/input/keyboard/max7359_keypad.c
new file mode 100644
index 00000000000..3b5b948eba3
--- /dev/null
+++ b/drivers/input/keyboard/max7359_keypad.c
@@ -0,0 +1,330 @@
+/*
+ * max7359_keypad.c - MAX7359 Key Switch Controller Driver
+ *
+ * Copyright (C) 2009 Samsung Electronics
+ * Kim Kyuwon <q1.kim@samsung.com>
+ *
+ * Based on pxa27x_keypad.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Datasheet: http://www.maxim-ic.com/quick_view2.cfm/qv_pk/5456
+ */
+
+#include <linux/module.h>
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/input.h>
+#include <linux/input/matrix_keypad.h>
+
+#define MAX7359_MAX_KEY_ROWS	8
+#define MAX7359_MAX_KEY_COLS	8
+#define MAX7359_MAX_KEY_NUM	(MAX7359_MAX_KEY_ROWS * MAX7359_MAX_KEY_COLS)
+#define MAX7359_ROW_SHIFT	3
+
+/*
+ * MAX7359 registers
+ */
+#define MAX7359_REG_KEYFIFO	0x00
+#define MAX7359_REG_CONFIG	0x01
+#define MAX7359_REG_DEBOUNCE	0x02
+#define MAX7359_REG_INTERRUPT	0x03
+#define MAX7359_REG_PORTS	0x04
+#define MAX7359_REG_KEYREP	0x05
+#define MAX7359_REG_SLEEP	0x06
+
+/*
+ * Configuration register bits
+ */
+#define MAX7359_CFG_SLEEP	(1 << 7)
+#define MAX7359_CFG_INTERRUPT	(1 << 5)
+#define MAX7359_CFG_KEY_RELEASE	(1 << 3)
+#define MAX7359_CFG_WAKEUP	(1 << 1)
+#define MAX7359_CFG_TIMEOUT	(1 << 0)
+
+/*
+ * Autosleep register values (ms)
+ */
+#define MAX7359_AUTOSLEEP_8192	0x01
+#define MAX7359_AUTOSLEEP_4096	0x02
+#define MAX7359_AUTOSLEEP_2048	0x03
+#define MAX7359_AUTOSLEEP_1024	0x04
+#define MAX7359_AUTOSLEEP_512	0x05
+#define MAX7359_AUTOSLEEP_256	0x06
+
+struct max7359_keypad {
+	/* matrix key code map */
+	unsigned short keycodes[MAX7359_MAX_KEY_NUM];
+
+	struct input_dev *input_dev;
+	struct i2c_client *client;
+};
+
+static int max7359_write_reg(struct i2c_client *client, u8 reg, u8 val)
+{
+	int ret = i2c_smbus_write_byte_data(client, reg, val);
+
+	if (ret < 0)
+		dev_err(&client->dev, "%s: reg 0x%x, val 0x%x, err %d\n",
+			__func__, reg, val, ret);
+	return ret;
+}
+
+static int max7359_read_reg(struct i2c_client *client, int reg)
+{
+	int ret = i2c_smbus_read_byte_data(client, reg);
+
+	if (ret < 0)
+		dev_err(&client->dev, "%s: reg 0x%x, err %d\n",
+			__func__, reg, ret);
+	return ret;
+}
+
+static void max7359_build_keycode(struct max7359_keypad *keypad,
+				const struct matrix_keymap_data *keymap_data)
+{
+	struct input_dev *input_dev = keypad->input_dev;
+	int i;
+
+	for (i = 0; i < keymap_data->keymap_size; i++) {
+		unsigned int key = keymap_data->keymap[i];
+		unsigned int row = KEY_ROW(key);
+		unsigned int col = KEY_COL(key);
+		unsigned int scancode = MATRIX_SCAN_CODE(row, col,
+						MAX7359_ROW_SHIFT);
+		unsigned short keycode = KEY_VAL(key);
+
+		keypad->keycodes[scancode] = keycode;
+		__set_bit(keycode, input_dev->keybit);
+	}
+	__clear_bit(KEY_RESERVED, input_dev->keybit);
+}
+
+/* runs in an IRQ thread -- can (and will!) sleep */
+static irqreturn_t max7359_interrupt(int irq, void *dev_id)
+{
+	struct max7359_keypad *keypad = dev_id;
+	struct input_dev *input_dev = keypad->input_dev;
+	int val, row, col, release, code;
+
+	val = max7359_read_reg(keypad->client, MAX7359_REG_KEYFIFO);
+	row = val & 0x7;
+	col = (val >> 3) & 0x7;
+	release = val & 0x40;
+
+	code = MATRIX_SCAN_CODE(row, col, MAX7359_ROW_SHIFT);
+
+	dev_dbg(&keypad->client->dev,
+		"key[%d:%d] %s\n", row, col, release ? "release" : "press");
+
+	input_event(input_dev, EV_MSC, MSC_SCAN, code);
+	input_report_key(input_dev, keypad->keycodes[code], !release);
+	input_sync(input_dev);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Let MAX7359 fall into a deep sleep:
+ * If no keys are pressed, enter sleep mode for 8192 ms. And if any
+ * key is pressed, the MAX7359 returns to normal operating mode.
+ */
+static inline void max7359_fall_deepsleep(struct i2c_client *client)
+{
+	max7359_write_reg(client, MAX7359_REG_SLEEP, MAX7359_AUTOSLEEP_8192);
+}
+
+/*
+ * Let MAX7359 take a catnap:
+ * Autosleep just for 256 ms.
+ */
+static inline void max7359_take_catnap(struct i2c_client *client)
+{
+	max7359_write_reg(client, MAX7359_REG_SLEEP, MAX7359_AUTOSLEEP_256);
+}
+
+static int max7359_open(struct input_dev *dev)
+{
+	struct max7359_keypad *keypad = input_get_drvdata(dev);
+
+	max7359_take_catnap(keypad->client);
+
+	return 0;
+}
+
+static void max7359_close(struct input_dev *dev)
+{
+	struct max7359_keypad *keypad = input_get_drvdata(dev);
+
+	max7359_fall_deepsleep(keypad->client);
+}
+
+static void max7359_initialize(struct i2c_client *client)
+{
+	max7359_write_reg(client, MAX7359_REG_CONFIG,
+		MAX7359_CFG_INTERRUPT | /* Irq clears after host read */
+		MAX7359_CFG_KEY_RELEASE | /* Key release enable */
+		MAX7359_CFG_WAKEUP); /* Key press wakeup enable */
+
+	/* Full key-scan functionality */
+	max7359_write_reg(client, MAX7359_REG_DEBOUNCE, 0x1F);
+
+	/* nINT asserts every debounce cycles */
+	max7359_write_reg(client, MAX7359_REG_INTERRUPT, 0x01);
+
+	max7359_fall_deepsleep(client);
+}
+
+static int __devinit max7359_probe(struct i2c_client *client,
+					const struct i2c_device_id *id)
+{
+	const struct matrix_keymap_data *keymap_data = client->dev.platform_data;
+	struct max7359_keypad *keypad;
+	struct input_dev *input_dev;
+	int ret;
+	int error;
+
+	if (!client->irq) {
+		dev_err(&client->dev, "The irq number should not be zero\n");
+		return -EINVAL;
+	}
+
+	/* Detect MAX7359: The initial Keys FIFO value is '0x3F' */
+	ret = max7359_read_reg(client, MAX7359_REG_KEYFIFO);
+	if (ret < 0) {
+		dev_err(&client->dev, "failed to detect device\n");
+		return -ENODEV;
+	}
+
+	dev_dbg(&client->dev, "keys FIFO is 0x%02x\n", ret);
+
+	keypad = kzalloc(sizeof(struct max7359_keypad), GFP_KERNEL);
+	input_dev = input_allocate_device();
+	if (!keypad || !input_dev) {
+		dev_err(&client->dev, "failed to allocate memory\n");
+		error = -ENOMEM;
+		goto failed_free_mem;
+	}
+
+	keypad->client = client;
+	keypad->input_dev = input_dev;
+
+	input_dev->name = client->name;
+	input_dev->id.bustype = BUS_I2C;
+	input_dev->open = max7359_open;
+	input_dev->close = max7359_close;
+	input_dev->dev.parent = &client->dev;
+
+	input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP);
+	input_dev->keycodesize = sizeof(keypad->keycodes[0]);
+	input_dev->keycodemax = ARRAY_SIZE(keypad->keycodes);
+	input_dev->keycode = keypad->keycodes;
+
+	input_set_capability(input_dev, EV_MSC, MSC_SCAN);
+	input_set_drvdata(input_dev, keypad);
+
+	max7359_build_keycode(keypad, keymap_data);
+
+	error = request_threaded_irq(client->irq, NULL, max7359_interrupt,
+				     IRQF_TRIGGER_LOW | IRQF_ONESHOT,
+				     client->name, keypad);
+	if (error) {
+		dev_err(&client->dev, "failed to register interrupt\n");
+		goto failed_free_mem;
+	}
+
+	/* Register the input device */
+	error = input_register_device(input_dev);
+	if (error) {
+		dev_err(&client->dev, "failed to register input device\n");
+		goto failed_free_irq;
+	}
+
+	/* Initialize MAX7359 */
+	max7359_initialize(client);
+
+	i2c_set_clientdata(client, keypad);
+	device_init_wakeup(&client->dev, 1);
+
+	return 0;
+
+failed_free_irq:
+	free_irq(client->irq, keypad);
+failed_free_mem:
+	input_free_device(input_dev);
+	kfree(keypad);
+	return error;
+}
+
+static int __devexit max7359_remove(struct i2c_client *client)
+{
+	struct max7359_keypad *keypad = i2c_get_clientdata(client);
+
+	free_irq(client->irq, keypad);
+	input_unregister_device(keypad->input_dev);
+	i2c_set_clientdata(client, NULL);
+	kfree(keypad);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int max7359_suspend(struct i2c_client *client, pm_message_t mesg)
+{
+	max7359_fall_deepsleep(client);
+
+	if (device_may_wakeup(&client->dev))
+		enable_irq_wake(client->irq);
+
+	return 0;
+}
+
+static int max7359_resume(struct i2c_client *client)
+{
+	if (device_may_wakeup(&client->dev))
+		disable_irq_wake(client->irq);
+
+	/* Restore the default setting */
+	max7359_take_catnap(client);
+
+	return 0;
+}
+#else
+#define max7359_suspend	NULL
+#define max7359_resume	NULL
+#endif
+
+static const struct i2c_device_id max7359_ids[] = {
+	{ "max7359", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, max7359_ids);
+
+static struct i2c_driver max7359_i2c_driver = {
+	.driver = {
+		.name = "max7359",
+	},
+	.probe		= max7359_probe,
+	.remove		= __devexit_p(max7359_remove),
+	.suspend	= max7359_suspend,
+	.resume		= max7359_resume,
+	.id_table	= max7359_ids,
+};
+
+static int __init max7359_init(void)
+{
+	return i2c_add_driver(&max7359_i2c_driver);
+}
+module_init(max7359_init);
+
+static void __exit max7359_exit(void)
+{
+	i2c_del_driver(&max7359_i2c_driver);
+}
+module_exit(max7359_exit);
+
+MODULE_AUTHOR("Kim Kyuwon <q1.kim@samsung.com>");
+MODULE_DESCRIPTION("MAX7359 Key Switch Controller Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/input/keyboard/opencores-kbd.c b/drivers/input/keyboard/opencores-kbd.c
new file mode 100644
index 00000000000..78cccddbf55
--- /dev/null
+++ b/drivers/input/keyboard/opencores-kbd.c
@@ -0,0 +1,180 @@
+/*
+ * OpenCores Keyboard Controller Driver
+ * http://www.opencores.org/project,keyboardcontroller
+ *
+ * Copyright 2007-2009 HV Sistemas S.L.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+struct opencores_kbd {
+	struct input_dev *input;
+	struct resource *addr_res;
+	void __iomem *addr;
+	int irq;
+	unsigned short keycodes[128];
+};
+
+static irqreturn_t opencores_kbd_isr(int irq, void *dev_id)
+{
+	struct opencores_kbd *opencores_kbd = dev_id;
+	struct input_dev *input = opencores_kbd->input;
+	unsigned char c;
+
+	c = readb(opencores_kbd->addr);
+	input_report_key(input, c & 0x7f, c & 0x80 ? 0 : 1);
+	input_sync(input);
+
+	return IRQ_HANDLED;
+}
+
+static int __devinit opencores_kbd_probe(struct platform_device *pdev)
+{
+	struct input_dev *input;
+	struct opencores_kbd *opencores_kbd;
+	struct resource *res;
+	int irq, i, error;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "missing board memory resource\n");
+		return -EINVAL;
+	}
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(&pdev->dev, "missing board IRQ resource\n");
+		return -EINVAL;
+	}
+
+	opencores_kbd = kzalloc(sizeof(*opencores_kbd), GFP_KERNEL);
+	input = input_allocate_device();
+	if (!opencores_kbd || !input) {
+		dev_err(&pdev->dev, "failed to allocate device structures\n");
+		error = -ENOMEM;
+		goto err_free_mem;
+	}
+
+	opencores_kbd->addr_res = res;
+	res = request_mem_region(res->start, resource_size(res), pdev->name);
+	if (!res) {
+		dev_err(&pdev->dev, "failed to request I/O memory\n");
+		error = -EBUSY;
+		goto err_free_mem;
+	}
+
+	opencores_kbd->addr = ioremap(res->start, resource_size(res));
+	if (!opencores_kbd->addr) {
+		dev_err(&pdev->dev, "failed to remap I/O memory\n");
+		error = -ENXIO;
+		goto err_rel_mem;
+	}
+
+	opencores_kbd->input = input;
+	opencores_kbd->irq = irq;
+
+	input->name = pdev->name;
+	input->phys = "opencores-kbd/input0";
+	input->dev.parent = &pdev->dev;
+
+	input_set_drvdata(input, opencores_kbd);
+
+	input->id.bustype = BUS_HOST;
+	input->id.vendor = 0x0001;
+	input->id.product = 0x0001;
+	input->id.version = 0x0100;
+
+	input->keycode = opencores_kbd->keycodes;
+	input->keycodesize = sizeof(opencores_kbd->keycodes[0]);
+	input->keycodemax = ARRAY_SIZE(opencores_kbd->keycodes);
+
+	__set_bit(EV_KEY, input->evbit);
+
+	for (i = 0; i < ARRAY_SIZE(opencores_kbd->keycodes); i++) {
+		/*
+		 * OpenCores controller happens to have scancodes match
+		 * our KEY_* definitions.
+		 */
+		opencores_kbd->keycodes[i] = i;
+		__set_bit(opencores_kbd->keycodes[i], input->keybit);
+	}
+	__clear_bit(KEY_RESERVED, input->keybit);
+
+	error = request_irq(irq, &opencores_kbd_isr,
+			    IRQF_TRIGGER_RISING, pdev->name, opencores_kbd);
+	if (error) {
+		dev_err(&pdev->dev, "unable to claim irq %d\n", irq);
+		goto err_unmap_mem;
+	}
+
+	error = input_register_device(input);
+	if (error) {
+		dev_err(&pdev->dev, "unable to register input device\n");
+		goto err_free_irq;
+	}
+
+	platform_set_drvdata(pdev, opencores_kbd);
+
+	return 0;
+
+ err_free_irq:
+	free_irq(irq, opencores_kbd);
+ err_unmap_mem:
+	iounmap(opencores_kbd->addr);
+ err_rel_mem:
+	release_mem_region(res->start, resource_size(res));
+ err_free_mem:
+	input_free_device(input);
+	kfree(opencores_kbd);
+
+	return error;
+}
+
+static int __devexit opencores_kbd_remove(struct platform_device *pdev)
+{
+	struct opencores_kbd *opencores_kbd = platform_get_drvdata(pdev);
+
+	free_irq(opencores_kbd->irq, opencores_kbd);
+
+	iounmap(opencores_kbd->addr);
+	release_mem_region(opencores_kbd->addr_res->start,
+		resource_size(opencores_kbd->addr_res));
+	input_unregister_device(opencores_kbd->input);
+	kfree(opencores_kbd);
+
+	platform_set_drvdata(pdev, NULL);
+
+	return 0;
+}
+
+static struct platform_driver opencores_kbd_device_driver = {
+	.probe    = opencores_kbd_probe,
+	.remove   = __devexit_p(opencores_kbd_remove),
+	.driver   = {
+		.name = "opencores-kbd",
+	},
+};
+
+static int __init opencores_kbd_init(void)
+{
+	return platform_driver_register(&opencores_kbd_device_driver);
+}
+module_init(opencores_kbd_init);
+
+static void __exit opencores_kbd_exit(void)
+{
+	platform_driver_unregister(&opencores_kbd_device_driver);
+}
+module_exit(opencores_kbd_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Javier Herrero <jherrero@hvsistemas.es>");
+MODULE_DESCRIPTION("Keyboard driver for OpenCores Keyboard Controller");
diff --git a/drivers/input/keyboard/qt2160.c b/drivers/input/keyboard/qt2160.c
new file mode 100644
index 00000000000..191cc51d6cf
--- /dev/null
+++ b/drivers/input/keyboard/qt2160.c
@@ -0,0 +1,397 @@
+/*
+ *  qt2160.c - Atmel AT42QT2160 Touch Sense Controller
+ *
+ *  Copyright (C) 2009 Raphael Derosso Pereira <raphaelpereira@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/i2c.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/input.h>
+
+#define QT2160_VALID_CHIPID  0x11
+
+#define QT2160_CMD_CHIPID     0
+#define QT2160_CMD_CODEVER    1
+#define QT2160_CMD_GSTAT      2
+#define QT2160_CMD_KEYS3      3
+#define QT2160_CMD_KEYS4      4
+#define QT2160_CMD_SLIDE      5
+#define QT2160_CMD_GPIOS      6
+#define QT2160_CMD_SUBVER     7
+#define QT2160_CMD_CALIBRATE  10
+
+#define QT2160_CYCLE_INTERVAL	(2*HZ)
+
+static unsigned char qt2160_key2code[] = {
+	KEY_0, KEY_1, KEY_2, KEY_3,
+	KEY_4, KEY_5, KEY_6, KEY_7,
+	KEY_8, KEY_9, KEY_A, KEY_B,
+	KEY_C, KEY_D, KEY_E, KEY_F,
+};
+
+struct qt2160_data {
+	struct i2c_client *client;
+	struct input_dev *input;
+	struct delayed_work dwork;
+	spinlock_t lock;        /* Protects canceling/rescheduling of dwork */
+	unsigned short keycodes[ARRAY_SIZE(qt2160_key2code)];
+	u16 key_matrix;
+};
+
+static int qt2160_read_block(struct i2c_client *client,
+			     u8 inireg, u8 *buffer, unsigned int count)
+{
+	int error, idx = 0;
+
+	/*
+	 * Can't use SMBus block data read. Check for I2C functionality to speed
+	 * things up whenever possible. Otherwise we will be forced to read
+	 * sequentially.
+	 */
+	if (i2c_check_functionality(client->adapter, I2C_FUNC_I2C))	{
+
+		error = i2c_smbus_write_byte(client, inireg + idx);
+		if (error) {
+			dev_err(&client->dev,
+				"couldn't send request. Returned %d\n", error);
+			return error;
+		}
+
+		error = i2c_master_recv(client, buffer, count);
+		if (error != count) {
+			dev_err(&client->dev,
+				"couldn't read registers. Returned %d bytes\n", error);
+			return error;
+		}
+	} else {
+
+		while (count--) {
+			int data;
+
+			error = i2c_smbus_write_byte(client, inireg + idx);
+			if (error) {
+				dev_err(&client->dev,
+					"couldn't send request. Returned %d\n", error);
+				return error;
+			}
+
+			data = i2c_smbus_read_byte(client);
+			if (data < 0) {
+				dev_err(&client->dev,
+					"couldn't read register. Returned %d\n", data);
+				return data;
+			}
+
+			buffer[idx++] = data;
+		}
+	}
+
+	return 0;
+}
+
+static int qt2160_get_key_matrix(struct qt2160_data *qt2160)
+{
+	struct i2c_client *client = qt2160->client;
+	struct input_dev *input = qt2160->input;
+	u8 regs[6];
+	u16 old_matrix, new_matrix;
+	int ret, i, mask;
+
+	dev_dbg(&client->dev, "requesting keys...\n");
+
+	/*
+	 * Read all registers from General Status Register
+	 * to GPIOs register
+	 */
+	ret = qt2160_read_block(client, QT2160_CMD_GSTAT, regs, 6);
+	if (ret) {
+		dev_err(&client->dev,
+			"could not perform chip read.\n");
+		return ret;
+	}
+
+	old_matrix = qt2160->key_matrix;
+	qt2160->key_matrix = new_matrix = (regs[2] << 8) | regs[1];
+
+	mask = 0x01;
+	for (i = 0; i < 16; ++i, mask <<= 1) {
+		int keyval = new_matrix & mask;
+
+		if ((old_matrix & mask) != keyval) {
+			input_report_key(input, qt2160->keycodes[i], keyval);
+			dev_dbg(&client->dev, "key %d %s\n",
+				i, keyval ? "pressed" : "released");
+		}
+	}
+
+	input_sync(input);
+
+	return 0;
+}
+
+static irqreturn_t qt2160_irq(int irq, void *_qt2160)
+{
+	struct qt2160_data *qt2160 = _qt2160;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qt2160->lock, flags);
+
+	__cancel_delayed_work(&qt2160->dwork);
+	schedule_delayed_work(&qt2160->dwork, 0);
+
+	spin_unlock_irqrestore(&qt2160->lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+static void qt2160_schedule_read(struct qt2160_data *qt2160)
+{
+	spin_lock_irq(&qt2160->lock);
+	schedule_delayed_work(&qt2160->dwork, QT2160_CYCLE_INTERVAL);
+	spin_unlock_irq(&qt2160->lock);
+}
+
+static void qt2160_worker(struct work_struct *work)
+{
+	struct qt2160_data *qt2160 =
+		container_of(work, struct qt2160_data, dwork.work);
+
+	dev_dbg(&qt2160->client->dev, "worker\n");
+
+	qt2160_get_key_matrix(qt2160);
+
+	/* Avoid device lock up by checking every so often */
+	qt2160_schedule_read(qt2160);
+}
+
+static int __devinit qt2160_read(struct i2c_client *client, u8 reg)
+{
+	int ret;
+
+	ret = i2c_smbus_write_byte(client, reg);
+	if (ret) {
+		dev_err(&client->dev,
+			"couldn't send request. Returned %d\n", ret);
+		return ret;
+	}
+
+	ret = i2c_smbus_read_byte(client);
+	if (ret < 0) {
+		dev_err(&client->dev,
+			"couldn't read register. Returned %d\n", ret);
+		return ret;
+	}
+
+	return ret;
+}
+
+static int __devinit qt2160_write(struct i2c_client *client, u8 reg, u8 data)
+{
+	int error;
+
+	error = i2c_smbus_write_byte(client, reg);
+	if (error) {
+		dev_err(&client->dev,
+			"couldn't send request. Returned %d\n", error);
+		return error;
+	}
+
+	error = i2c_smbus_write_byte(client, data);
+	if (error) {
+		dev_err(&client->dev,
+			"couldn't write data. Returned %d\n", error);
+		return error;
+	}
+
+	return error;
+}
+
+
+static bool __devinit qt2160_identify(struct i2c_client *client)
+{
+	int id, ver, rev;
+
+	/* Read Chid ID to check if chip is valid */
+	id = qt2160_read(client, QT2160_CMD_CHIPID);
+	if (id != QT2160_VALID_CHIPID) {
+		dev_err(&client->dev, "ID %d not supported\n", id);
+		return false;
+	}
+
+	/* Read chip firmware version */
+	ver = qt2160_read(client, QT2160_CMD_CODEVER);
+	if (ver < 0) {
+		dev_err(&client->dev, "could not get firmware version\n");
+		return false;
+	}
+
+	/* Read chip firmware revision */
+	rev = qt2160_read(client, QT2160_CMD_SUBVER);
+	if (rev < 0) {
+		dev_err(&client->dev, "could not get firmware revision\n");
+		return false;
+	}
+
+	dev_info(&client->dev, "AT42QT2160 firmware version %d.%d.%d\n",
+			ver >> 4, ver & 0xf, rev);
+
+	return true;
+}
+
+static int __devinit qt2160_probe(struct i2c_client *client,
+				  const struct i2c_device_id *id)
+{
+	struct qt2160_data *qt2160;
+	struct input_dev *input;
+	int i;
+	int error;
+
+	/* Check functionality */
+	error = i2c_check_functionality(client->adapter,
+			I2C_FUNC_SMBUS_BYTE);
+	if (!error) {
+		dev_err(&client->dev, "%s adapter not supported\n",
+				dev_driver_string(&client->adapter->dev));
+		return -ENODEV;
+	}
+
+	if (!qt2160_identify(client))
+		return -ENODEV;
+
+	/* Chip is valid and active. Allocate structure */
+	qt2160 = kzalloc(sizeof(struct qt2160_data), GFP_KERNEL);
+	input = input_allocate_device();
+	if (!qt2160 || !input) {
+		dev_err(&client->dev, "insufficient memory\n");
+		error = -ENOMEM;
+		goto err_free_mem;
+	}
+
+	qt2160->client = client;
+	qt2160->input = input;
+	INIT_DELAYED_WORK(&qt2160->dwork, qt2160_worker);
+	spin_lock_init(&qt2160->lock);
+
+	input->name = "AT42QT2160 Touch Sense Keyboard";
+	input->id.bustype = BUS_I2C;
+
+	input->keycode = qt2160->keycodes;
+	input->keycodesize = sizeof(qt2160->keycodes[0]);
+	input->keycodemax = ARRAY_SIZE(qt2160_key2code);
+
+	__set_bit(EV_KEY, input->evbit);
+	__clear_bit(EV_REP, input->evbit);
+	for (i = 0; i < ARRAY_SIZE(qt2160_key2code); i++) {
+		qt2160->keycodes[i] = qt2160_key2code[i];
+		__set_bit(qt2160_key2code[i], input->keybit);
+	}
+	__clear_bit(KEY_RESERVED, input->keybit);
+
+	/* Calibrate device */
+	error = qt2160_write(client, QT2160_CMD_CALIBRATE, 1);
+	if (error) {
+		dev_err(&client->dev, "failed to calibrate device\n");
+		goto err_free_mem;
+	}
+
+	if (client->irq) {
+		error = request_irq(client->irq, qt2160_irq,
+				    IRQF_TRIGGER_FALLING, "qt2160", qt2160);
+		if (error) {
+			dev_err(&client->dev,
+				"failed to allocate irq %d\n", client->irq);
+			goto err_free_mem;
+		}
+	}
+
+	error = input_register_device(qt2160->input);
+	if (error) {
+		dev_err(&client->dev,
+			"Failed to register input device\n");
+		goto err_free_irq;
+	}
+
+	i2c_set_clientdata(client, qt2160);
+	qt2160_schedule_read(qt2160);
+
+	return 0;
+
+err_free_irq:
+	if (client->irq)
+		free_irq(client->irq, qt2160);
+err_free_mem:
+	input_free_device(input);
+	kfree(qt2160);
+	return error;
+}
+
+static int __devexit qt2160_remove(struct i2c_client *client)
+{
+	struct qt2160_data *qt2160 = i2c_get_clientdata(client);
+
+	/* Release IRQ so no queue will be scheduled */
+	if (client->irq)
+		free_irq(client->irq, qt2160);
+
+	cancel_delayed_work_sync(&qt2160->dwork);
+
+	input_unregister_device(qt2160->input);
+	kfree(qt2160);
+
+	i2c_set_clientdata(client, NULL);
+	return 0;
+}
+
+static struct i2c_device_id qt2160_idtable[] = {
+	{ "qt2160", 0, },
+	{ }
+};
+
+MODULE_DEVICE_TABLE(i2c, qt2160_idtable);
+
+static struct i2c_driver qt2160_driver = {
+	.driver = {
+		.name	= "qt2160",
+		.owner  = THIS_MODULE,
+	},
+
+	.id_table	= qt2160_idtable,
+	.probe		= qt2160_probe,
+	.remove		= __devexit_p(qt2160_remove),
+};
+
+static int __init qt2160_init(void)
+{
+	return i2c_add_driver(&qt2160_driver);
+}
+module_init(qt2160_init);
+
+static void __exit qt2160_cleanup(void)
+{
+	i2c_del_driver(&qt2160_driver);
+}
+module_exit(qt2160_cleanup);
+
+MODULE_AUTHOR("Raphael Derosso Pereira <raphaelpereira@gmail.com>");
+MODULE_DESCRIPTION("Driver for AT42QT2160 Touch Sensor");
+MODULE_LICENSE("GPL");
diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig
index 76d6751f89a..02f4f8f1db6 100644
--- a/drivers/input/misc/Kconfig
+++ b/drivers/input/misc/Kconfig
@@ -225,6 +225,7 @@ config INPUT_SGI_BTNS
 config INPUT_WINBOND_CIR
 	tristate "Winbond IR remote control"
 	depends on X86 && PNP
+	select NEW_LEDS
 	select LEDS_CLASS
 	select BITREVERSE
 	help
diff --git a/drivers/input/misc/dm355evm_keys.c b/drivers/input/misc/dm355evm_keys.c
index 0918acae584..f2b67dc81d8 100644
--- a/drivers/input/misc/dm355evm_keys.c
+++ b/drivers/input/misc/dm355evm_keys.c
@@ -96,7 +96,13 @@ static struct {
 	{ 0x3169, KEY_PAUSE, },
 };
 
-/* runs in an IRQ thread -- can (and will!) sleep */
+/*
+ * Because we communicate with the MSP430 using I2C, and all I2C calls
+ * in Linux sleep, we use a threaded IRQ handler.  The IRQ itself is
+ * active low, but we go through the GPIO controller so we can trigger
+ * on falling edges and not worry about enabling/disabling the IRQ in
+ * the keypress handling path.
+ */
 static irqreturn_t dm355evm_keys_irq(int irq, void *_keys)
 {
 	struct dm355evm_keys	*keys = _keys;
@@ -171,18 +177,6 @@ static irqreturn_t dm355evm_keys_irq(int irq, void *_keys)
 	return IRQ_HANDLED;
 }
 
-/*
- * Because we communicate with the MSP430 using I2C, and all I2C calls
- * in Linux sleep, we use a threaded IRQ handler.  The IRQ itself is
- * active low, but we go through the GPIO controller so we can trigger
- * on falling edges and not worry about enabling/disabling the IRQ in
- * the keypress handling path.
- */
-static irqreturn_t dm355evm_keys_hardirq(int irq, void *_keys)
-{
-	return IRQ_WAKE_THREAD;
-}
-
 static int dm355evm_setkeycode(struct input_dev *dev, int index, int keycode)
 {
 	u16		old_keycode;
@@ -257,10 +251,8 @@ static int __devinit dm355evm_keys_probe(struct platform_device *pdev)
 
 	/* REVISIT:  flush the event queue? */
 
-	status = request_threaded_irq(keys->irq,
-			dm355evm_keys_hardirq, dm355evm_keys_irq,
-			IRQF_TRIGGER_FALLING,
-			dev_name(&pdev->dev), keys);
+	status = request_threaded_irq(keys->irq, NULL, dm355evm_keys_irq,
+			IRQF_TRIGGER_FALLING, dev_name(&pdev->dev), keys);
 	if (status < 0)
 		goto fail1;
 
diff --git a/drivers/input/mouse/sentelic.c b/drivers/input/mouse/sentelic.c
index 84e2fc04d11..f84cbd97c88 100644
--- a/drivers/input/mouse/sentelic.c
+++ b/drivers/input/mouse/sentelic.c
@@ -92,7 +92,8 @@ static int fsp_reg_read(struct psmouse *psmouse, int reg_addr, int *reg_val)
 	 */
 	ps2_command(ps2dev, NULL, PSMOUSE_CMD_DISABLE);
 	psmouse_set_state(psmouse, PSMOUSE_CMD_MODE);
-	mutex_lock(&ps2dev->cmd_mutex);
+
+	ps2_begin_command(ps2dev);
 
 	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
 		goto out;
@@ -126,7 +127,7 @@ static int fsp_reg_read(struct psmouse *psmouse, int reg_addr, int *reg_val)
 	rc = 0;
 
  out:
-	mutex_unlock(&ps2dev->cmd_mutex);
+	ps2_end_command(ps2dev);
 	ps2_command(ps2dev, NULL, PSMOUSE_CMD_ENABLE);
 	psmouse_set_state(psmouse, PSMOUSE_ACTIVATED);
 	dev_dbg(&ps2dev->serio->dev, "READ REG: 0x%02x is 0x%02x (rc = %d)\n",
@@ -140,7 +141,7 @@ static int fsp_reg_write(struct psmouse *psmouse, int reg_addr, int reg_val)
 	unsigned char v;
 	int rc = -1;
 
-	mutex_lock(&ps2dev->cmd_mutex);
+	ps2_begin_command(ps2dev);
 
 	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
 		goto out;
@@ -179,7 +180,7 @@ static int fsp_reg_write(struct psmouse *psmouse, int reg_addr, int reg_val)
 	rc = 0;
 
  out:
-	mutex_unlock(&ps2dev->cmd_mutex);
+	ps2_end_command(ps2dev);
 	dev_dbg(&ps2dev->serio->dev, "WRITE REG: 0x%02x to 0x%02x (rc = %d)\n",
 		reg_addr, reg_val, rc);
 	return rc;
@@ -214,7 +215,8 @@ static int fsp_page_reg_read(struct psmouse *psmouse, int *reg_val)
 
 	ps2_command(ps2dev, NULL, PSMOUSE_CMD_DISABLE);
 	psmouse_set_state(psmouse, PSMOUSE_CMD_MODE);
-	mutex_lock(&ps2dev->cmd_mutex);
+
+	ps2_begin_command(ps2dev);
 
 	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
 		goto out;
@@ -236,7 +238,7 @@ static int fsp_page_reg_read(struct psmouse *psmouse, int *reg_val)
 	rc = 0;
 
  out:
-	mutex_unlock(&ps2dev->cmd_mutex);
+	ps2_end_command(ps2dev);
 	ps2_command(ps2dev, NULL, PSMOUSE_CMD_ENABLE);
 	psmouse_set_state(psmouse, PSMOUSE_ACTIVATED);
 	dev_dbg(&ps2dev->serio->dev, "READ PAGE REG: 0x%02x (rc = %d)\n",
@@ -250,7 +252,7 @@ static int fsp_page_reg_write(struct psmouse *psmouse, int reg_val)
 	unsigned char v;
 	int rc = -1;
 
-	mutex_lock(&ps2dev->cmd_mutex);
+	ps2_begin_command(ps2dev);
 
 	if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0)
 		goto out;
@@ -275,7 +277,7 @@ static int fsp_page_reg_write(struct psmouse *psmouse, int reg_val)
 	rc = 0;
 
  out:
-	mutex_unlock(&ps2dev->cmd_mutex);
+	ps2_end_command(ps2dev);
 	dev_dbg(&ps2dev->serio->dev, "WRITE PAGE REG: to 0x%02x (rc = %d)\n",
 		reg_val, rc);
 	return rc;
diff --git a/drivers/input/mouse/synaptics_i2c.c b/drivers/input/mouse/synaptics_i2c.c
index eac9fdde7ee..7283c78044a 100644
--- a/drivers/input/mouse/synaptics_i2c.c
+++ b/drivers/input/mouse/synaptics_i2c.c
@@ -203,7 +203,7 @@ MODULE_PARM_DESC(no_filter, "No Filter. Default = 0 (off)");
  * and the irq configuration should be set to Falling Edge Trigger
  */
 /* Control IRQ / Polling option */
-static int polling_req;
+static bool polling_req;
 module_param(polling_req, bool, 0444);
 MODULE_PARM_DESC(polling_req, "Request Polling. Default = 0 (use irq)");
 
@@ -217,6 +217,7 @@ struct synaptics_i2c {
 	struct i2c_client	*client;
 	struct input_dev	*input;
 	struct delayed_work	dwork;
+	spinlock_t		lock;
 	int			no_data_count;
 	int			no_decel_param;
 	int			reduce_report_param;
@@ -366,17 +367,28 @@ static bool synaptics_i2c_get_input(struct synaptics_i2c *touch)
 	return xy_delta || gesture;
 }
 
-static irqreturn_t synaptics_i2c_irq(int irq, void *dev_id)
+static void synaptics_i2c_reschedule_work(struct synaptics_i2c *touch,
+					  unsigned long delay)
 {
-	struct synaptics_i2c *touch = dev_id;
+	unsigned long flags;
+
+	spin_lock_irqsave(&touch->lock, flags);
 
 	/*
-	 * We want to have the work run immediately but it might have
-	 * already been scheduled with a delay, that's why we have to
-	 * cancel it first.
+	 * If work is already scheduled then subsequent schedules will not
+	 * change the scheduled time that's why we have to cancel it first.
 	 */
-	cancel_delayed_work(&touch->dwork);
-	schedule_delayed_work(&touch->dwork, 0);
+	__cancel_delayed_work(&touch->dwork);
+	schedule_delayed_work(&touch->dwork, delay);
+
+	spin_unlock_irqrestore(&touch->lock, flags);
+}
+
+static irqreturn_t synaptics_i2c_irq(int irq, void *dev_id)
+{
+	struct synaptics_i2c *touch = dev_id;
+
+	synaptics_i2c_reschedule_work(touch, 0);
 
 	return IRQ_HANDLED;
 }
@@ -452,7 +464,7 @@ static void synaptics_i2c_work_handler(struct work_struct *work)
 	 * We poll the device once in THREAD_IRQ_SLEEP_SECS and
 	 * if error is detected, we try to reset and reconfigure the touchpad.
 	 */
-	schedule_delayed_work(&touch->dwork, delay);
+	synaptics_i2c_reschedule_work(touch, delay);
 }
 
 static int synaptics_i2c_open(struct input_dev *input)
@@ -465,8 +477,8 @@ static int synaptics_i2c_open(struct input_dev *input)
 		return ret;
 
 	if (polling_req)
-		schedule_delayed_work(&touch->dwork,
-				       msecs_to_jiffies(NO_DATA_SLEEP_MSECS));
+		synaptics_i2c_reschedule_work(touch,
+				msecs_to_jiffies(NO_DATA_SLEEP_MSECS));
 
 	return 0;
 }
@@ -521,6 +533,7 @@ struct synaptics_i2c *synaptics_i2c_touch_create(struct i2c_client *client)
 	touch->scan_rate_param = scan_rate;
 	set_scan_rate(touch, scan_rate);
 	INIT_DELAYED_WORK(&touch->dwork, synaptics_i2c_work_handler);
+	spin_lock_init(&touch->lock);
 
 	return touch;
 }
@@ -535,14 +548,12 @@ static int __devinit synaptics_i2c_probe(struct i2c_client *client,
 	if (!touch)
 		return -ENOMEM;
 
-	i2c_set_clientdata(client, touch);
-
 	ret = synaptics_i2c_reset_config(client);
 	if (ret)
 		goto err_mem_free;
 
 	if (client->irq < 1)
-		polling_req = 1;
+		polling_req = true;
 
 	touch->input = input_allocate_device();
 	if (!touch->input) {
@@ -563,7 +574,7 @@ static int __devinit synaptics_i2c_probe(struct i2c_client *client,
 			dev_warn(&touch->client->dev,
 				  "IRQ request failed: %d, "
 				  "falling back to polling\n", ret);
-			polling_req = 1;
+			polling_req = true;
 			synaptics_i2c_reg_set(touch->client,
 					      INTERRUPT_EN_REG, 0);
 		}
@@ -580,12 +591,14 @@ static int __devinit synaptics_i2c_probe(struct i2c_client *client,
 			 "Input device register failed: %d\n", ret);
 		goto err_input_free;
 	}
+
+	i2c_set_clientdata(client, touch);
+
 	return 0;
 
 err_input_free:
 	input_free_device(touch->input);
 err_mem_free:
-	i2c_set_clientdata(client, NULL);
 	kfree(touch);
 
 	return ret;
@@ -596,7 +609,7 @@ static int __devexit synaptics_i2c_remove(struct i2c_client *client)
 	struct synaptics_i2c *touch = i2c_get_clientdata(client);
 
 	if (!polling_req)
-		free_irq(touch->client->irq, touch);
+		free_irq(client->irq, touch);
 
 	input_unregister_device(touch->input);
 	i2c_set_clientdata(client, NULL);
@@ -627,8 +640,8 @@ static int synaptics_i2c_resume(struct i2c_client *client)
 	if (ret)
 		return ret;
 
-	schedule_delayed_work(&touch->dwork,
-			       msecs_to_jiffies(NO_DATA_SLEEP_MSECS));
+	synaptics_i2c_reschedule_work(touch,
+				msecs_to_jiffies(NO_DATA_SLEEP_MSECS));
 
 	return 0;
 }
diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c
index eb3ff94af58..bc56e52b945 100644
--- a/drivers/input/serio/i8042.c
+++ b/drivers/input/serio/i8042.c
@@ -87,8 +87,22 @@ static bool i8042_bypass_aux_irq_test;
 
 #include "i8042.h"
 
+/*
+ * i8042_lock protects serialization between i8042_command and
+ * the interrupt handler.
+ */
 static DEFINE_SPINLOCK(i8042_lock);
 
+/*
+ * Writers to AUX and KBD ports as well as users issuing i8042_command
+ * directly should acquire i8042_mutex (by means of calling
+ * i8042_lock_chip() and i8042_unlock_ship() helpers) to ensure that
+ * they do not disturb each other (unfortunately in many i8042
+ * implementations write to one of the ports will immediately abort
+ * command that is being processed by another port).
+ */
+static DEFINE_MUTEX(i8042_mutex);
+
 struct i8042_port {
 	struct serio *serio;
 	int irq;
@@ -113,6 +127,18 @@ static struct platform_device *i8042_platform_device;
 
 static irqreturn_t i8042_interrupt(int irq, void *dev_id);
 
+void i8042_lock_chip(void)
+{
+	mutex_lock(&i8042_mutex);
+}
+EXPORT_SYMBOL(i8042_lock_chip);
+
+void i8042_unlock_chip(void)
+{
+	mutex_unlock(&i8042_mutex);
+}
+EXPORT_SYMBOL(i8042_unlock_chip);
+
 /*
  * The i8042_wait_read() and i8042_wait_write functions wait for the i8042 to
  * be ready for reading values from it / writing values to it.
@@ -1161,6 +1187,21 @@ static void __devexit i8042_unregister_ports(void)
 	}
 }
 
+/*
+ * Checks whether port belongs to i8042 controller.
+ */
+bool i8042_check_port_owner(const struct serio *port)
+{
+	int i;
+
+	for (i = 0; i < I8042_NUM_PORTS; i++)
+		if (i8042_ports[i].serio == port)
+			return true;
+
+	return false;
+}
+EXPORT_SYMBOL(i8042_check_port_owner);
+
 static void i8042_free_irqs(void)
 {
 	if (i8042_aux_irq_registered)
diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c
index 3a95b508bf2..769ba65a585 100644
--- a/drivers/input/serio/libps2.c
+++ b/drivers/input/serio/libps2.c
@@ -17,6 +17,7 @@
 #include <linux/interrupt.h>
 #include <linux/input.h>
 #include <linux/serio.h>
+#include <linux/i8042.h>
 #include <linux/init.h>
 #include <linux/libps2.h>
 
@@ -54,6 +55,24 @@ int ps2_sendbyte(struct ps2dev *ps2dev, unsigned char byte, int timeout)
 }
 EXPORT_SYMBOL(ps2_sendbyte);
 
+void ps2_begin_command(struct ps2dev *ps2dev)
+{
+	mutex_lock(&ps2dev->cmd_mutex);
+
+	if (i8042_check_port_owner(ps2dev->serio))
+		i8042_lock_chip();
+}
+EXPORT_SYMBOL(ps2_begin_command);
+
+void ps2_end_command(struct ps2dev *ps2dev)
+{
+	if (i8042_check_port_owner(ps2dev->serio))
+		i8042_unlock_chip();
+
+	mutex_unlock(&ps2dev->cmd_mutex);
+}
+EXPORT_SYMBOL(ps2_end_command);
+
 /*
  * ps2_drain() waits for device to transmit requested number of bytes
  * and discards them.
@@ -66,7 +85,7 @@ void ps2_drain(struct ps2dev *ps2dev, int maxbytes, int timeout)
 		maxbytes = sizeof(ps2dev->cmdbuf);
 	}
 
-	mutex_lock(&ps2dev->cmd_mutex);
+	ps2_begin_command(ps2dev);
 
 	serio_pause_rx(ps2dev->serio);
 	ps2dev->flags = PS2_FLAG_CMD;
@@ -76,7 +95,8 @@ void ps2_drain(struct ps2dev *ps2dev, int maxbytes, int timeout)
 	wait_event_timeout(ps2dev->wait,
 			   !(ps2dev->flags & PS2_FLAG_CMD),
 			   msecs_to_jiffies(timeout));
-	mutex_unlock(&ps2dev->cmd_mutex);
+
+	ps2_end_command(ps2dev);
 }
 EXPORT_SYMBOL(ps2_drain);
 
@@ -237,9 +257,9 @@ int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command)
 {
 	int rc;
 
-	mutex_lock(&ps2dev->cmd_mutex);
+	ps2_begin_command(ps2dev);
 	rc = __ps2_command(ps2dev, param, command);
-	mutex_unlock(&ps2dev->cmd_mutex);
+	ps2_end_command(ps2dev);
 
 	return rc;
 }
diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig
index ab02d72afbf..8cc453c85ea 100644
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -48,8 +48,8 @@ config TOUCHSCREEN_AD7879_I2C
 	select TOUCHSCREEN_AD7879
 	help
 	  Say Y here if you have a touchscreen interface using the
-	  AD7879-1 controller, and your board-specific initialization
-	  code includes that in its table of I2C devices.
+	  AD7879-1/AD7889-1 controller, and your board-specific
+	  initialization code includes that in its table of I2C devices.
 
 	  If unsure, say N (but it's safe to say "Y").
 
@@ -62,7 +62,7 @@ config TOUCHSCREEN_AD7879_SPI
 	select TOUCHSCREEN_AD7879
 	help
 	  Say Y here if you have a touchscreen interface using the
-	  AD7879 controller, and your board-specific initialization
+	  AD7879/AD7889 controller, and your board-specific initialization
 	  code includes that in its table of SPI devices.
 
 	  If unsure, say N (but it's safe to say "Y").
@@ -169,6 +169,17 @@ config TOUCHSCREEN_WACOM_W8001
 	  To compile this driver as a module, choose M here: the
 	  module will be called wacom_w8001.
 
+config TOUCHSCREEN_MCS5000
+	tristate "MELFAS MCS-5000 touchscreen"
+	depends on I2C
+	help
+	  Say Y here if you have the MELFAS MCS-5000 touchscreen controller
+	  chip in your system.
+
+	  If unsure, say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called mcs5000_ts.
 
 config TOUCHSCREEN_MTOUCH
 	tristate "MicroTouch serial touchscreens"
diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile
index 4599bf7ad81..15fa62cffc7 100644
--- a/drivers/input/touchscreen/Makefile
+++ b/drivers/input/touchscreen/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_TOUCHSCREEN_EETI)		+= eeti_ts.o
 obj-$(CONFIG_TOUCHSCREEN_ELO)		+= elo.o
 obj-$(CONFIG_TOUCHSCREEN_FUJITSU)	+= fujitsu_ts.o
 obj-$(CONFIG_TOUCHSCREEN_INEXIO)	+= inexio.o
+obj-$(CONFIG_TOUCHSCREEN_MCS5000)	+= mcs5000_ts.o
 obj-$(CONFIG_TOUCHSCREEN_MIGOR)		+= migor_ts.o
 obj-$(CONFIG_TOUCHSCREEN_MTOUCH)	+= mtouch.o
 obj-$(CONFIG_TOUCHSCREEN_MK712)		+= mk712.o
diff --git a/drivers/input/touchscreen/ad7879.c b/drivers/input/touchscreen/ad7879.c
index 19b4db7e974..f06332c9e21 100644
--- a/drivers/input/touchscreen/ad7879.c
+++ b/drivers/input/touchscreen/ad7879.c
@@ -1,7 +1,8 @@
 /*
- * Copyright (C) 2008 Michael Hennerich, Analog Devices Inc.
+ * Copyright (C) 2008-2009 Michael Hennerich, Analog Devices Inc.
  *
- * Description:	AD7879 based touchscreen, and GPIO driver (I2C/SPI Interface)
+ * Description:	AD7879/AD7889 based touchscreen, and GPIO driver
+ *		(I2C/SPI Interface)
  *
  * Bugs:        Enter bugs at http://blackfin.uclinux.org/
  *
@@ -747,6 +748,7 @@ static int __devexit ad7879_remove(struct i2c_client *client)
 
 static const struct i2c_device_id ad7879_id[] = {
 	{ "ad7879", 0 },
+	{ "ad7889", 0 },
 	{ }
 };
 MODULE_DEVICE_TABLE(i2c, ad7879_id);
diff --git a/drivers/input/touchscreen/mcs5000_ts.c b/drivers/input/touchscreen/mcs5000_ts.c
new file mode 100644
index 00000000000..4c28b89757f
--- /dev/null
+++ b/drivers/input/touchscreen/mcs5000_ts.c
@@ -0,0 +1,318 @@
+/*
+ * mcs5000_ts.c - Touchscreen driver for MELFAS MCS-5000 controller
+ *
+ * Copyright (C) 2009 Samsung Electronics Co.Ltd
+ * Author: Joonyoung Shim <jy0922.shim@samsung.com>
+ *
+ * Based on wm97xx-core.c
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/i2c.h>
+#include <linux/i2c/mcs5000_ts.h>
+#include <linux/interrupt.h>
+#include <linux/input.h>
+#include <linux/irq.h>
+
+/* Registers */
+#define MCS5000_TS_STATUS		0x00
+#define STATUS_OFFSET			0
+#define STATUS_NO			(0 << STATUS_OFFSET)
+#define STATUS_INIT			(1 << STATUS_OFFSET)
+#define STATUS_SENSING			(2 << STATUS_OFFSET)
+#define STATUS_COORD			(3 << STATUS_OFFSET)
+#define STATUS_GESTURE			(4 << STATUS_OFFSET)
+#define ERROR_OFFSET			4
+#define ERROR_NO			(0 << ERROR_OFFSET)
+#define ERROR_POWER_ON_RESET		(1 << ERROR_OFFSET)
+#define ERROR_INT_RESET			(2 << ERROR_OFFSET)
+#define ERROR_EXT_RESET			(3 << ERROR_OFFSET)
+#define ERROR_INVALID_REG_ADDRESS	(8 << ERROR_OFFSET)
+#define ERROR_INVALID_REG_VALUE		(9 << ERROR_OFFSET)
+
+#define MCS5000_TS_OP_MODE		0x01
+#define RESET_OFFSET			0
+#define RESET_NO			(0 << RESET_OFFSET)
+#define RESET_EXT_SOFT			(1 << RESET_OFFSET)
+#define OP_MODE_OFFSET			1
+#define OP_MODE_SLEEP			(0 << OP_MODE_OFFSET)
+#define OP_MODE_ACTIVE			(1 << OP_MODE_OFFSET)
+#define GESTURE_OFFSET			4
+#define GESTURE_DISABLE			(0 << GESTURE_OFFSET)
+#define GESTURE_ENABLE			(1 << GESTURE_OFFSET)
+#define PROXIMITY_OFFSET		5
+#define PROXIMITY_DISABLE		(0 << PROXIMITY_OFFSET)
+#define PROXIMITY_ENABLE		(1 << PROXIMITY_OFFSET)
+#define SCAN_MODE_OFFSET		6
+#define SCAN_MODE_INTERRUPT		(0 << SCAN_MODE_OFFSET)
+#define SCAN_MODE_POLLING		(1 << SCAN_MODE_OFFSET)
+#define REPORT_RATE_OFFSET		7
+#define REPORT_RATE_40			(0 << REPORT_RATE_OFFSET)
+#define REPORT_RATE_80			(1 << REPORT_RATE_OFFSET)
+
+#define MCS5000_TS_SENS_CTL		0x02
+#define MCS5000_TS_FILTER_CTL		0x03
+#define PRI_FILTER_OFFSET		0
+#define SEC_FILTER_OFFSET		4
+
+#define MCS5000_TS_X_SIZE_UPPER		0x08
+#define MCS5000_TS_X_SIZE_LOWER		0x09
+#define MCS5000_TS_Y_SIZE_UPPER		0x0A
+#define MCS5000_TS_Y_SIZE_LOWER		0x0B
+
+#define MCS5000_TS_INPUT_INFO		0x10
+#define INPUT_TYPE_OFFSET		0
+#define INPUT_TYPE_NONTOUCH		(0 << INPUT_TYPE_OFFSET)
+#define INPUT_TYPE_SINGLE		(1 << INPUT_TYPE_OFFSET)
+#define INPUT_TYPE_DUAL			(2 << INPUT_TYPE_OFFSET)
+#define INPUT_TYPE_PALM			(3 << INPUT_TYPE_OFFSET)
+#define INPUT_TYPE_PROXIMITY		(7 << INPUT_TYPE_OFFSET)
+#define GESTURE_CODE_OFFSET		3
+#define GESTURE_CODE_NO			(0 << GESTURE_CODE_OFFSET)
+
+#define MCS5000_TS_X_POS_UPPER		0x11
+#define MCS5000_TS_X_POS_LOWER		0x12
+#define MCS5000_TS_Y_POS_UPPER		0x13
+#define MCS5000_TS_Y_POS_LOWER		0x14
+#define MCS5000_TS_Z_POS		0x15
+#define MCS5000_TS_WIDTH		0x16
+#define MCS5000_TS_GESTURE_VAL		0x17
+#define MCS5000_TS_MODULE_REV		0x20
+#define MCS5000_TS_FIRMWARE_VER		0x21
+
+/* Touchscreen absolute values */
+#define MCS5000_MAX_XC			0x3ff
+#define MCS5000_MAX_YC			0x3ff
+
+enum mcs5000_ts_read_offset {
+	READ_INPUT_INFO,
+	READ_X_POS_UPPER,
+	READ_X_POS_LOWER,
+	READ_Y_POS_UPPER,
+	READ_Y_POS_LOWER,
+	READ_BLOCK_SIZE,
+};
+
+/* Each client has this additional data */
+struct mcs5000_ts_data {
+	struct i2c_client *client;
+	struct input_dev *input_dev;
+	const struct mcs5000_ts_platform_data *platform_data;
+};
+
+static irqreturn_t mcs5000_ts_interrupt(int irq, void *dev_id)
+{
+	struct mcs5000_ts_data *data = dev_id;
+	struct i2c_client *client = data->client;
+	u8 buffer[READ_BLOCK_SIZE];
+	int err;
+	int x;
+	int y;
+
+	err = i2c_smbus_read_i2c_block_data(client, MCS5000_TS_INPUT_INFO,
+			READ_BLOCK_SIZE, buffer);
+	if (err < 0) {
+		dev_err(&client->dev, "%s, err[%d]\n", __func__, err);
+		goto out;
+	}
+
+	switch (buffer[READ_INPUT_INFO]) {
+	case INPUT_TYPE_NONTOUCH:
+		input_report_key(data->input_dev, BTN_TOUCH, 0);
+		input_sync(data->input_dev);
+		break;
+
+	case INPUT_TYPE_SINGLE:
+		x = (buffer[READ_X_POS_UPPER] << 8) | buffer[READ_X_POS_LOWER];
+		y = (buffer[READ_Y_POS_UPPER] << 8) | buffer[READ_Y_POS_LOWER];
+
+		input_report_key(data->input_dev, BTN_TOUCH, 1);
+		input_report_abs(data->input_dev, ABS_X, x);
+		input_report_abs(data->input_dev, ABS_Y, y);
+		input_sync(data->input_dev);
+		break;
+
+	case INPUT_TYPE_DUAL:
+		/* TODO */
+		break;
+
+	case INPUT_TYPE_PALM:
+		/* TODO */
+		break;
+
+	case INPUT_TYPE_PROXIMITY:
+		/* TODO */
+		break;
+
+	default:
+		dev_err(&client->dev, "Unknown ts input type %d\n",
+				buffer[READ_INPUT_INFO]);
+		break;
+	}
+
+ out:
+	return IRQ_HANDLED;
+}
+
+static void mcs5000_ts_phys_init(struct mcs5000_ts_data *data)
+{
+	const struct mcs5000_ts_platform_data *platform_data =
+		data->platform_data;
+	struct i2c_client *client = data->client;
+
+	/* Touch reset & sleep mode */
+	i2c_smbus_write_byte_data(client, MCS5000_TS_OP_MODE,
+			RESET_EXT_SOFT | OP_MODE_SLEEP);
+
+	/* Touch size */
+	i2c_smbus_write_byte_data(client, MCS5000_TS_X_SIZE_UPPER,
+			platform_data->x_size >> 8);
+	i2c_smbus_write_byte_data(client, MCS5000_TS_X_SIZE_LOWER,
+			platform_data->x_size & 0xff);
+	i2c_smbus_write_byte_data(client, MCS5000_TS_Y_SIZE_UPPER,
+			platform_data->y_size >> 8);
+	i2c_smbus_write_byte_data(client, MCS5000_TS_Y_SIZE_LOWER,
+			platform_data->y_size & 0xff);
+
+	/* Touch active mode & 80 report rate */
+	i2c_smbus_write_byte_data(data->client, MCS5000_TS_OP_MODE,
+			OP_MODE_ACTIVE | REPORT_RATE_80);
+}
+
+static int __devinit mcs5000_ts_probe(struct i2c_client *client,
+		const struct i2c_device_id *id)
+{
+	struct mcs5000_ts_data *data;
+	struct input_dev *input_dev;
+	int ret;
+
+	if (!client->dev.platform_data)
+		return -EINVAL;
+
+	data = kzalloc(sizeof(struct mcs5000_ts_data), GFP_KERNEL);
+	input_dev = input_allocate_device();
+	if (!data || !input_dev) {
+		dev_err(&client->dev, "Failed to allocate memory\n");
+		ret = -ENOMEM;
+		goto err_free_mem;
+	}
+
+	data->client = client;
+	data->input_dev = input_dev;
+	data->platform_data = client->dev.platform_data;
+
+	input_dev->name = "MELPAS MCS-5000 Touchscreen";
+	input_dev->id.bustype = BUS_I2C;
+	input_dev->dev.parent = &client->dev;
+
+	__set_bit(EV_ABS, input_dev->evbit);
+	__set_bit(EV_KEY, input_dev->evbit);
+	__set_bit(BTN_TOUCH, input_dev->keybit);
+	input_set_abs_params(input_dev, ABS_X, 0, MCS5000_MAX_XC, 0, 0);
+	input_set_abs_params(input_dev, ABS_Y, 0, MCS5000_MAX_YC, 0, 0);
+
+	input_set_drvdata(input_dev, data);
+
+	if (data->platform_data->cfg_pin)
+		data->platform_data->cfg_pin();
+
+	ret = request_threaded_irq(client->irq, NULL, mcs5000_ts_interrupt,
+			IRQF_TRIGGER_LOW | IRQF_ONESHOT, "mcs5000_ts", data);
+
+	if (ret < 0) {
+		dev_err(&client->dev, "Failed to register interrupt\n");
+		goto err_free_mem;
+	}
+
+	ret = input_register_device(data->input_dev);
+	if (ret < 0)
+		goto err_free_irq;
+
+	mcs5000_ts_phys_init(data);
+	i2c_set_clientdata(client, data);
+
+	return 0;
+
+err_free_irq:
+	free_irq(client->irq, data);
+err_free_mem:
+	input_free_device(input_dev);
+	kfree(data);
+	return ret;
+}
+
+static int __devexit mcs5000_ts_remove(struct i2c_client *client)
+{
+	struct mcs5000_ts_data *data = i2c_get_clientdata(client);
+
+	free_irq(client->irq, data);
+	input_unregister_device(data->input_dev);
+	kfree(data);
+	i2c_set_clientdata(client, NULL);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int mcs5000_ts_suspend(struct i2c_client *client, pm_message_t mesg)
+{
+	/* Touch sleep mode */
+	i2c_smbus_write_byte_data(client, MCS5000_TS_OP_MODE, OP_MODE_SLEEP);
+
+	return 0;
+}
+
+static int mcs5000_ts_resume(struct i2c_client *client)
+{
+	struct mcs5000_ts_data *data = i2c_get_clientdata(client);
+
+	mcs5000_ts_phys_init(data);
+
+	return 0;
+}
+#else
+#define mcs5000_ts_suspend	NULL
+#define mcs5000_ts_resume	NULL
+#endif
+
+static const struct i2c_device_id mcs5000_ts_id[] = {
+	{ "mcs5000_ts", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, mcs5000_ts_id);
+
+static struct i2c_driver mcs5000_ts_driver = {
+	.probe		= mcs5000_ts_probe,
+	.remove		= __devexit_p(mcs5000_ts_remove),
+	.suspend	= mcs5000_ts_suspend,
+	.resume		= mcs5000_ts_resume,
+	.driver = {
+		.name = "mcs5000_ts",
+	},
+	.id_table	= mcs5000_ts_id,
+};
+
+static int __init mcs5000_ts_init(void)
+{
+	return i2c_add_driver(&mcs5000_ts_driver);
+}
+
+static void __exit mcs5000_ts_exit(void)
+{
+	i2c_del_driver(&mcs5000_ts_driver);
+}
+
+module_init(mcs5000_ts_init);
+module_exit(mcs5000_ts_exit);
+
+/* Module information */
+MODULE_AUTHOR("Joonyoung Shim <jy0922.shim@samsung.com>");
+MODULE_DESCRIPTION("Touchscreen driver for MELFAS MCS-5000 controller");
+MODULE_LICENSE("GPL");
diff --git a/drivers/leds/leds-clevo-mail.c b/drivers/leds/leds-clevo-mail.c
index 1813c84ea5f..f2242db5401 100644
--- a/drivers/leds/leds-clevo-mail.c
+++ b/drivers/leds/leds-clevo-mail.c
@@ -93,6 +93,8 @@ static struct dmi_system_id __initdata mail_led_whitelist[] = {
 static void clevo_mail_led_set(struct led_classdev *led_cdev,
 				enum led_brightness value)
 {
+	i8042_lock_chip();
+
 	if (value == LED_OFF)
 		i8042_command(NULL, CLEVO_MAIL_LED_OFF);
 	else if (value <= LED_HALF)
@@ -100,6 +102,8 @@ static void clevo_mail_led_set(struct led_classdev *led_cdev,
 	else
 		i8042_command(NULL, CLEVO_MAIL_LED_BLINK_1HZ);
 
+	i8042_unlock_chip();
+
 }
 
 static int clevo_mail_led_blink(struct led_classdev *led_cdev,
@@ -108,6 +112,8 @@ static int clevo_mail_led_blink(struct led_classdev *led_cdev,
 {
 	int status = -EINVAL;
 
+	i8042_lock_chip();
+
 	if (*delay_on == 0 /* ms */ && *delay_off == 0 /* ms */) {
 		/* Special case: the leds subsystem requested us to
 		 * chose one user friendly blinking of the LED, and
@@ -135,6 +141,8 @@ static int clevo_mail_led_blink(struct led_classdev *led_cdev,
 		       *delay_on, *delay_off);
 	}
 
+	i8042_unlock_chip();
+
 	return status;
 }
 
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 020f9573fd8..2158377a135 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -124,6 +124,8 @@ config MD_RAID456
 	select MD_RAID6_PQ
 	select ASYNC_MEMCPY
 	select ASYNC_XOR
+	select ASYNC_PQ
+	select ASYNC_RAID6_RECOV
 	---help---
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
 	  the capacity of C * (N - 1) MB, and protects against a failure
@@ -152,9 +154,33 @@ config MD_RAID456
 
 	  If unsure, say Y.
 
+config MULTICORE_RAID456
+	bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
+	depends on MD_RAID456
+	depends on SMP
+	depends on EXPERIMENTAL
+	---help---
+	  Enable the raid456 module to dispatch per-stripe raid operations to a
+	  thread pool.
+
+	  If unsure, say N.
+
 config MD_RAID6_PQ
 	tristate
 
+config ASYNC_RAID6_TEST
+	tristate "Self test for hardware accelerated raid6 recovery"
+	depends on MD_RAID6_PQ
+	select ASYNC_RAID6_RECOV
+	---help---
+	  This is a one-shot self test that permutes through the
+	  recovery of all the possible two disk failure scenarios for a
+	  N-disk array.  Recovery is performed with the asynchronous
+	  raid6 recovery routines, and will optionally use an offload
+	  engine if one is available.
+
+	  If unsure, say N.
+
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 3319c2fec28..6986b0059d2 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -108,6 +108,8 @@ static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
  * allocated while we're using it
  */
 static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create)
+__releases(bitmap->lock)
+__acquires(bitmap->lock)
 {
 	unsigned char *mappage;
 
@@ -325,7 +327,6 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 	return 0;
 
  bad_alignment:
-	rcu_read_unlock();
 	return -EINVAL;
 }
 
@@ -1207,6 +1208,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
 					    sector_t offset, int *blocks,
 					    int create)
+__releases(bitmap->lock)
+__acquires(bitmap->lock)
 {
 	/* If 'create', we might release the lock and reclaim it.
 	 * The lock must have been taken with interrupts enabled.
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ea484290544..1ceceb334d5 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -108,6 +108,9 @@ static int linear_congested(void *data, int bits)
 	linear_conf_t *conf;
 	int i, ret = 0;
 
+	if (mddev_congested(mddev, bits))
+		return 1;
+
 	rcu_read_lock();
 	conf = rcu_dereference(mddev->private);
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6aa497e4baf..26ba42a7912 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -262,6 +262,12 @@ static void mddev_resume(mddev_t *mddev)
 	mddev->pers->quiesce(mddev, 0);
 }
 
+int mddev_congested(mddev_t *mddev, int bits)
+{
+	return mddev->suspended;
+}
+EXPORT_SYMBOL(mddev_congested);
+
 
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
@@ -4218,7 +4224,7 @@ static int do_md_run(mddev_t * mddev)
 			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 			mddev->sync_thread = md_register_thread(md_do_sync,
 								mddev,
-								"%s_resync");
+								"resync");
 			if (!mddev->sync_thread) {
 				printk(KERN_ERR "%s: could not start resync"
 				       " thread...\n",
@@ -4575,10 +4581,10 @@ static int get_version(void __user * arg)
 static int get_array_info(mddev_t * mddev, void __user * arg)
 {
 	mdu_array_info_t info;
-	int nr,working,active,failed,spare;
+	int nr,working,insync,failed,spare;
 	mdk_rdev_t *rdev;
 
-	nr=working=active=failed=spare=0;
+	nr=working=insync=failed=spare=0;
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		nr++;
 		if (test_bit(Faulty, &rdev->flags))
@@ -4586,7 +4592,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
 		else {
 			working++;
 			if (test_bit(In_sync, &rdev->flags))
-				active++;	
+				insync++;	
 			else
 				spare++;
 		}
@@ -4611,7 +4617,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
 		info.state = (1<<MD_SB_CLEAN);
 	if (mddev->bitmap && mddev->bitmap_offset)
 		info.state = (1<<MD_SB_BITMAP_PRESENT);
-	info.active_disks  = active;
+	info.active_disks  = insync;
 	info.working_disks = working;
 	info.failed_disks  = failed;
 	info.spare_disks   = spare;
@@ -4721,7 +4727,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		if (!list_empty(&mddev->disks)) {
 			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
 							mdk_rdev_t, same_set);
-			int err = super_types[mddev->major_version]
+			err = super_types[mddev->major_version]
 				.load_super(rdev, rdev0, mddev->minor_version);
 			if (err < 0) {
 				printk(KERN_WARNING 
@@ -5631,7 +5637,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 	thread->run = run;
 	thread->mddev = mddev;
 	thread->timeout = MAX_SCHEDULE_TIMEOUT;
-	thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
+	thread->tsk = kthread_run(md_thread, thread,
+				  "%s_%s",
+				  mdname(thread->mddev),
+				  name ?: mddev->pers->name);
 	if (IS_ERR(thread->tsk)) {
 		kfree(thread);
 		return NULL;
@@ -6745,7 +6754,7 @@ void md_check_recovery(mddev_t *mddev)
 			}
 			mddev->sync_thread = md_register_thread(md_do_sync,
 								mddev,
-								"%s_resync");
+								"resync");
 			if (!mddev->sync_thread) {
 				printk(KERN_ERR "%s: could not start resync"
 					" thread...\n", 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f55d2ff9513..f184b69ef33 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -430,6 +430,7 @@ extern void md_write_end(mddev_t *mddev);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
 
+extern int mddev_congested(mddev_t *mddev, int bits);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index d2d3fd54cc6..ee7646f974a 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -150,7 +150,6 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 	}
 
 	mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
-	memset(mp_bh, 0, sizeof(*mp_bh));
 
 	mp_bh->master_bio = bio;
 	mp_bh->mddev = mddev;
@@ -199,6 +198,9 @@ static int multipath_congested(void *data, int bits)
 	multipath_conf_t *conf = mddev->private;
 	int i, ret = 0;
 
+	if (mddev_congested(mddev, bits))
+		return 1;
+
 	rcu_read_lock();
 	for (i = 0; i < mddev->raid_disks ; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
@@ -504,7 +506,7 @@ static int multipath_run (mddev_t *mddev)
 	}
 
 	{
-		mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath");
+		mddev->thread = md_register_thread(multipathd, mddev, NULL);
 		if (!mddev->thread) {
 			printk(KERN_ERR "multipath: couldn't allocate thread"
 				" for %s\n", mdname(mddev));
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f845ed98fec..d3a4ce06015 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -44,6 +44,9 @@ static int raid0_congested(void *data, int bits)
 	mdk_rdev_t **devlist = conf->devlist;
 	int i, ret = 0;
 
+	if (mddev_congested(mddev, bits))
+		return 1;
+
 	for (i = 0; i < mddev->raid_disks && !ret ; i++) {
 		struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
 
@@ -86,7 +89,7 @@ static void dump_zones(mddev_t *mddev)
 
 static int create_strip_zones(mddev_t *mddev)
 {
-	int i, c, j, err;
+	int i, c, err;
 	sector_t curr_zone_end, sectors;
 	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev;
 	struct strip_zone *zone;
@@ -198,6 +201,8 @@ static int create_strip_zones(mddev_t *mddev)
 	/* now do the other zones */
 	for (i = 1; i < conf->nr_strip_zones; i++)
 	{
+		int j;
+
 		zone = conf->strip_zone + i;
 		dev = conf->devlist + i * mddev->raid_disks;
 
@@ -207,7 +212,6 @@ static int create_strip_zones(mddev_t *mddev)
 		c = 0;
 
 		for (j=0; j<cnt; j++) {
-			char b[BDEVNAME_SIZE];
 			rdev = conf->devlist[j];
 			printk(KERN_INFO "raid0: checking %s ...",
 				bdevname(rdev->bdev, b));
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ff7ed333599..d1b9bd5fd4f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -576,6 +576,9 @@ static int raid1_congested(void *data, int bits)
 	conf_t *conf = mddev->private;
 	int i, ret = 0;
 
+	if (mddev_congested(mddev, bits))
+		return 1;
+
 	rcu_read_lock();
 	for (i = 0; i < mddev->raid_disks; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
@@ -851,7 +854,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
 		read_bio->bi_end_io = raid1_end_read_request;
-		read_bio->bi_rw = READ | do_sync;
+		read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
 		read_bio->bi_private = r1_bio;
 
 		generic_make_request(read_bio);
@@ -943,7 +946,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
 		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_barriers | do_sync;
+		mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) |
+			(do_sync << BIO_RW_SYNCIO);
 		mbio->bi_private = r1_bio;
 
 		if (behind_pages) {
@@ -1623,7 +1627,8 @@ static void raid1d(mddev_t *mddev)
 						conf->mirrors[i].rdev->data_offset;
 					bio->bi_bdev = conf->mirrors[i].rdev->bdev;
 					bio->bi_end_io = raid1_end_write_request;
-					bio->bi_rw = WRITE | do_sync;
+					bio->bi_rw = WRITE |
+						(do_sync << BIO_RW_SYNCIO);
 					bio->bi_private = r1_bio;
 					r1_bio->bios[i] = bio;
 					generic_make_request(bio);
@@ -1672,7 +1677,7 @@ static void raid1d(mddev_t *mddev)
 				bio->bi_sector = r1_bio->sector + rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_end_io = raid1_end_read_request;
-				bio->bi_rw = READ | do_sync;
+				bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
 				bio->bi_private = r1_bio;
 				unplug = 1;
 				generic_make_request(bio);
@@ -2047,7 +2052,7 @@ static int run(mddev_t *mddev)
 	conf->last_used = j;
 
 
-	mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
+	mddev->thread = md_register_thread(raid1d, mddev, NULL);
 	if (!mddev->thread) {
 		printk(KERN_ERR
 		       "raid1: couldn't allocate thread for %s\n",
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d0a2152e064..51c4c5c4d87 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -631,6 +631,8 @@ static int raid10_congested(void *data, int bits)
 	conf_t *conf = mddev->private;
 	int i, ret = 0;
 
+	if (mddev_congested(mddev, bits))
+		return 1;
 	rcu_read_lock();
 	for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
@@ -882,7 +884,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 			mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
 		read_bio->bi_end_io = raid10_end_read_request;
-		read_bio->bi_rw = READ | do_sync;
+		read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
 		read_bio->bi_private = r10_bio;
 
 		generic_make_request(read_bio);
@@ -950,7 +952,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 			conf->mirrors[d].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		mbio->bi_end_io	= raid10_end_write_request;
-		mbio->bi_rw = WRITE | do_sync;
+		mbio->bi_rw = WRITE | (do_sync << BIO_RW_SYNCIO);
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
@@ -1623,7 +1625,7 @@ static void raid10d(mddev_t *mddev)
 				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
 					+ rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
-				bio->bi_rw = READ | do_sync;
+				bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = raid10_end_read_request;
 				unplug = 1;
@@ -1773,7 +1775,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
 	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* recovery... the complicated one */
-		int i, j, k;
+		int j, k;
 		r10_bio = NULL;
 
 		for (i=0 ; i<conf->raid_disks; i++)
@@ -2188,7 +2190,7 @@ static int run(mddev_t *mddev)
 	}
 
 
-	mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10");
+	mddev->thread = md_register_thread(raid10d, mddev, NULL);
 	if (!mddev->thread) {
 		printk(KERN_ERR
 		       "raid10: couldn't allocate thread for %s\n",
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 826eb346735..94829804ab7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -47,7 +47,9 @@
 #include <linux/kthread.h>
 #include <linux/raid/pq.h>
 #include <linux/async_tx.h>
+#include <linux/async.h>
 #include <linux/seq_file.h>
+#include <linux/cpu.h>
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
@@ -499,11 +501,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
 	struct page *bio_page;
 	int i;
 	int page_offset;
+	struct async_submit_ctl submit;
+	enum async_tx_flags flags = 0;
 
 	if (bio->bi_sector >= sector)
 		page_offset = (signed)(bio->bi_sector - sector) * 512;
 	else
 		page_offset = (signed)(sector - bio->bi_sector) * -512;
+
+	if (frombio)
+		flags |= ASYNC_TX_FENCE;
+	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
+
 	bio_for_each_segment(bvl, bio, i) {
 		int len = bio_iovec_idx(bio, i)->bv_len;
 		int clen;
@@ -525,15 +534,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
 			bio_page = bio_iovec_idx(bio, i)->bv_page;
 			if (frombio)
 				tx = async_memcpy(page, bio_page, page_offset,
-					b_offset, clen,
-					ASYNC_TX_DEP_ACK,
-					tx, NULL, NULL);
+						  b_offset, clen, &submit);
 			else
 				tx = async_memcpy(bio_page, page, b_offset,
-					page_offset, clen,
-					ASYNC_TX_DEP_ACK,
-					tx, NULL, NULL);
+						  page_offset, clen, &submit);
 		}
+		/* chain the operations */
+		submit.depend_tx = tx;
+
 		if (clen < len) /* hit end of page */
 			break;
 		page_offset +=  len;
@@ -592,6 +600,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 {
 	struct dma_async_tx_descriptor *tx = NULL;
 	raid5_conf_t *conf = sh->raid_conf;
+	struct async_submit_ctl submit;
 	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
@@ -615,22 +624,34 @@ static void ops_run_biofill(struct stripe_head *sh)
 	}
 
 	atomic_inc(&sh->count);
-	async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
-		ops_complete_biofill, sh);
+	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
+	async_trigger_callback(&submit);
 }
 
-static void ops_complete_compute5(void *stripe_head_ref)
+static void mark_target_uptodate(struct stripe_head *sh, int target)
 {
-	struct stripe_head *sh = stripe_head_ref;
-	int target = sh->ops.target;
-	struct r5dev *tgt = &sh->dev[target];
+	struct r5dev *tgt;
 
-	pr_debug("%s: stripe %llu\n", __func__,
-		(unsigned long long)sh->sector);
+	if (target < 0)
+		return;
 
+	tgt = &sh->dev[target];
 	set_bit(R5_UPTODATE, &tgt->flags);
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 	clear_bit(R5_Wantcompute, &tgt->flags);
+}
+
+static void ops_complete_compute(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+
+	pr_debug("%s: stripe %llu\n", __func__,
+		(unsigned long long)sh->sector);
+
+	/* mark the computed target(s) as uptodate */
+	mark_target_uptodate(sh, sh->ops.target);
+	mark_target_uptodate(sh, sh->ops.target2);
+
 	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
 	if (sh->check_state == check_state_compute_run)
 		sh->check_state = check_state_compute_result;
@@ -638,16 +659,24 @@ static void ops_complete_compute5(void *stripe_head_ref)
 	release_stripe(sh);
 }
 
-static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
+/* return a pointer to the address conversion region of the scribble buffer */
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
+				 struct raid5_percpu *percpu)
+{
+	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
-	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
-	struct page *xor_srcs[disks];
+	struct page **xor_srcs = percpu->scribble;
 	int target = sh->ops.target;
 	struct r5dev *tgt = &sh->dev[target];
 	struct page *xor_dest = tgt->page;
 	int count = 0;
 	struct dma_async_tx_descriptor *tx;
+	struct async_submit_ctl submit;
 	int i;
 
 	pr_debug("%s: stripe %llu block: %d\n",
@@ -660,17 +689,215 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
 
 	atomic_inc(&sh->count);
 
+	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
+			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
 	if (unlikely(count == 1))
-		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
-			0, NULL, ops_complete_compute5, sh);
+		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 	else
-		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-			ASYNC_TX_XOR_ZERO_DST, NULL,
-			ops_complete_compute5, sh);
+		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 
 	return tx;
 }
 
+/* set_syndrome_sources - populate source buffers for gen_syndrome
+ * @srcs - (struct page *) array of size sh->disks
+ * @sh - stripe_head to parse
+ *
+ * Populates srcs in proper layout order for the stripe and returns the
+ * 'count' of sources to be used in a call to async_gen_syndrome.  The P
+ * destination buffer is recorded in srcs[count] and the Q destination
+ * is recorded in srcs[count+1]].
+ */
+static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
+{
+	int disks = sh->disks;
+	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
+	int d0_idx = raid6_d0(sh);
+	int count;
+	int i;
+
+	for (i = 0; i < disks; i++)
+		srcs[i] = (void *)raid6_empty_zero_page;
+
+	count = 0;
+	i = d0_idx;
+	do {
+		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+		srcs[slot] = sh->dev[i].page;
+		i = raid6_next_disk(i, disks);
+	} while (i != d0_idx);
+	BUG_ON(count != syndrome_disks);
+
+	return count;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+	int disks = sh->disks;
+	struct page **blocks = percpu->scribble;
+	int target;
+	int qd_idx = sh->qd_idx;
+	struct dma_async_tx_descriptor *tx;
+	struct async_submit_ctl submit;
+	struct r5dev *tgt;
+	struct page *dest;
+	int i;
+	int count;
+
+	if (sh->ops.target < 0)
+		target = sh->ops.target2;
+	else if (sh->ops.target2 < 0)
+		target = sh->ops.target;
+	else
+		/* we should only have one valid target */
+		BUG();
+	BUG_ON(target < 0);
+	pr_debug("%s: stripe %llu block: %d\n",
+		__func__, (unsigned long long)sh->sector, target);
+
+	tgt = &sh->dev[target];
+	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+	dest = tgt->page;
+
+	atomic_inc(&sh->count);
+
+	if (target == qd_idx) {
+		count = set_syndrome_sources(blocks, sh);
+		blocks[count] = NULL; /* regenerating p is not necessary */
+		BUG_ON(blocks[count+1] != dest); /* q should already be set */
+		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+				  ops_complete_compute, sh,
+				  to_addr_conv(sh, percpu));
+		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+	} else {
+		/* Compute any data- or p-drive using XOR */
+		count = 0;
+		for (i = disks; i-- ; ) {
+			if (i == target || i == qd_idx)
+				continue;
+			blocks[count++] = sh->dev[i].page;
+		}
+
+		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
+				  NULL, ops_complete_compute, sh,
+				  to_addr_conv(sh, percpu));
+		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
+	}
+
+	return tx;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+	int i, count, disks = sh->disks;
+	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
+	int d0_idx = raid6_d0(sh);
+	int faila = -1, failb = -1;
+	int target = sh->ops.target;
+	int target2 = sh->ops.target2;
+	struct r5dev *tgt = &sh->dev[target];
+	struct r5dev *tgt2 = &sh->dev[target2];
+	struct dma_async_tx_descriptor *tx;
+	struct page **blocks = percpu->scribble;
+	struct async_submit_ctl submit;
+
+	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
+		 __func__, (unsigned long long)sh->sector, target, target2);
+	BUG_ON(target < 0 || target2 < 0);
+	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
+
+	/* we need to open-code set_syndrome_sources to handle the
+	 * slot number conversion for 'faila' and 'failb'
+	 */
+	for (i = 0; i < disks ; i++)
+		blocks[i] = (void *)raid6_empty_zero_page;
+	count = 0;
+	i = d0_idx;
+	do {
+		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+		blocks[slot] = sh->dev[i].page;
+
+		if (i == target)
+			faila = slot;
+		if (i == target2)
+			failb = slot;
+		i = raid6_next_disk(i, disks);
+	} while (i != d0_idx);
+	BUG_ON(count != syndrome_disks);
+
+	BUG_ON(faila == failb);
+	if (failb < faila)
+		swap(faila, failb);
+	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
+		 __func__, (unsigned long long)sh->sector, faila, failb);
+
+	atomic_inc(&sh->count);
+
+	if (failb == syndrome_disks+1) {
+		/* Q disk is one of the missing disks */
+		if (faila == syndrome_disks) {
+			/* Missing P+Q, just recompute */
+			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+					  ops_complete_compute, sh,
+					  to_addr_conv(sh, percpu));
+			return async_gen_syndrome(blocks, 0, count+2,
+						  STRIPE_SIZE, &submit);
+		} else {
+			struct page *dest;
+			int data_target;
+			int qd_idx = sh->qd_idx;
+
+			/* Missing D+Q: recompute D from P, then recompute Q */
+			if (target == qd_idx)
+				data_target = target2;
+			else
+				data_target = target;
+
+			count = 0;
+			for (i = disks; i-- ; ) {
+				if (i == data_target || i == qd_idx)
+					continue;
+				blocks[count++] = sh->dev[i].page;
+			}
+			dest = sh->dev[data_target].page;
+			init_async_submit(&submit,
+					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
+					  NULL, NULL, NULL,
+					  to_addr_conv(sh, percpu));
+			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
+				       &submit);
+
+			count = set_syndrome_sources(blocks, sh);
+			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
+					  ops_complete_compute, sh,
+					  to_addr_conv(sh, percpu));
+			return async_gen_syndrome(blocks, 0, count+2,
+						  STRIPE_SIZE, &submit);
+		}
+	} else {
+		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+				  ops_complete_compute, sh,
+				  to_addr_conv(sh, percpu));
+		if (failb == syndrome_disks) {
+			/* We're missing D+P. */
+			return async_raid6_datap_recov(syndrome_disks+2,
+						       STRIPE_SIZE, faila,
+						       blocks, &submit);
+		} else {
+			/* We're missing D+D. */
+			return async_raid6_2data_recov(syndrome_disks+2,
+						       STRIPE_SIZE, faila, failb,
+						       blocks, &submit);
+		}
+	}
+}
+
+
 static void ops_complete_prexor(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
@@ -680,12 +907,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
+	       struct dma_async_tx_descriptor *tx)
 {
-	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
-	struct page *xor_srcs[disks];
+	struct page **xor_srcs = percpu->scribble;
 	int count = 0, pd_idx = sh->pd_idx, i;
+	struct async_submit_ctl submit;
 
 	/* existing parity data subtracted */
 	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
@@ -700,9 +928,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			xor_srcs[count++] = dev->page;
 	}
 
-	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-		ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
-		ops_complete_prexor, sh);
+	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
+	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 
 	return tx;
 }
@@ -742,17 +970,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	return tx;
 }
 
-static void ops_complete_postxor(void *stripe_head_ref)
+static void ops_complete_reconstruct(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
-	int disks = sh->disks, i, pd_idx = sh->pd_idx;
+	int disks = sh->disks;
+	int pd_idx = sh->pd_idx;
+	int qd_idx = sh->qd_idx;
+	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
-		if (dev->written || i == pd_idx)
+
+		if (dev->written || i == pd_idx || i == qd_idx)
 			set_bit(R5_UPTODATE, &dev->flags);
 	}
 
@@ -770,12 +1002,12 @@ static void ops_complete_postxor(void *stripe_head_ref)
 }
 
 static void
-ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
+		     struct dma_async_tx_descriptor *tx)
 {
-	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
-	struct page *xor_srcs[disks];
-
+	struct page **xor_srcs = percpu->scribble;
+	struct async_submit_ctl submit;
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest;
 	int prexor = 0;
@@ -809,18 +1041,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
 	 * for the synchronous xor case
 	 */
-	flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
+	flags = ASYNC_TX_ACK |
 		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
 
 	atomic_inc(&sh->count);
 
-	if (unlikely(count == 1)) {
-		flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
-		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
-			flags, tx, ops_complete_postxor, sh);
-	} else
-		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-			flags, tx, ops_complete_postxor, sh);
+	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
+			  to_addr_conv(sh, percpu));
+	if (unlikely(count == 1))
+		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
+	else
+		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+}
+
+static void
+ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
+		     struct dma_async_tx_descriptor *tx)
+{
+	struct async_submit_ctl submit;
+	struct page **blocks = percpu->scribble;
+	int count;
+
+	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+	count = set_syndrome_sources(blocks, sh);
+
+	atomic_inc(&sh->count);
+
+	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+			  sh, to_addr_conv(sh, percpu));
+	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
 }
 
 static void ops_complete_check(void *stripe_head_ref)
@@ -835,63 +1085,115 @@ static void ops_complete_check(void *stripe_head_ref)
 	release_stripe(sh);
 }
 
-static void ops_run_check(struct stripe_head *sh)
+static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
-	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
-	struct page *xor_srcs[disks];
+	int pd_idx = sh->pd_idx;
+	int qd_idx = sh->qd_idx;
+	struct page *xor_dest;
+	struct page **xor_srcs = percpu->scribble;
 	struct dma_async_tx_descriptor *tx;
-
-	int count = 0, pd_idx = sh->pd_idx, i;
-	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+	struct async_submit_ctl submit;
+	int count;
+	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	count = 0;
+	xor_dest = sh->dev[pd_idx].page;
+	xor_srcs[count++] = xor_dest;
 	for (i = disks; i--; ) {
-		struct r5dev *dev = &sh->dev[i];
-		if (i != pd_idx)
-			xor_srcs[count++] = dev->page;
+		if (i == pd_idx || i == qd_idx)
+			continue;
+		xor_srcs[count++] = sh->dev[i].page;
 	}
 
-	tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-		&sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
+	init_async_submit(&submit, 0, NULL, NULL, NULL,
+			  to_addr_conv(sh, percpu));
+	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+			   &sh->ops.zero_sum_result, &submit);
+
+	atomic_inc(&sh->count);
+	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
+	tx = async_trigger_callback(&submit);
+}
+
+static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
+{
+	struct page **srcs = percpu->scribble;
+	struct async_submit_ctl submit;
+	int count;
+
+	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
+		(unsigned long long)sh->sector, checkp);
+
+	count = set_syndrome_sources(srcs, sh);
+	if (!checkp)
+		srcs[count] = NULL;
 
 	atomic_inc(&sh->count);
-	tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
-		ops_complete_check, sh);
+	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
+			  sh, to_addr_conv(sh, percpu));
+	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
+			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
 
-static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
+	raid5_conf_t *conf = sh->raid_conf;
+	int level = conf->level;
+	struct raid5_percpu *percpu;
+	unsigned long cpu;
 
+	cpu = get_cpu();
+	percpu = per_cpu_ptr(conf->percpu, cpu);
 	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
 		ops_run_biofill(sh);
 		overlap_clear++;
 	}
 
 	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
-		tx = ops_run_compute5(sh);
-		/* terminate the chain if postxor is not set to be run */
-		if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
+		if (level < 6)
+			tx = ops_run_compute5(sh, percpu);
+		else {
+			if (sh->ops.target2 < 0 || sh->ops.target < 0)
+				tx = ops_run_compute6_1(sh, percpu);
+			else
+				tx = ops_run_compute6_2(sh, percpu);
+		}
+		/* terminate the chain if reconstruct is not set to be run */
+		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
 			async_tx_ack(tx);
 	}
 
 	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
-		tx = ops_run_prexor(sh, tx);
+		tx = ops_run_prexor(sh, percpu, tx);
 
 	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
 		tx = ops_run_biodrain(sh, tx);
 		overlap_clear++;
 	}
 
-	if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
-		ops_run_postxor(sh, tx);
+	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
+		if (level < 6)
+			ops_run_reconstruct5(sh, percpu, tx);
+		else
+			ops_run_reconstruct6(sh, percpu, tx);
+	}
 
-	if (test_bit(STRIPE_OP_CHECK, &ops_request))
-		ops_run_check(sh);
+	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
+		if (sh->check_state == check_state_run)
+			ops_run_check_p(sh, percpu);
+		else if (sh->check_state == check_state_run_q)
+			ops_run_check_pq(sh, percpu, 0);
+		else if (sh->check_state == check_state_run_pq)
+			ops_run_check_pq(sh, percpu, 1);
+		else
+			BUG();
+	}
 
 	if (overlap_clear)
 		for (i = disks; i--; ) {
@@ -899,6 +1201,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			if (test_and_clear_bit(R5_Overlap, &dev->flags))
 				wake_up(&sh->raid_conf->wait_for_overlap);
 		}
+	put_cpu();
 }
 
 static int grow_one_stripe(raid5_conf_t *conf)
@@ -948,6 +1251,28 @@ static int grow_stripes(raid5_conf_t *conf, int num)
 	return 0;
 }
 
+/**
+ * scribble_len - return the required size of the scribble region
+ * @num - total number of disks in the array
+ *
+ * The size must be enough to contain:
+ * 1/ a struct page pointer for each device in the array +2
+ * 2/ room to convert each entry in (1) to its corresponding dma
+ *    (dma_map_page()) or page (page_address()) address.
+ *
+ * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
+ * calculate over all devices (not just the data blocks), using zeros in place
+ * of the P and Q blocks.
+ */
+static size_t scribble_len(int num)
+{
+	size_t len;
+
+	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
+
+	return len;
+}
+
 static int resize_stripes(raid5_conf_t *conf, int newsize)
 {
 	/* Make all the stripes able to hold 'newsize' devices.
@@ -976,6 +1301,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	struct stripe_head *osh, *nsh;
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
+	unsigned long cpu;
 	int err;
 	struct kmem_cache *sc;
 	int i;
@@ -1041,7 +1367,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	/* Step 3.
 	 * At this point, we are holding all the stripes so the array
 	 * is completely stalled, so now is a good time to resize
-	 * conf->disks.
+	 * conf->disks and the scribble region
 	 */
 	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
 	if (ndisks) {
@@ -1052,10 +1378,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	} else
 		err = -ENOMEM;
 
+	get_online_cpus();
+	conf->scribble_len = scribble_len(newsize);
+	for_each_present_cpu(cpu) {
+		struct raid5_percpu *percpu;
+		void *scribble;
+
+		percpu = per_cpu_ptr(conf->percpu, cpu);
+		scribble = kmalloc(conf->scribble_len, GFP_NOIO);
+
+		if (scribble) {
+			kfree(percpu->scribble);
+			percpu->scribble = scribble;
+		} else {
+			err = -ENOMEM;
+			break;
+		}
+	}
+	put_online_cpus();
+
 	/* Step 4, return new stripes to service */
 	while(!list_empty(&newstripes)) {
 		nsh = list_entry(newstripes.next, struct stripe_head, lru);
 		list_del_init(&nsh->lru);
+
 		for (i=conf->raid_disks; i < newsize; i++)
 			if (nsh->dev[i].page == NULL) {
 				struct page *p = alloc_page(GFP_NOIO);
@@ -1594,258 +1940,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 }
 
 
-
-/*
- * Copy data between a page in the stripe cache, and one or more bion
- * The page could align with the middle of the bio, or there could be
- * several bion, each with several bio_vecs, which cover part of the page
- * Multiple bion are linked together on bi_next.  There may be extras
- * at the end of this list.  We ignore them.
- */
-static void copy_data(int frombio, struct bio *bio,
-		     struct page *page,
-		     sector_t sector)
-{
-	char *pa = page_address(page);
-	struct bio_vec *bvl;
-	int i;
-	int page_offset;
-
-	if (bio->bi_sector >= sector)
-		page_offset = (signed)(bio->bi_sector - sector) * 512;
-	else
-		page_offset = (signed)(sector - bio->bi_sector) * -512;
-	bio_for_each_segment(bvl, bio, i) {
-		int len = bio_iovec_idx(bio,i)->bv_len;
-		int clen;
-		int b_offset = 0;
-
-		if (page_offset < 0) {
-			b_offset = -page_offset;
-			page_offset += b_offset;
-			len -= b_offset;
-		}
-
-		if (len > 0 && page_offset + len > STRIPE_SIZE)
-			clen = STRIPE_SIZE - page_offset;
-		else clen = len;
-
-		if (clen > 0) {
-			char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
-			if (frombio)
-				memcpy(pa+page_offset, ba+b_offset, clen);
-			else
-				memcpy(ba+b_offset, pa+page_offset, clen);
-			__bio_kunmap_atomic(ba, KM_USER0);
-		}
-		if (clen < len) /* hit end of page */
-			break;
-		page_offset +=  len;
-	}
-}
-
-#define check_xor()	do {						  \
-				if (count == MAX_XOR_BLOCKS) {		  \
-				xor_blocks(count, STRIPE_SIZE, dest, ptr);\
-				count = 0;				  \
-			   }						  \
-			} while(0)
-
-static void compute_parity6(struct stripe_head *sh, int method)
-{
-	raid5_conf_t *conf = sh->raid_conf;
-	int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
-	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
-	struct bio *chosen;
-	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
-	void *ptrs[syndrome_disks+2];
-
-	pd_idx = sh->pd_idx;
-	qd_idx = sh->qd_idx;
-	d0_idx = raid6_d0(sh);
-
-	pr_debug("compute_parity, stripe %llu, method %d\n",
-		(unsigned long long)sh->sector, method);
-
-	switch(method) {
-	case READ_MODIFY_WRITE:
-		BUG();		/* READ_MODIFY_WRITE N/A for RAID-6 */
-	case RECONSTRUCT_WRITE:
-		for (i= disks; i-- ;)
-			if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
-				chosen = sh->dev[i].towrite;
-				sh->dev[i].towrite = NULL;
-
-				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-					wake_up(&conf->wait_for_overlap);
-
-				BUG_ON(sh->dev[i].written);
-				sh->dev[i].written = chosen;
-			}
-		break;
-	case CHECK_PARITY:
-		BUG();		/* Not implemented yet */
-	}
-
-	for (i = disks; i--;)
-		if (sh->dev[i].written) {
-			sector_t sector = sh->dev[i].sector;
-			struct bio *wbi = sh->dev[i].written;
-			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-				copy_data(1, wbi, sh->dev[i].page, sector);
-				wbi = r5_next_bio(wbi, sector);
-			}
-
-			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			set_bit(R5_UPTODATE, &sh->dev[i].flags);
-		}
-
-	/* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
-
-	for (i = 0; i < disks; i++)
-		ptrs[i] = (void *)raid6_empty_zero_page;
-
-	count = 0;
-	i = d0_idx;
-	do {
-		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
-
-		ptrs[slot] = page_address(sh->dev[i].page);
-		if (slot < syndrome_disks &&
-		    !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
-			printk(KERN_ERR "block %d/%d not uptodate "
-			       "on parity calc\n", i, count);
-			BUG();
-		}
-
-		i = raid6_next_disk(i, disks);
-	} while (i != d0_idx);
-	BUG_ON(count != syndrome_disks);
-
-	raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
-
-	switch(method) {
-	case RECONSTRUCT_WRITE:
-		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-		set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
-		set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
-		set_bit(R5_LOCKED,   &sh->dev[qd_idx].flags);
-		break;
-	case UPDATE_PARITY:
-		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-		set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
-		break;
-	}
-}
-
-
-/* Compute one missing block */
-static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
-{
-	int i, count, disks = sh->disks;
-	void *ptr[MAX_XOR_BLOCKS], *dest, *p;
-	int qd_idx = sh->qd_idx;
-
-	pr_debug("compute_block_1, stripe %llu, idx %d\n",
-		(unsigned long long)sh->sector, dd_idx);
-
-	if ( dd_idx == qd_idx ) {
-		/* We're actually computing the Q drive */
-		compute_parity6(sh, UPDATE_PARITY);
-	} else {
-		dest = page_address(sh->dev[dd_idx].page);
-		if (!nozero) memset(dest, 0, STRIPE_SIZE);
-		count = 0;
-		for (i = disks ; i--; ) {
-			if (i == dd_idx || i == qd_idx)
-				continue;
-			p = page_address(sh->dev[i].page);
-			if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
-				ptr[count++] = p;
-			else
-				printk("compute_block() %d, stripe %llu, %d"
-				       " not present\n", dd_idx,
-				       (unsigned long long)sh->sector, i);
-
-			check_xor();
-		}
-		if (count)
-			xor_blocks(count, STRIPE_SIZE, dest, ptr);
-		if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
-		else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
-	}
-}
-
-/* Compute two missing blocks */
-static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
-{
-	int i, count, disks = sh->disks;
-	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
-	int d0_idx = raid6_d0(sh);
-	int faila = -1, failb = -1;
-	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
-	void *ptrs[syndrome_disks+2];
-
-	for (i = 0; i < disks ; i++)
-		ptrs[i] = (void *)raid6_empty_zero_page;
-	count = 0;
-	i = d0_idx;
-	do {
-		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
-
-		ptrs[slot] = page_address(sh->dev[i].page);
-
-		if (i == dd_idx1)
-			faila = slot;
-		if (i == dd_idx2)
-			failb = slot;
-		i = raid6_next_disk(i, disks);
-	} while (i != d0_idx);
-	BUG_ON(count != syndrome_disks);
-
-	BUG_ON(faila == failb);
-	if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
-
-	pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
-		 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
-		 faila, failb);
-
-	if (failb == syndrome_disks+1) {
-		/* Q disk is one of the missing disks */
-		if (faila == syndrome_disks) {
-			/* Missing P+Q, just recompute */
-			compute_parity6(sh, UPDATE_PARITY);
-			return;
-		} else {
-			/* We're missing D+Q; recompute D from P */
-			compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
-					     dd_idx2 : dd_idx1),
-					0);
-			compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
-			return;
-		}
-	}
-
-	/* We're missing D+P or D+D; */
-	if (failb == syndrome_disks) {
-		/* We're missing D+P. */
-		raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
-	} else {
-		/* We're missing D+D. */
-		raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
-				  ptrs);
-	}
-
-	/* Both the above update both missing blocks */
-	set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
-	set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
-}
-
 static void
-schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
+schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 			 int rcw, int expand)
 {
 	int i, pd_idx = sh->pd_idx, disks = sh->disks;
+	raid5_conf_t *conf = sh->raid_conf;
+	int level = conf->level;
 
 	if (rcw) {
 		/* if we are not expanding this is a proper write request, and
@@ -1858,7 +1959,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
 		} else
 			sh->reconstruct_state = reconstruct_state_run;
 
-		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
+		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -1871,17 +1972,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
 				s->locked++;
 			}
 		}
-		if (s->locked + 1 == disks)
+		if (s->locked + conf->max_degraded == disks)
 			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
-				atomic_inc(&sh->raid_conf->pending_full_writes);
+				atomic_inc(&conf->pending_full_writes);
 	} else {
+		BUG_ON(level == 6);
 		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
 			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
 
 		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
 		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
 		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
-		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
+		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -1899,13 +2001,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
 		}
 	}
 
-	/* keep the parity disk locked while asynchronous operations
+	/* keep the parity disk(s) locked while asynchronous operations
 	 * are in flight
 	 */
 	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
 	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
 	s->locked++;
 
+	if (level == 6) {
+		int qd_idx = sh->qd_idx;
+		struct r5dev *dev = &sh->dev[qd_idx];
+
+		set_bit(R5_LOCKED, &dev->flags);
+		clear_bit(R5_UPTODATE, &dev->flags);
+		s->locked++;
+	}
+
 	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
 		__func__, (unsigned long long)sh->sector,
 		s->locked, s->ops_request);
@@ -1986,13 +2097,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 
 static void end_reshape(raid5_conf_t *conf);
 
-static int page_is_zero(struct page *p)
-{
-	char *a = page_address(p);
-	return ((*(u32*)a) == 0 &&
-		memcmp(a, a+4, STRIPE_SIZE-4)==0);
-}
-
 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
 			    struct stripe_head *sh)
 {
@@ -2132,9 +2236,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
 			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
 			set_bit(R5_Wantcompute, &dev->flags);
 			sh->ops.target = disk_idx;
+			sh->ops.target2 = -1;
 			s->req_compute = 1;
 			/* Careful: from this point on 'uptodate' is in the eye
-			 * of raid5_run_ops which services 'compute' operations
+			 * of raid_run_ops which services 'compute' operations
 			 * before writes. R5_Wantcompute flags a block that will
 			 * be R5_UPTODATE by the time it is needed for a
 			 * subsequent operation.
@@ -2173,61 +2278,104 @@ static void handle_stripe_fill5(struct stripe_head *sh,
 	set_bit(STRIPE_HANDLE, &sh->state);
 }
 
-static void handle_stripe_fill6(struct stripe_head *sh,
-			struct stripe_head_state *s, struct r6_state *r6s,
-			int disks)
+/* fetch_block6 - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill6 to continue
+ */
+static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
+			 struct r6_state *r6s, int disk_idx, int disks)
 {
-	int i;
-	for (i = disks; i--; ) {
-		struct r5dev *dev = &sh->dev[i];
-		if (!test_bit(R5_LOCKED, &dev->flags) &&
-		    !test_bit(R5_UPTODATE, &dev->flags) &&
-		    (dev->toread || (dev->towrite &&
-		     !test_bit(R5_OVERWRITE, &dev->flags)) ||
-		     s->syncing || s->expanding ||
-		     (s->failed >= 1 &&
-		      (sh->dev[r6s->failed_num[0]].toread ||
-		       s->to_write)) ||
-		     (s->failed >= 2 &&
-		      (sh->dev[r6s->failed_num[1]].toread ||
-		       s->to_write)))) {
-			/* we would like to get this block, possibly
-			 * by computing it, but we might not be able to
+	struct r5dev *dev = &sh->dev[disk_idx];
+	struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
+				  &sh->dev[r6s->failed_num[1]] };
+
+	if (!test_bit(R5_LOCKED, &dev->flags) &&
+	    !test_bit(R5_UPTODATE, &dev->flags) &&
+	    (dev->toread ||
+	     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
+	     s->syncing || s->expanding ||
+	     (s->failed >= 1 &&
+	      (fdev[0]->toread || s->to_write)) ||
+	     (s->failed >= 2 &&
+	      (fdev[1]->toread || s->to_write)))) {
+		/* we would like to get this block, possibly by computing it,
+		 * otherwise read it if the backing disk is insync
+		 */
+		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
+		BUG_ON(test_bit(R5_Wantread, &dev->flags));
+		if ((s->uptodate == disks - 1) &&
+		    (s->failed && (disk_idx == r6s->failed_num[0] ||
+				   disk_idx == r6s->failed_num[1]))) {
+			/* have disk failed, and we're requested to fetch it;
+			 * do compute it
 			 */
-			if ((s->uptodate == disks - 1) &&
-			    (s->failed && (i == r6s->failed_num[0] ||
-					   i == r6s->failed_num[1]))) {
-				pr_debug("Computing stripe %llu block %d\n",
-				       (unsigned long long)sh->sector, i);
-				compute_block_1(sh, i, 0);
-				s->uptodate++;
-			} else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
-				/* Computing 2-failure is *very* expensive; only
-				 * do it if failed >= 2
-				 */
-				int other;
-				for (other = disks; other--; ) {
-					if (other == i)
-						continue;
-					if (!test_bit(R5_UPTODATE,
-					      &sh->dev[other].flags))
-						break;
-				}
-				BUG_ON(other < 0);
-				pr_debug("Computing stripe %llu blocks %d,%d\n",
-				       (unsigned long long)sh->sector,
-				       i, other);
-				compute_block_2(sh, i, other);
-				s->uptodate += 2;
-			} else if (test_bit(R5_Insync, &dev->flags)) {
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantread, &dev->flags);
-				s->locked++;
-				pr_debug("Reading block %d (sync=%d)\n",
-					i, s->syncing);
+			pr_debug("Computing stripe %llu block %d\n",
+			       (unsigned long long)sh->sector, disk_idx);
+			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+			set_bit(R5_Wantcompute, &dev->flags);
+			sh->ops.target = disk_idx;
+			sh->ops.target2 = -1; /* no 2nd target */
+			s->req_compute = 1;
+			s->uptodate++;
+			return 1;
+		} else if (s->uptodate == disks-2 && s->failed >= 2) {
+			/* Computing 2-failure is *very* expensive; only
+			 * do it if failed >= 2
+			 */
+			int other;
+			for (other = disks; other--; ) {
+				if (other == disk_idx)
+					continue;
+				if (!test_bit(R5_UPTODATE,
+				      &sh->dev[other].flags))
+					break;
 			}
+			BUG_ON(other < 0);
+			pr_debug("Computing stripe %llu blocks %d,%d\n",
+			       (unsigned long long)sh->sector,
+			       disk_idx, other);
+			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+			set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
+			set_bit(R5_Wantcompute, &sh->dev[other].flags);
+			sh->ops.target = disk_idx;
+			sh->ops.target2 = other;
+			s->uptodate += 2;
+			s->req_compute = 1;
+			return 1;
+		} else if (test_bit(R5_Insync, &dev->flags)) {
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantread, &dev->flags);
+			s->locked++;
+			pr_debug("Reading block %d (sync=%d)\n",
+				disk_idx, s->syncing);
 		}
 	}
+
+	return 0;
+}
+
+/**
+ * handle_stripe_fill6 - read or compute data to satisfy pending requests.
+ */
+static void handle_stripe_fill6(struct stripe_head *sh,
+			struct stripe_head_state *s, struct r6_state *r6s,
+			int disks)
+{
+	int i;
+
+	/* look for blocks to read/compute, skip this if a compute
+	 * is already in flight, or if the stripe contents are in the
+	 * midst of changing due to a write
+	 */
+	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
+	    !sh->reconstruct_state)
+		for (i = disks; i--; )
+			if (fetch_block6(sh, s, r6s, i, disks))
+				break;
 	set_bit(STRIPE_HANDLE, &sh->state);
 }
 
@@ -2361,114 +2509,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
 	 */
 	/* since handle_stripe can be called at any time we need to handle the
 	 * case where a compute block operation has been submitted and then a
-	 * subsequent call wants to start a write request.  raid5_run_ops only
-	 * handles the case where compute block and postxor are requested
+	 * subsequent call wants to start a write request.  raid_run_ops only
+	 * handles the case where compute block and reconstruct are requested
 	 * simultaneously.  If this is not the case then new writes need to be
 	 * held off until the compute completes.
 	 */
 	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
 	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
-		schedule_reconstruction5(sh, s, rcw == 0, 0);
+		schedule_reconstruction(sh, s, rcw == 0, 0);
 }
 
 static void handle_stripe_dirtying6(raid5_conf_t *conf,
 		struct stripe_head *sh,	struct stripe_head_state *s,
 		struct r6_state *r6s, int disks)
 {
-	int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
+	int rcw = 0, pd_idx = sh->pd_idx, i;
 	int qd_idx = sh->qd_idx;
+
+	set_bit(STRIPE_HANDLE, &sh->state);
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
-		/* Would I have to read this buffer for reconstruct_write */
-		if (!test_bit(R5_OVERWRITE, &dev->flags)
-		    && i != pd_idx && i != qd_idx
-		    && (!test_bit(R5_LOCKED, &dev->flags)
-			    ) &&
-		    !test_bit(R5_UPTODATE, &dev->flags)) {
-			if (test_bit(R5_Insync, &dev->flags)) rcw++;
-			else {
-				pr_debug("raid6: must_compute: "
-					"disk %d flags=%#lx\n", i, dev->flags);
-				must_compute++;
+		/* check if we haven't enough data */
+		if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+		    i != pd_idx && i != qd_idx &&
+		    !test_bit(R5_LOCKED, &dev->flags) &&
+		    !(test_bit(R5_UPTODATE, &dev->flags) ||
+		      test_bit(R5_Wantcompute, &dev->flags))) {
+			rcw++;
+			if (!test_bit(R5_Insync, &dev->flags))
+				continue; /* it's a failed drive */
+
+			if (
+			  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+				pr_debug("Read_old stripe %llu "
+					"block %d for Reconstruct\n",
+				     (unsigned long long)sh->sector, i);
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantread, &dev->flags);
+				s->locked++;
+			} else {
+				pr_debug("Request delayed stripe %llu "
+					"block %d for Reconstruct\n",
+				     (unsigned long long)sh->sector, i);
+				set_bit(STRIPE_DELAYED, &sh->state);
+				set_bit(STRIPE_HANDLE, &sh->state);
 			}
 		}
 	}
-	pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
-	       (unsigned long long)sh->sector, rcw, must_compute);
-	set_bit(STRIPE_HANDLE, &sh->state);
-
-	if (rcw > 0)
-		/* want reconstruct write, but need to get some data */
-		for (i = disks; i--; ) {
-			struct r5dev *dev = &sh->dev[i];
-			if (!test_bit(R5_OVERWRITE, &dev->flags)
-			    && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
-			    && !test_bit(R5_LOCKED, &dev->flags) &&
-			    !test_bit(R5_UPTODATE, &dev->flags) &&
-			    test_bit(R5_Insync, &dev->flags)) {
-				if (
-				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-					pr_debug("Read_old stripe %llu "
-						"block %d for Reconstruct\n",
-					     (unsigned long long)sh->sector, i);
-					set_bit(R5_LOCKED, &dev->flags);
-					set_bit(R5_Wantread, &dev->flags);
-					s->locked++;
-				} else {
-					pr_debug("Request delayed stripe %llu "
-						"block %d for Reconstruct\n",
-					     (unsigned long long)sh->sector, i);
-					set_bit(STRIPE_DELAYED, &sh->state);
-					set_bit(STRIPE_HANDLE, &sh->state);
-				}
-			}
-		}
 	/* now if nothing is locked, and if we have enough data, we can start a
 	 * write request
 	 */
-	if (s->locked == 0 && rcw == 0 &&
+	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
+	    s->locked == 0 && rcw == 0 &&
 	    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
-		if (must_compute > 0) {
-			/* We have failed blocks and need to compute them */
-			switch (s->failed) {
-			case 0:
-				BUG();
-			case 1:
-				compute_block_1(sh, r6s->failed_num[0], 0);
-				break;
-			case 2:
-				compute_block_2(sh, r6s->failed_num[0],
-						r6s->failed_num[1]);
-				break;
-			default: /* This request should have been failed? */
-				BUG();
-			}
-		}
-
-		pr_debug("Computing parity for stripe %llu\n",
-			(unsigned long long)sh->sector);
-		compute_parity6(sh, RECONSTRUCT_WRITE);
-		/* now every locked buffer is ready to be written */
-		for (i = disks; i--; )
-			if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
-				pr_debug("Writing stripe %llu block %d\n",
-				       (unsigned long long)sh->sector, i);
-				s->locked++;
-				set_bit(R5_Wantwrite, &sh->dev[i].flags);
-			}
-		if (s->locked == disks)
-			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
-				atomic_inc(&conf->pending_full_writes);
-		/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
-		set_bit(STRIPE_INSYNC, &sh->state);
-
-		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-			atomic_dec(&conf->preread_active_stripes);
-			if (atomic_read(&conf->preread_active_stripes) <
-			    IO_THRESHOLD)
-				md_wakeup_thread(conf->mddev->thread);
-		}
+		schedule_reconstruction(sh, s, 1, 0);
 	}
 }
 
@@ -2527,7 +2622,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 		 * we are done.  Otherwise update the mismatch count and repair
 		 * parity if !MD_RECOVERY_CHECK
 		 */
-		if (sh->ops.zero_sum_result == 0)
+		if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
 			/* parity is correct (on disc,
 			 * not in buffer any more)
 			 */
@@ -2544,6 +2639,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 				set_bit(R5_Wantcompute,
 					&sh->dev[sh->pd_idx].flags);
 				sh->ops.target = sh->pd_idx;
+				sh->ops.target2 = -1;
 				s->uptodate++;
 			}
 		}
@@ -2560,67 +2656,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 
 
 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
-				struct stripe_head_state *s,
-				struct r6_state *r6s, struct page *tmp_page,
-				int disks)
+				  struct stripe_head_state *s,
+				  struct r6_state *r6s, int disks)
 {
-	int update_p = 0, update_q = 0;
-	struct r5dev *dev;
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
+	struct r5dev *dev;
 
 	set_bit(STRIPE_HANDLE, &sh->state);
 
 	BUG_ON(s->failed > 2);
-	BUG_ON(s->uptodate < disks);
+
 	/* Want to check and possibly repair P and Q.
 	 * However there could be one 'failed' device, in which
 	 * case we can only check one of them, possibly using the
 	 * other to generate missing data
 	 */
 
-	/* If !tmp_page, we cannot do the calculations,
-	 * but as we have set STRIPE_HANDLE, we will soon be called
-	 * by stripe_handle with a tmp_page - just wait until then.
-	 */
-	if (tmp_page) {
+	switch (sh->check_state) {
+	case check_state_idle:
+		/* start a new check operation if there are < 2 failures */
 		if (s->failed == r6s->q_failed) {
-			/* The only possible failed device holds 'Q', so it
+			/* The only possible failed device holds Q, so it
 			 * makes sense to check P (If anything else were failed,
 			 * we would have used P to recreate it).
 			 */
-			compute_block_1(sh, pd_idx, 1);
-			if (!page_is_zero(sh->dev[pd_idx].page)) {
-				compute_block_1(sh, pd_idx, 0);
-				update_p = 1;
-			}
+			sh->check_state = check_state_run;
 		}
 		if (!r6s->q_failed && s->failed < 2) {
-			/* q is not failed, and we didn't use it to generate
+			/* Q is not failed, and we didn't use it to generate
 			 * anything, so it makes sense to check it
 			 */
-			memcpy(page_address(tmp_page),
-			       page_address(sh->dev[qd_idx].page),
-			       STRIPE_SIZE);
-			compute_parity6(sh, UPDATE_PARITY);
-			if (memcmp(page_address(tmp_page),
-				   page_address(sh->dev[qd_idx].page),
-				   STRIPE_SIZE) != 0) {
-				clear_bit(STRIPE_INSYNC, &sh->state);
-				update_q = 1;
-			}
+			if (sh->check_state == check_state_run)
+				sh->check_state = check_state_run_pq;
+			else
+				sh->check_state = check_state_run_q;
 		}
-		if (update_p || update_q) {
-			conf->mddev->resync_mismatches += STRIPE_SECTORS;
-			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
-				/* don't try to repair!! */
-				update_p = update_q = 0;
+
+		/* discard potentially stale zero_sum_result */
+		sh->ops.zero_sum_result = 0;
+
+		if (sh->check_state == check_state_run) {
+			/* async_xor_zero_sum destroys the contents of P */
+			clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+			s->uptodate--;
+		}
+		if (sh->check_state >= check_state_run &&
+		    sh->check_state <= check_state_run_pq) {
+			/* async_syndrome_zero_sum preserves P and Q, so
+			 * no need to mark them !uptodate here
+			 */
+			set_bit(STRIPE_OP_CHECK, &s->ops_request);
+			break;
 		}
 
+		/* we have 2-disk failure */
+		BUG_ON(s->failed != 2);
+		/* fall through */
+	case check_state_compute_result:
+		sh->check_state = check_state_idle;
+
+		/* check that a write has not made the stripe insync */
+		if (test_bit(STRIPE_INSYNC, &sh->state))
+			break;
+
 		/* now write out any block on a failed drive,
-		 * or P or Q if they need it
+		 * or P or Q if they were recomputed
 		 */
-
+		BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
 		if (s->failed == 2) {
 			dev = &sh->dev[r6s->failed_num[1]];
 			s->locked++;
@@ -2633,14 +2736,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantwrite, &dev->flags);
 		}
-
-		if (update_p) {
+		if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
 			dev = &sh->dev[pd_idx];
 			s->locked++;
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantwrite, &dev->flags);
 		}
-		if (update_q) {
+		if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
 			dev = &sh->dev[qd_idx];
 			s->locked++;
 			set_bit(R5_LOCKED, &dev->flags);
@@ -2649,6 +2751,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 		clear_bit(STRIPE_DEGRADED, &sh->state);
 
 		set_bit(STRIPE_INSYNC, &sh->state);
+		break;
+	case check_state_run:
+	case check_state_run_q:
+	case check_state_run_pq:
+		break; /* we will be called again upon completion */
+	case check_state_check_result:
+		sh->check_state = check_state_idle;
+
+		/* handle a successful check operation, if parity is correct
+		 * we are done.  Otherwise update the mismatch count and repair
+		 * parity if !MD_RECOVERY_CHECK
+		 */
+		if (sh->ops.zero_sum_result == 0) {
+			/* both parities are correct */
+			if (!s->failed)
+				set_bit(STRIPE_INSYNC, &sh->state);
+			else {
+				/* in contrast to the raid5 case we can validate
+				 * parity, but still have a failure to write
+				 * back
+				 */
+				sh->check_state = check_state_compute_result;
+				/* Returning at this point means that we may go
+				 * off and bring p and/or q uptodate again so
+				 * we make sure to check zero_sum_result again
+				 * to verify if p or q need writeback
+				 */
+			}
+		} else {
+			conf->mddev->resync_mismatches += STRIPE_SECTORS;
+			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+				/* don't try to repair!! */
+				set_bit(STRIPE_INSYNC, &sh->state);
+			else {
+				int *target = &sh->ops.target;
+
+				sh->ops.target = -1;
+				sh->ops.target2 = -1;
+				sh->check_state = check_state_compute_run;
+				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+				if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
+					set_bit(R5_Wantcompute,
+						&sh->dev[pd_idx].flags);
+					*target = pd_idx;
+					target = &sh->ops.target2;
+					s->uptodate++;
+				}
+				if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
+					set_bit(R5_Wantcompute,
+						&sh->dev[qd_idx].flags);
+					*target = qd_idx;
+					s->uptodate++;
+				}
+			}
+		}
+		break;
+	case check_state_compute_run:
+		break;
+	default:
+		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+		       __func__, sh->check_state,
+		       (unsigned long long) sh->sector);
+		BUG();
 	}
 }
 
@@ -2666,6 +2832,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 		if (i != sh->pd_idx && i != sh->qd_idx) {
 			int dd_idx, j;
 			struct stripe_head *sh2;
+			struct async_submit_ctl submit;
 
 			sector_t bn = compute_blocknr(sh, i, 1);
 			sector_t s = raid5_compute_sector(conf, bn, 0,
@@ -2685,9 +2852,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 			}
 
 			/* place all the copies on one channel */
+			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
 			tx = async_memcpy(sh2->dev[dd_idx].page,
-				sh->dev[i].page, 0, 0, STRIPE_SIZE,
-				ASYNC_TX_DEP_ACK, tx, NULL, NULL);
+					  sh->dev[i].page, 0, 0, STRIPE_SIZE,
+					  &submit);
 
 			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
 			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
@@ -2756,7 +2924,8 @@ static bool handle_stripe5(struct stripe_head *sh)
 	rcu_read_lock();
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
-		struct r5dev *dev = &sh->dev[i];
+
+		dev = &sh->dev[i];
 		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
@@ -2973,7 +3142,7 @@ static bool handle_stripe5(struct stripe_head *sh)
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
 		stripe_set_idx(sh->sector, conf, 0, sh);
-		schedule_reconstruction5(sh, &s, 1, 1);
+		schedule_reconstruction(sh, &s, 1, 1);
 	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
@@ -2993,7 +3162,7 @@ static bool handle_stripe5(struct stripe_head *sh)
 		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
 
 	if (s.ops_request)
-		raid5_run_ops(sh, s.ops_request);
+		raid_run_ops(sh, s.ops_request);
 
 	ops_run_io(sh, &s);
 
@@ -3002,7 +3171,7 @@ static bool handle_stripe5(struct stripe_head *sh)
 	return blocked_rdev == NULL;
 }
 
-static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
+static bool handle_stripe6(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
@@ -3014,9 +3183,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	mdk_rdev_t *blocked_rdev = NULL;
 
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
-		"pd_idx=%d, qd_idx=%d\n",
+		"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
 	       (unsigned long long)sh->sector, sh->state,
-	       atomic_read(&sh->count), pd_idx, qd_idx);
+	       atomic_read(&sh->count), pd_idx, qd_idx,
+	       sh->check_state, sh->reconstruct_state);
 	memset(&s, 0, sizeof(s));
 
 	spin_lock(&sh->lock);
@@ -3036,35 +3206,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 
 		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
 			i, dev->flags, dev->toread, dev->towrite, dev->written);
-		/* maybe we can reply to a read */
-		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
-			struct bio *rbi, *rbi2;
-			pr_debug("Return read for disc %d\n", i);
-			spin_lock_irq(&conf->device_lock);
-			rbi = dev->toread;
-			dev->toread = NULL;
-			if (test_and_clear_bit(R5_Overlap, &dev->flags))
-				wake_up(&conf->wait_for_overlap);
-			spin_unlock_irq(&conf->device_lock);
-			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-				copy_data(0, rbi, dev->page, dev->sector);
-				rbi2 = r5_next_bio(rbi, dev->sector);
-				spin_lock_irq(&conf->device_lock);
-				if (!raid5_dec_bi_phys_segments(rbi)) {
-					rbi->bi_next = return_bi;
-					return_bi = rbi;
-				}
-				spin_unlock_irq(&conf->device_lock);
-				rbi = rbi2;
-			}
-		}
+		/* maybe we can reply to a read
+		 *
+		 * new wantfill requests are only permitted while
+		 * ops_complete_biofill is guaranteed to be inactive
+		 */
+		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
+		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
+			set_bit(R5_Wantfill, &dev->flags);
 
 		/* now count some things */
 		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
 		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
+		if (test_bit(R5_Wantcompute, &dev->flags)) {
+			s.compute++;
+			BUG_ON(s.compute > 2);
+		}
 
-
-		if (dev->toread)
+		if (test_bit(R5_Wantfill, &dev->flags)) {
+			s.to_fill++;
+		} else if (dev->toread)
 			s.to_read++;
 		if (dev->towrite) {
 			s.to_write++;
@@ -3105,6 +3266,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		blocked_rdev = NULL;
 	}
 
+	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
+		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
+		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
+	}
+
 	pr_debug("locked=%d uptodate=%d to_read=%d"
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3145,19 +3311,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	 * or to load a block that is being partially written.
 	 */
 	if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
-	    (s.syncing && (s.uptodate < disks)) || s.expanding)
+	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
 		handle_stripe_fill6(sh, &s, &r6s, disks);
 
-	/* now to consider writing and what else, if anything should be read */
-	if (s.to_write)
+	/* Now we check to see if any write operations have recently
+	 * completed
+	 */
+	if (sh->reconstruct_state == reconstruct_state_drain_result) {
+		int qd_idx = sh->qd_idx;
+
+		sh->reconstruct_state = reconstruct_state_idle;
+		/* All the 'written' buffers and the parity blocks are ready to
+		 * be written back to disk
+		 */
+		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
+		for (i = disks; i--; ) {
+			dev = &sh->dev[i];
+			if (test_bit(R5_LOCKED, &dev->flags) &&
+			    (i == sh->pd_idx || i == qd_idx ||
+			     dev->written)) {
+				pr_debug("Writing block %d\n", i);
+				BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
+				set_bit(R5_Wantwrite, &dev->flags);
+				if (!test_bit(R5_Insync, &dev->flags) ||
+				    ((i == sh->pd_idx || i == qd_idx) &&
+				      s.failed == 0))
+					set_bit(STRIPE_INSYNC, &sh->state);
+			}
+		}
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+			atomic_dec(&conf->preread_active_stripes);
+			if (atomic_read(&conf->preread_active_stripes) <
+				IO_THRESHOLD)
+				md_wakeup_thread(conf->mddev->thread);
+		}
+	}
+
+	/* Now to consider new write requests and what else, if anything
+	 * should be read.  We do not handle new writes when:
+	 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
+	 * 2/ A 'check' operation is in flight, as it may clobber the parity
+	 *    block.
+	 */
+	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
 		handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
-	 * data is available
+	 * data is available.  The parity check is held off while parity
+	 * dependent operations are in flight.
 	 */
-	if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
-		handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
+	if (sh->check_state ||
+	    (s.syncing && s.locked == 0 &&
+	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
+	     !test_bit(STRIPE_INSYNC, &sh->state)))
+		handle_parity_checks6(conf, sh, &s, &r6s, disks);
 
 	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
@@ -3178,15 +3387,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 					set_bit(R5_Wantwrite, &dev->flags);
 					set_bit(R5_ReWrite, &dev->flags);
 					set_bit(R5_LOCKED, &dev->flags);
+					s.locked++;
 				} else {
 					/* let's read it back */
 					set_bit(R5_Wantread, &dev->flags);
 					set_bit(R5_LOCKED, &dev->flags);
+					s.locked++;
 				}
 			}
 		}
 
-	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+	/* Finish reconstruct operations initiated by the expansion process */
+	if (sh->reconstruct_state == reconstruct_state_result) {
+		sh->reconstruct_state = reconstruct_state_idle;
+		clear_bit(STRIPE_EXPANDING, &sh->state);
+		for (i = conf->raid_disks; i--; ) {
+			set_bit(R5_Wantwrite, &sh->dev[i].flags);
+			set_bit(R5_LOCKED, &sh->dev[i].flags);
+			s.locked++;
+		}
+	}
+
+	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
+	    !sh->reconstruct_state) {
 		struct stripe_head *sh2
 			= get_active_stripe(conf, sh->sector, 1, 1, 1);
 		if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
@@ -3207,14 +3430,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		/* Need to write out all blocks after computing P&Q */
 		sh->disks = conf->raid_disks;
 		stripe_set_idx(sh->sector, conf, 0, sh);
-		compute_parity6(sh, RECONSTRUCT_WRITE);
-		for (i = conf->raid_disks ; i-- ;  ) {
-			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			s.locked++;
-			set_bit(R5_Wantwrite, &sh->dev[i].flags);
-		}
-		clear_bit(STRIPE_EXPANDING, &sh->state);
-	} else if (s.expanded) {
+		schedule_reconstruction(sh, &s, 1, 1);
+	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
 		wake_up(&conf->wait_for_overlap);
@@ -3232,6 +3449,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	if (unlikely(blocked_rdev))
 		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
 
+	if (s.ops_request)
+		raid_run_ops(sh, s.ops_request);
+
 	ops_run_io(sh, &s);
 
 	return_io(return_bi);
@@ -3240,16 +3460,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 }
 
 /* returns true if the stripe was handled */
-static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page)
+static bool handle_stripe(struct stripe_head *sh)
 {
 	if (sh->raid_conf->level == 6)
-		return handle_stripe6(sh, tmp_page);
+		return handle_stripe6(sh);
 	else
 		return handle_stripe5(sh);
 }
 
-
-
 static void raid5_activate_delayed(raid5_conf_t *conf)
 {
 	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -3331,6 +3549,9 @@ static int raid5_congested(void *data, int bits)
 	/* No difference between reads and writes.  Just check
 	 * how busy the stripe_cache is
 	 */
+
+	if (mddev_congested(mddev, bits))
+		return 1;
 	if (conf->inactive_blocked)
 		return 1;
 	if (conf->quiesce)
@@ -3880,7 +4101,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	INIT_LIST_HEAD(&stripes);
 	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
 		int j;
-		int skipped = 0;
+		int skipped_disk = 0;
 		sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
 		set_bit(STRIPE_EXPANDING, &sh->state);
 		atomic_inc(&conf->reshape_stripes);
@@ -3896,14 +4117,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 				continue;
 			s = compute_blocknr(sh, j, 0);
 			if (s < raid5_size(mddev, 0, 0)) {
-				skipped = 1;
+				skipped_disk = 1;
 				continue;
 			}
 			memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
 			set_bit(R5_Expanded, &sh->dev[j].flags);
 			set_bit(R5_UPTODATE, &sh->dev[j].flags);
 		}
-		if (!skipped) {
+		if (!skipped_disk) {
 			set_bit(STRIPE_EXPAND_READY, &sh->state);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
@@ -4057,7 +4278,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	spin_unlock(&sh->lock);
 
 	/* wait for any blocked device to be handled */
-	while(unlikely(!handle_stripe(sh, NULL)))
+	while (unlikely(!handle_stripe(sh)))
 		;
 	release_stripe(sh);
 
@@ -4114,7 +4335,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 			return handled;
 		}
 
-		handle_stripe(sh, NULL);
+		handle_stripe(sh);
 		release_stripe(sh);
 		handled++;
 	}
@@ -4128,6 +4349,36 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 	return handled;
 }
 
+#ifdef CONFIG_MULTICORE_RAID456
+static void __process_stripe(void *param, async_cookie_t cookie)
+{
+	struct stripe_head *sh = param;
+
+	handle_stripe(sh);
+	release_stripe(sh);
+}
+
+static void process_stripe(struct stripe_head *sh, struct list_head *domain)
+{
+	async_schedule_domain(__process_stripe, sh, domain);
+}
+
+static void synchronize_stripe_processing(struct list_head *domain)
+{
+	async_synchronize_full_domain(domain);
+}
+#else
+static void process_stripe(struct stripe_head *sh, struct list_head *domain)
+{
+	handle_stripe(sh);
+	release_stripe(sh);
+	cond_resched();
+}
+
+static void synchronize_stripe_processing(struct list_head *domain)
+{
+}
+#endif
 
 
 /*
@@ -4142,6 +4393,7 @@ static void raid5d(mddev_t *mddev)
 	struct stripe_head *sh;
 	raid5_conf_t *conf = mddev->private;
 	int handled;
+	LIST_HEAD(raid_domain);
 
 	pr_debug("+++ raid5d active\n");
 
@@ -4178,8 +4430,7 @@ static void raid5d(mddev_t *mddev)
 		spin_unlock_irq(&conf->device_lock);
 		
 		handled++;
-		handle_stripe(sh, conf->spare_page);
-		release_stripe(sh);
+		process_stripe(sh, &raid_domain);
 
 		spin_lock_irq(&conf->device_lock);
 	}
@@ -4187,6 +4438,7 @@ static void raid5d(mddev_t *mddev)
 
 	spin_unlock_irq(&conf->device_lock);
 
+	synchronize_stripe_processing(&raid_domain);
 	async_tx_issue_pending_all();
 	unplug_slaves(mddev);
 
@@ -4319,15 +4571,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 	return sectors * (raid_disks - conf->max_degraded);
 }
 
+static void raid5_free_percpu(raid5_conf_t *conf)
+{
+	struct raid5_percpu *percpu;
+	unsigned long cpu;
+
+	if (!conf->percpu)
+		return;
+
+	get_online_cpus();
+	for_each_possible_cpu(cpu) {
+		percpu = per_cpu_ptr(conf->percpu, cpu);
+		safe_put_page(percpu->spare_page);
+		kfree(percpu->scribble);
+	}
+#ifdef CONFIG_HOTPLUG_CPU
+	unregister_cpu_notifier(&conf->cpu_notify);
+#endif
+	put_online_cpus();
+
+	free_percpu(conf->percpu);
+}
+
 static void free_conf(raid5_conf_t *conf)
 {
 	shrink_stripes(conf);
-	safe_put_page(conf->spare_page);
+	raid5_free_percpu(conf);
 	kfree(conf->disks);
 	kfree(conf->stripe_hashtbl);
 	kfree(conf);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
+			      void *hcpu)
+{
+	raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
+	long cpu = (long)hcpu;
+	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		if (conf->level == 6 && !percpu->spare_page)
+			percpu->spare_page = alloc_page(GFP_KERNEL);
+		if (!percpu->scribble)
+			percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
+
+		if (!percpu->scribble ||
+		    (conf->level == 6 && !percpu->spare_page)) {
+			safe_put_page(percpu->spare_page);
+			kfree(percpu->scribble);
+			pr_err("%s: failed memory allocation for cpu%ld\n",
+			       __func__, cpu);
+			return NOTIFY_BAD;
+		}
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		safe_put_page(percpu->spare_page);
+		kfree(percpu->scribble);
+		percpu->spare_page = NULL;
+		percpu->scribble = NULL;
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+#endif
+
+static int raid5_alloc_percpu(raid5_conf_t *conf)
+{
+	unsigned long cpu;
+	struct page *spare_page;
+	struct raid5_percpu *allcpus;
+	void *scribble;
+	int err;
+
+	allcpus = alloc_percpu(struct raid5_percpu);
+	if (!allcpus)
+		return -ENOMEM;
+	conf->percpu = allcpus;
+
+	get_online_cpus();
+	err = 0;
+	for_each_present_cpu(cpu) {
+		if (conf->level == 6) {
+			spare_page = alloc_page(GFP_KERNEL);
+			if (!spare_page) {
+				err = -ENOMEM;
+				break;
+			}
+			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+		}
+		scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
+		if (!scribble) {
+			err = -ENOMEM;
+			break;
+		}
+		per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
+	}
+#ifdef CONFIG_HOTPLUG_CPU
+	conf->cpu_notify.notifier_call = raid456_cpu_notify;
+	conf->cpu_notify.priority = 0;
+	if (err == 0)
+		err = register_cpu_notifier(&conf->cpu_notify);
+#endif
+	put_online_cpus();
+
+	return err;
+}
+
 static raid5_conf_t *setup_conf(mddev_t *mddev)
 {
 	raid5_conf_t *conf;
@@ -4369,6 +4724,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 		goto abort;
 
 	conf->raid_disks = mddev->raid_disks;
+	conf->scribble_len = scribble_len(conf->raid_disks);
 	if (mddev->reshape_position == MaxSector)
 		conf->previous_raid_disks = mddev->raid_disks;
 	else
@@ -4384,11 +4740,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
 		goto abort;
 
-	if (mddev->new_level == 6) {
-		conf->spare_page = alloc_page(GFP_KERNEL);
-		if (!conf->spare_page)
-			goto abort;
-	}
+	conf->level = mddev->new_level;
+	if (raid5_alloc_percpu(conf) != 0)
+		goto abort;
+
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
@@ -4447,7 +4802,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 		printk(KERN_INFO "raid5: allocated %dkB for %s\n",
 			memory, mdname(mddev));
 
-	conf->thread = md_register_thread(raid5d, mddev, "%s_raid5");
+	conf->thread = md_register_thread(raid5d, mddev, NULL);
 	if (!conf->thread) {
 		printk(KERN_ERR
 		       "raid5: couldn't allocate thread for %s\n",
@@ -4613,7 +4968,7 @@ static int run(mddev_t *mddev)
 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-							"%s_reshape");
+							"reshape");
 	}
 
 	/* read-ahead size must cover two whole stripes, which is
@@ -5031,7 +5386,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-						"%s_reshape");
+						"reshape");
 	if (!mddev->sync_thread) {
 		mddev->recovery = 0;
 		spin_lock_irq(&conf->device_lock);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 9459689c4ea..2390e0e83da 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -2,6 +2,7 @@
 #define _RAID5_H
 
 #include <linux/raid/xor.h>
+#include <linux/dmaengine.h>
 
 /*
  *
@@ -175,7 +176,9 @@
  */
 enum check_states {
 	check_state_idle = 0,
-	check_state_run, /* parity check */
+	check_state_run, /* xor parity check */
+	check_state_run_q, /* q-parity check */
+	check_state_run_pq, /* pq dual parity check */
 	check_state_check_result,
 	check_state_compute_run, /* parity repair */
 	check_state_compute_result,
@@ -215,8 +218,8 @@ struct stripe_head {
 	 * @target - STRIPE_OP_COMPUTE_BLK target
 	 */
 	struct stripe_operations {
-		int		   target;
-		u32		   zero_sum_result;
+		int 		     target, target2;
+		enum sum_check_flags zero_sum_result;
 	} ops;
 	struct r5dev {
 		struct bio	req;
@@ -298,7 +301,7 @@ struct r6_state {
 #define STRIPE_OP_COMPUTE_BLK	1
 #define STRIPE_OP_PREXOR	2
 #define STRIPE_OP_BIODRAIN	3
-#define STRIPE_OP_POSTXOR	4
+#define STRIPE_OP_RECONSTRUCT	4
 #define STRIPE_OP_CHECK	5
 
 /*
@@ -385,8 +388,21 @@ struct raid5_private_data {
 					    * (fresh device added).
 					    * Cleared when a sync completes.
 					    */
-
-	struct page 		*spare_page; /* Used when checking P/Q in raid6 */
+	/* per cpu variables */
+	struct raid5_percpu {
+		struct page	*spare_page; /* Used when checking P/Q in raid6 */
+		void		*scribble;   /* space for constructing buffer
+					      * lists and performing address
+					      * conversions
+					      */
+	} *percpu;
+	size_t			scribble_len; /* size of scribble region must be
+					       * associated with conf to handle
+					       * cpu hotplug while reshaping
+					       */
+#ifdef CONFIG_HOTPLUG_CPU
+	struct notifier_block	cpu_notify;
+#endif
 
 	/*
 	 * Free stripes pool
diff --git a/drivers/media/dvb/dvb-core/dvbdev.h b/drivers/media/dvb/dvb-core/dvbdev.h
index 895e2efca8a..01fc7048474 100644
--- a/drivers/media/dvb/dvb-core/dvbdev.h
+++ b/drivers/media/dvb/dvb-core/dvbdev.h
@@ -31,10 +31,9 @@
 #define DVB_MAJOR 212
 
 #if defined(CONFIG_DVB_MAX_ADAPTERS) && CONFIG_DVB_MAX_ADAPTERS > 0
-#define DVB_MAX_ADAPTERS CONFIG_DVB_MAX_ADAPTERS
+  #define DVB_MAX_ADAPTERS CONFIG_DVB_MAX_ADAPTERS
 #else
-#warning invalid CONFIG_DVB_MAX_ADAPTERS value
-#define DVB_MAX_ADAPTERS 8
+  #define DVB_MAX_ADAPTERS 8
 #endif
 
 #define DVB_UNSET (-1)
diff --git a/drivers/media/dvb/dvb-usb/Kconfig b/drivers/media/dvb/dvb-usb/Kconfig
index 0e4b97fba38..9744b069241 100644
--- a/drivers/media/dvb/dvb-usb/Kconfig
+++ b/drivers/media/dvb/dvb-usb/Kconfig
@@ -75,7 +75,7 @@ config DVB_USB_DIB0700
 	select DVB_DIB3000MC if !DVB_FE_CUSTOMISE
 	select DVB_S5H1411 if !DVB_FE_CUSTOMISE
 	select DVB_LGDT3305 if !DVB_FE_CUSTOMISE
-	select DVB_TUNER_DIB0070 if !DVB_FE_CUSTOMISE
+	select DVB_TUNER_DIB0070
 	select MEDIA_TUNER_MT2060 if !MEDIA_TUNER_CUSTOMISE
 	select MEDIA_TUNER_MT2266 if !MEDIA_TUNER_CUSTOMISE
 	select MEDIA_TUNER_XC2028 if !MEDIA_TUNER_CUSTOMISE
diff --git a/drivers/media/video/saa7164/saa7164-api.c b/drivers/media/video/saa7164/saa7164-api.c
index bb6df1b276b..6f094a96ac8 100644
--- a/drivers/media/video/saa7164/saa7164-api.c
+++ b/drivers/media/video/saa7164/saa7164-api.c
@@ -415,7 +415,7 @@ int saa7164_api_enum_subdevs(struct saa7164_dev *dev)
 		goto out;
 	}
 
-	if (debug & DBGLVL_API)
+	if (saa_debug & DBGLVL_API)
 		saa7164_dumphex16(dev, buf, (buflen/16)*16);
 
 	saa7164_api_dump_subdevs(dev, buf, buflen);
@@ -480,7 +480,7 @@ int saa7164_api_i2c_read(struct saa7164_i2c *bus, u8 addr, u32 reglen, u8 *reg,
 
 	dprintk(DBGLVL_API, "%s() len = %d bytes\n", __func__, len);
 
-	if (debug & DBGLVL_I2C)
+	if (saa_debug & DBGLVL_I2C)
 		saa7164_dumphex16(dev, buf, 2 * 16);
 
 	ret = saa7164_cmd_send(bus->dev, unitid, GET_CUR,
@@ -488,7 +488,7 @@ int saa7164_api_i2c_read(struct saa7164_i2c *bus, u8 addr, u32 reglen, u8 *reg,
 	if (ret != SAA_OK)
 		printk(KERN_ERR "%s() error, ret(2) = 0x%x\n", __func__, ret);
 	else {
-		if (debug & DBGLVL_I2C)
+		if (saa_debug & DBGLVL_I2C)
 			saa7164_dumphex16(dev, buf, sizeof(buf));
 		memcpy(data, (buf + 2 * sizeof(u32) + reglen), datalen);
 	}
@@ -548,7 +548,7 @@ int saa7164_api_i2c_write(struct saa7164_i2c *bus, u8 addr, u32 datalen,
 	*((u32 *)(buf + 1 * sizeof(u32))) = datalen - reglen;
 	memcpy((buf + 2 * sizeof(u32)), data, datalen);
 
-	if (debug & DBGLVL_I2C)
+	if (saa_debug & DBGLVL_I2C)
 		saa7164_dumphex16(dev, buf, sizeof(buf));
 
 	ret = saa7164_cmd_send(bus->dev, unitid, SET_CUR,
diff --git a/drivers/media/video/saa7164/saa7164-cmd.c b/drivers/media/video/saa7164/saa7164-cmd.c
index e097f1a0969..c45966edc0c 100644
--- a/drivers/media/video/saa7164/saa7164-cmd.c
+++ b/drivers/media/video/saa7164/saa7164-cmd.c
@@ -250,7 +250,7 @@ int saa7164_cmd_wait(struct saa7164_dev *dev, u8 seqno)
 	unsigned long stamp;
 	int r;
 
-	if (debug >= 4)
+	if (saa_debug >= 4)
 		saa7164_bus_dump(dev);
 
 	dprintk(DBGLVL_CMD, "%s(seqno=%d)\n", __func__, seqno);
diff --git a/drivers/media/video/saa7164/saa7164-core.c b/drivers/media/video/saa7164/saa7164-core.c
index f0dbead188c..709affc3104 100644
--- a/drivers/media/video/saa7164/saa7164-core.c
+++ b/drivers/media/video/saa7164/saa7164-core.c
@@ -45,8 +45,8 @@ MODULE_LICENSE("GPL");
  32 bus
  */
 
-unsigned int debug;
-module_param(debug, int, 0644);
+unsigned int saa_debug;
+module_param_named(debug, saa_debug, int, 0644);
 MODULE_PARM_DESC(debug, "enable debug messages");
 
 unsigned int waitsecs = 10;
@@ -653,7 +653,7 @@ static int __devinit saa7164_initdev(struct pci_dev *pci_dev,
 		printk(KERN_ERR "%s() Unsupported board detected, "
 			"registering without firmware\n", __func__);
 
-	dprintk(1, "%s() parameter debug = %d\n", __func__, debug);
+	dprintk(1, "%s() parameter debug = %d\n", __func__, saa_debug);
 	dprintk(1, "%s() parameter waitsecs = %d\n", __func__, waitsecs);
 
 fail_fw:
diff --git a/drivers/media/video/saa7164/saa7164.h b/drivers/media/video/saa7164/saa7164.h
index 6753008a9c9..42660b546f0 100644
--- a/drivers/media/video/saa7164/saa7164.h
+++ b/drivers/media/video/saa7164/saa7164.h
@@ -375,9 +375,9 @@ extern int saa7164_buffer_dealloc(struct saa7164_tsport *port,
 
 /* ----------------------------------------------------------- */
 
-extern unsigned int debug;
+extern unsigned int saa_debug;
 #define dprintk(level, fmt, arg...)\
-	do { if (debug & level)\
+	do { if (saa_debug & level)\
 		printk(KERN_DEBUG "%s: " fmt, dev->name, ## arg);\
 	} while (0)
 
diff --git a/drivers/media/video/usbvision/usbvision-core.c b/drivers/media/video/usbvision/usbvision-core.c
index 6ba16abeebd..e0f91e4ab65 100644
--- a/drivers/media/video/usbvision/usbvision-core.c
+++ b/drivers/media/video/usbvision/usbvision-core.c
@@ -28,7 +28,6 @@
 #include <linux/timer.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
diff --git a/drivers/media/video/usbvision/usbvision-i2c.c b/drivers/media/video/usbvision/usbvision-i2c.c
index f97fd06d594..c19f51dba2e 100644
--- a/drivers/media/video/usbvision/usbvision-i2c.c
+++ b/drivers/media/video/usbvision/usbvision-i2c.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <asm/uaccess.h>
 #include <linux/ioport.h>
diff --git a/drivers/media/video/usbvision/usbvision-video.c b/drivers/media/video/usbvision/usbvision-video.c
index 90d9b5c0e9a..a2a50d608a3 100644
--- a/drivers/media/video/usbvision/usbvision-video.c
+++ b/drivers/media/video/usbvision/usbvision-video.c
@@ -52,7 +52,6 @@
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c
index a5b448ea4ea..b3bf1c44d74 100644
--- a/drivers/memstick/core/memstick.c
+++ b/drivers/memstick/core/memstick.c
@@ -339,9 +339,9 @@ static int h_memstick_read_dev_id(struct memstick_dev *card,
 			card->id.type = id_reg.type;
 			card->id.category = id_reg.category;
 			card->id.class = id_reg.class;
+			dev_dbg(&card->dev, "if_mode = %02x\n", id_reg.if_mode);
 		}
 		complete(&card->mrq_complete);
-		dev_dbg(&card->dev, "if_mode = %02x\n", id_reg.if_mode);
 		return -EAGAIN;
 	}
 }
diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c
index 79689b10f93..766e21e1557 100644
--- a/drivers/misc/sgi-gru/grukservices.c
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -937,6 +937,8 @@ static int quicktest1(unsigned long arg)
 
 	/* Need  1K cacheline aligned that does not cross page boundary */
 	p = kmalloc(4096, 0);
+	if (p == NULL)
+		return -ENOMEM;
 	mq = ALIGNUP(p, 1024);
 	memset(mes, 0xee, sizeof(mes));
 	dw = mq;
diff --git a/drivers/misc/sgi-gru/gruprocfs.c b/drivers/misc/sgi-gru/gruprocfs.c
index 9cbf95bedce..ccd4408a26c 100644
--- a/drivers/misc/sgi-gru/gruprocfs.c
+++ b/drivers/misc/sgi-gru/gruprocfs.c
@@ -340,10 +340,9 @@ static struct proc_dir_entry *proc_gru __read_mostly;
 
 static int create_proc_file(struct proc_entry *p)
 {
-	p->entry = create_proc_entry(p->name, p->mode, proc_gru);
+	p->entry = proc_create(p->name, p->mode, proc_gru, p->fops);
 	if (!p->entry)
 		return -1;
-	p->entry->proc_fops = p->fops;
 	return 0;
 }
 
diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c
index 065fa818be5..fc25586b7ee 100644
--- a/drivers/mmc/host/atmel-mci.c
+++ b/drivers/mmc/host/atmel-mci.c
@@ -599,6 +599,7 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data)
 	struct scatterlist		*sg;
 	unsigned int			i;
 	enum dma_data_direction		direction;
+	unsigned int			sglen;
 
 	/*
 	 * We don't do DMA on "complex" transfers, i.e. with
@@ -628,11 +629,14 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data)
 	else
 		direction = DMA_TO_DEVICE;
 
+	sglen = dma_map_sg(&host->pdev->dev, data->sg, data->sg_len, direction);
+	if (sglen != data->sg_len)
+		goto unmap_exit;
 	desc = chan->device->device_prep_slave_sg(chan,
 			data->sg, data->sg_len, direction,
 			DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 	if (!desc)
-		return -ENOMEM;
+		goto unmap_exit;
 
 	host->dma.data_desc = desc;
 	desc->callback = atmci_dma_complete;
@@ -643,6 +647,9 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data)
 	chan->device->device_issue_pending(chan);
 
 	return 0;
+unmap_exit:
+	dma_unmap_sg(&host->pdev->dev, data->sg, sglen, direction);
+	return -ENOMEM;
 }
 
 #else /* CONFIG_MMC_ATMELMCI_DMA */
diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig
index e4ec3659759..ecf90f5c97c 100644
--- a/drivers/mtd/Kconfig
+++ b/drivers/mtd/Kconfig
@@ -159,7 +159,7 @@ config MTD_AFS_PARTS
 
 config MTD_OF_PARTS
 	tristate "Flash partition map based on OF description"
-	depends on PPC_OF && MTD_PARTITIONS
+	depends on (MICROBLAZE || PPC_OF) && MTD_PARTITIONS
 	help
 	  This provides a partition parsing function which derives
 	  the partition map from the children of the flash node,
diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig
index 3a9a960644b..841e085ab74 100644
--- a/drivers/mtd/maps/Kconfig
+++ b/drivers/mtd/maps/Kconfig
@@ -74,7 +74,7 @@ config MTD_PHYSMAP_BANKWIDTH
 
 config MTD_PHYSMAP_OF
 	tristate "Flash device in physical memory map based on OF description"
-	depends on PPC_OF && (MTD_CFI || MTD_JEDECPROBE || MTD_ROM)
+	depends on (MICROBLAZE || PPC_OF) && (MTD_CFI || MTD_JEDECPROBE || MTD_ROM)
 	help
 	  This provides a 'mapping' driver which allows the NOR Flash and
 	  ROM driver code to communicate with chips which are mapped
diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c
index 7adff4d0960..b9eeadf01b7 100644
--- a/drivers/net/3c59x.c
+++ b/drivers/net/3c59x.c
@@ -813,10 +813,10 @@ static int vortex_suspend(struct pci_dev *pdev, pm_message_t state)
 		if (netif_running(dev)) {
 			netif_device_detach(dev);
 			vortex_down(dev, 1);
+			disable_irq(dev->irq);
 		}
 		pci_save_state(pdev);
 		pci_enable_wake(pdev, pci_choose_state(pdev, state), 0);
-		free_irq(dev->irq, dev);
 		pci_disable_device(pdev);
 		pci_set_power_state(pdev, pci_choose_state(pdev, state));
 	}
@@ -839,18 +839,12 @@ static int vortex_resume(struct pci_dev *pdev)
 			return err;
 		}
 		pci_set_master(pdev);
-		if (request_irq(dev->irq, vp->full_bus_master_rx ?
-				&boomerang_interrupt : &vortex_interrupt, IRQF_SHARED, dev->name, dev)) {
-			pr_warning("%s: Could not reserve IRQ %d\n", dev->name, dev->irq);
-			pci_disable_device(pdev);
-			return -EBUSY;
-		}
 		if (netif_running(dev)) {
 			err = vortex_up(dev);
 			if (err)
 				return err;
-			else
-				netif_device_attach(dev);
+			enable_irq(dev->irq);
+			netif_device_attach(dev);
 		}
 	}
 	return 0;
diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c
index 462d9f59c53..83a1922e68e 100644
--- a/drivers/net/8139cp.c
+++ b/drivers/net/8139cp.c
@@ -87,7 +87,7 @@
 
 /* These identify the driver base version and may not be removed. */
 static char version[] =
-KERN_INFO DRV_NAME ": 10/100 PCI Ethernet driver v" DRV_VERSION " (" DRV_RELDATE ")\n";
+DRV_NAME ": 10/100 PCI Ethernet driver v" DRV_VERSION " (" DRV_RELDATE ")\n";
 
 MODULE_AUTHOR("Jeff Garzik <jgarzik@pobox.com>");
 MODULE_DESCRIPTION("RealTek RTL-8139C+ series 10/100 PCI Ethernet driver");
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index ed5741b2e70..2bea67c134f 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -1875,7 +1875,7 @@ config 68360_ENET
 
 config FEC
 	bool "FEC ethernet controller (of ColdFire and some i.MX CPUs)"
-	depends on M523x || M527x || M5272 || M528x || M520x || M532x || MACH_MX27 || ARCH_MX35
+	depends on M523x || M527x || M5272 || M528x || M520x || M532x || MACH_MX27 || ARCH_MX35 || ARCH_MX25
 	help
 	  Say Y here if you want to use the built-in 10/100 Fast ethernet
 	  controller on some Motorola ColdFire and Freescale i.MX processors.
diff --git a/drivers/net/atl1c/atl1c_main.c b/drivers/net/atl1c/atl1c_main.c
index be2c6cfe6e8..1372e9a99f5 100644
--- a/drivers/net/atl1c/atl1c_main.c
+++ b/drivers/net/atl1c/atl1c_main.c
@@ -2296,7 +2296,7 @@ static int atl1c_suspend(struct pci_dev *pdev, pm_message_t state)
 	u32 ctrl;
 	u32 mac_ctrl_data;
 	u32 master_ctrl_data;
-	u32 wol_ctrl_data;
+	u32 wol_ctrl_data = 0;
 	u16 mii_bmsr_data;
 	u16 save_autoneg_advertised;
 	u16 mii_intr_status_data;
diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig
index 09007437246..df32c109b7a 100644
--- a/drivers/net/can/Kconfig
+++ b/drivers/net/can/Kconfig
@@ -75,6 +75,13 @@ config CAN_EMS_PCI
 	  CPC-PCIe and CPC-104P cards from EMS Dr. Thomas Wuensche
 	  (http://www.ems-wuensche.de).
 
+config CAN_EMS_USB
+	tristate "EMS CPC-USB/ARM7 CAN/USB interface"
+	depends on USB && CAN_DEV
+	---help---
+	  This driver is for the one channel CPC-USB/ARM7 CAN/USB interface
+	  from from EMS Dr. Thomas Wuensche (http://www.ems-wuensche.de).
+
 config CAN_KVASER_PCI
 	tristate "Kvaser PCIcanx and Kvaser PCIcan PCI Cards"
 	depends on PCI && CAN_SJA1000
@@ -82,6 +89,12 @@ config CAN_KVASER_PCI
 	  This driver is for the the PCIcanx and PCIcan cards (1, 2 or
 	  4 channel) from Kvaser (http://www.kvaser.com).
 
+config CAN_AT91
+	tristate "Atmel AT91 onchip CAN controller"
+	depends on CAN && CAN_DEV && ARCH_AT91SAM9263
+	---help---
+	  This is a driver for the SoC CAN controller in Atmel's AT91SAM9263.
+
 config CAN_DEBUG_DEVICES
 	bool "CAN devices debugging messages"
 	depends on CAN
diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile
index 523a941b358..0dea62721f2 100644
--- a/drivers/net/can/Makefile
+++ b/drivers/net/can/Makefile
@@ -7,6 +7,9 @@ obj-$(CONFIG_CAN_VCAN)		+= vcan.o
 obj-$(CONFIG_CAN_DEV)		+= can-dev.o
 can-dev-y			:= dev.o
 
+obj-y				+= usb/
+
 obj-$(CONFIG_CAN_SJA1000)	+= sja1000/
+obj-$(CONFIG_CAN_AT91)		+= at91_can.o
 
 ccflags-$(CONFIG_CAN_DEBUG_DEVICES) := -DDEBUG
diff --git a/drivers/net/can/sja1000/ems_pci.c b/drivers/net/can/sja1000/ems_pci.c
index 7d84b8ac9c1..fd04789d337 100644
--- a/drivers/net/can/sja1000/ems_pci.c
+++ b/drivers/net/can/sja1000/ems_pci.c
@@ -94,12 +94,14 @@ struct ems_pci_card {
 #define EMS_PCI_CDR             (CDR_CBP | CDR_CLKOUT_MASK)
 
 #define EMS_PCI_V1_BASE_BAR     1
-#define EMS_PCI_V1_MEM_SIZE     4096
+#define EMS_PCI_V1_CONF_SIZE    4096 /* size of PITA control area */
 #define EMS_PCI_V2_BASE_BAR     2
-#define EMS_PCI_V2_MEM_SIZE     128
+#define EMS_PCI_V2_CONF_SIZE    128 /* size of PLX control area */
 #define EMS_PCI_CAN_BASE_OFFSET 0x400 /* offset where the controllers starts */
 #define EMS_PCI_CAN_CTRL_SIZE   0x200 /* memory size for each controller */
 
+#define EMS_PCI_BASE_SIZE  4096 /* size of controller area */
+
 static struct pci_device_id ems_pci_tbl[] = {
 	/* CPC-PCI v1 */
 	{PCI_VENDOR_ID_SIEMENS, 0x2104, PCI_ANY_ID, PCI_ANY_ID,},
@@ -224,7 +226,7 @@ static int __devinit ems_pci_add_card(struct pci_dev *pdev,
 	struct sja1000_priv *priv;
 	struct net_device *dev;
 	struct ems_pci_card *card;
-	int max_chan, mem_size, base_bar;
+	int max_chan, conf_size, base_bar;
 	int err, i;
 
 	/* Enabling PCI device */
@@ -251,22 +253,22 @@ static int __devinit ems_pci_add_card(struct pci_dev *pdev,
 		card->version = 2; /* CPC-PCI v2 */
 		max_chan = EMS_PCI_V2_MAX_CHAN;
 		base_bar = EMS_PCI_V2_BASE_BAR;
-		mem_size = EMS_PCI_V2_MEM_SIZE;
+		conf_size = EMS_PCI_V2_CONF_SIZE;
 	} else {
 		card->version = 1; /* CPC-PCI v1 */
 		max_chan = EMS_PCI_V1_MAX_CHAN;
 		base_bar = EMS_PCI_V1_BASE_BAR;
-		mem_size = EMS_PCI_V1_MEM_SIZE;
+		conf_size = EMS_PCI_V1_CONF_SIZE;
 	}
 
 	/* Remap configuration space and controller memory area */
-	card->conf_addr = pci_iomap(pdev, 0, mem_size);
+	card->conf_addr = pci_iomap(pdev, 0, conf_size);
 	if (card->conf_addr == NULL) {
 		err = -ENOMEM;
 		goto failure_cleanup;
 	}
 
-	card->base_addr = pci_iomap(pdev, base_bar, mem_size);
+	card->base_addr = pci_iomap(pdev, base_bar, EMS_PCI_BASE_SIZE);
 	if (card->base_addr == NULL) {
 		err = -ENOMEM;
 		goto failure_cleanup;
diff --git a/drivers/net/can/usb/Makefile b/drivers/net/can/usb/Makefile
new file mode 100644
index 00000000000..c3f75ba701b
--- /dev/null
+++ b/drivers/net/can/usb/Makefile
@@ -0,0 +1,5 @@
+#
+#  Makefile for the Linux Controller Area Network USB drivers.
+#
+
+obj-$(CONFIG_CAN_EMS_USB) += ems_usb.o
diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c
new file mode 100644
index 00000000000..9012e0abc62
--- /dev/null
+++ b/drivers/net/can/usb/ems_usb.c
@@ -0,0 +1,1155 @@
+/*
+ * CAN driver for EMS Dr. Thomas Wuensche CPC-USB/ARM7
+ *
+ * Copyright (C) 2004-2009 EMS Dr. Thomas Wuensche
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <linux/init.h>
+#include <linux/signal.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/usb.h>
+
+#include <linux/can.h>
+#include <linux/can/dev.h>
+#include <linux/can/error.h>
+
+MODULE_AUTHOR("Sebastian Haas <haas@ems-wuensche.com>");
+MODULE_DESCRIPTION("CAN driver for EMS Dr. Thomas Wuensche CAN/USB interfaces");
+MODULE_LICENSE("GPL v2");
+
+/* Control-Values for CPC_Control() Command Subject Selection */
+#define CONTR_CAN_MESSAGE 0x04
+#define CONTR_CAN_STATE   0x0C
+#define CONTR_BUS_ERROR   0x1C
+
+/* Control Command Actions */
+#define CONTR_CONT_OFF 0
+#define CONTR_CONT_ON  1
+#define CONTR_ONCE     2
+
+/* Messages from CPC to PC */
+#define CPC_MSG_TYPE_CAN_FRAME       1  /* CAN data frame */
+#define CPC_MSG_TYPE_RTR_FRAME       8  /* CAN remote frame */
+#define CPC_MSG_TYPE_CAN_PARAMS      12 /* Actual CAN parameters */
+#define CPC_MSG_TYPE_CAN_STATE       14 /* CAN state message */
+#define CPC_MSG_TYPE_EXT_CAN_FRAME   16 /* Extended CAN data frame */
+#define CPC_MSG_TYPE_EXT_RTR_FRAME   17 /* Extended remote frame */
+#define CPC_MSG_TYPE_CONTROL         19 /* change interface behavior */
+#define CPC_MSG_TYPE_CONFIRM         20 /* command processed confirmation */
+#define CPC_MSG_TYPE_OVERRUN         21 /* overrun events */
+#define CPC_MSG_TYPE_CAN_FRAME_ERROR 23 /* detected bus errors */
+#define CPC_MSG_TYPE_ERR_COUNTER     25 /* RX/TX error counter */
+
+/* Messages from the PC to the CPC interface  */
+#define CPC_CMD_TYPE_CAN_FRAME     1   /* CAN data frame */
+#define CPC_CMD_TYPE_CONTROL       3   /* control of interface behavior */
+#define CPC_CMD_TYPE_CAN_PARAMS    6   /* set CAN parameters */
+#define CPC_CMD_TYPE_RTR_FRAME     13  /* CAN remote frame */
+#define CPC_CMD_TYPE_CAN_STATE     14  /* CAN state message */
+#define CPC_CMD_TYPE_EXT_CAN_FRAME 15  /* Extended CAN data frame */
+#define CPC_CMD_TYPE_EXT_RTR_FRAME 16  /* Extended CAN remote frame */
+#define CPC_CMD_TYPE_CAN_EXIT      200 /* exit the CAN */
+
+#define CPC_CMD_TYPE_INQ_ERR_COUNTER 25 /* request the CAN error counters */
+#define CPC_CMD_TYPE_CLEAR_MSG_QUEUE 8  /* clear CPC_MSG queue */
+#define CPC_CMD_TYPE_CLEAR_CMD_QUEUE 28 /* clear CPC_CMD queue */
+
+#define CPC_CC_TYPE_SJA1000 2 /* Philips basic CAN controller */
+
+#define CPC_CAN_ECODE_ERRFRAME 0x01 /* Ecode type */
+
+/* Overrun types */
+#define CPC_OVR_EVENT_CAN       0x01
+#define CPC_OVR_EVENT_CANSTATE  0x02
+#define CPC_OVR_EVENT_BUSERROR  0x04
+
+/*
+ * If the CAN controller lost a message we indicate it with the highest bit
+ * set in the count field.
+ */
+#define CPC_OVR_HW 0x80
+
+/* Size of the "struct ems_cpc_msg" without the union */
+#define CPC_MSG_HEADER_LEN   11
+#define CPC_CAN_MSG_MIN_SIZE 5
+
+/* Define these values to match your devices */
+#define USB_CPCUSB_VENDOR_ID 0x12D6
+
+#define USB_CPCUSB_ARM7_PRODUCT_ID 0x0444
+
+/* Mode register NXP LPC2119/SJA1000 CAN Controller */
+#define SJA1000_MOD_NORMAL 0x00
+#define SJA1000_MOD_RM     0x01
+
+/* ECC register NXP LPC2119/SJA1000 CAN Controller */
+#define SJA1000_ECC_SEG   0x1F
+#define SJA1000_ECC_DIR   0x20
+#define SJA1000_ECC_ERR   0x06
+#define SJA1000_ECC_BIT   0x00
+#define SJA1000_ECC_FORM  0x40
+#define SJA1000_ECC_STUFF 0x80
+#define SJA1000_ECC_MASK  0xc0
+
+/* Status register content */
+#define SJA1000_SR_BS 0x80
+#define SJA1000_SR_ES 0x40
+
+#define SJA1000_DEFAULT_OUTPUT_CONTROL 0xDA
+
+/*
+ * The device actually uses a 16MHz clock to generate the CAN clock
+ * but it expects SJA1000 bit settings based on 8MHz (is internally
+ * converted).
+ */
+#define EMS_USB_ARM7_CLOCK 8000000
+
+/*
+ * CAN-Message representation in a CPC_MSG. Message object type is
+ * CPC_MSG_TYPE_CAN_FRAME or CPC_MSG_TYPE_RTR_FRAME or
+ * CPC_MSG_TYPE_EXT_CAN_FRAME or CPC_MSG_TYPE_EXT_RTR_FRAME.
+ */
+struct cpc_can_msg {
+	u32 id;
+	u8 length;
+	u8 msg[8];
+};
+
+/* Representation of the CAN parameters for the SJA1000 controller */
+struct cpc_sja1000_params {
+	u8 mode;
+	u8 acc_code0;
+	u8 acc_code1;
+	u8 acc_code2;
+	u8 acc_code3;
+	u8 acc_mask0;
+	u8 acc_mask1;
+	u8 acc_mask2;
+	u8 acc_mask3;
+	u8 btr0;
+	u8 btr1;
+	u8 outp_contr;
+};
+
+/* CAN params message representation */
+struct cpc_can_params {
+	u8 cc_type;
+
+	/* Will support M16C CAN controller in the future */
+	union {
+		struct cpc_sja1000_params sja1000;
+	} cc_params;
+};
+
+/* Structure for confirmed message handling */
+struct cpc_confirm {
+	u8 error; /* error code */
+};
+
+/* Structure for overrun conditions */
+struct cpc_overrun {
+	u8 event;
+	u8 count;
+};
+
+/* SJA1000 CAN errors (compatible to NXP LPC2119) */
+struct cpc_sja1000_can_error {
+	u8 ecc;
+	u8 rxerr;
+	u8 txerr;
+};
+
+/* structure for CAN error conditions */
+struct cpc_can_error {
+	u8 ecode;
+
+	struct {
+		u8 cc_type;
+
+		/* Other controllers may also provide error code capture regs */
+		union {
+			struct cpc_sja1000_can_error sja1000;
+		} regs;
+	} cc;
+};
+
+/*
+ * Structure containing RX/TX error counter. This structure is used to request
+ * the values of the CAN controllers TX and RX error counter.
+ */
+struct cpc_can_err_counter {
+	u8 rx;
+	u8 tx;
+};
+
+/* Main message type used between library and application */
+struct __attribute__ ((packed)) ems_cpc_msg {
+	u8 type;	/* type of message */
+	u8 length;	/* length of data within union 'msg' */
+	u8 msgid;	/* confirmation handle */
+	u32 ts_sec;	/* timestamp in seconds */
+	u32 ts_nsec;	/* timestamp in nano seconds */
+
+	union {
+		u8 generic[64];
+		struct cpc_can_msg can_msg;
+		struct cpc_can_params can_params;
+		struct cpc_confirm confirmation;
+		struct cpc_overrun overrun;
+		struct cpc_can_error error;
+		struct cpc_can_err_counter err_counter;
+		u8 can_state;
+	} msg;
+};
+
+/*
+ * Table of devices that work with this driver
+ * NOTE: This driver supports only CPC-USB/ARM7 (LPC2119) yet.
+ */
+static struct usb_device_id ems_usb_table[] = {
+	{USB_DEVICE(USB_CPCUSB_VENDOR_ID, USB_CPCUSB_ARM7_PRODUCT_ID)},
+	{} /* Terminating entry */
+};
+
+MODULE_DEVICE_TABLE(usb, ems_usb_table);
+
+#define RX_BUFFER_SIZE      64
+#define CPC_HEADER_SIZE     4
+#define INTR_IN_BUFFER_SIZE 4
+
+#define MAX_RX_URBS 10
+#define MAX_TX_URBS CAN_ECHO_SKB_MAX
+
+struct ems_usb;
+
+struct ems_tx_urb_context {
+	struct ems_usb *dev;
+
+	u32 echo_index;
+	u8 dlc;
+};
+
+struct ems_usb {
+	struct can_priv can; /* must be the first member */
+	int open_time;
+
+	struct sk_buff *echo_skb[MAX_TX_URBS];
+
+	struct usb_device *udev;
+	struct net_device *netdev;
+
+	atomic_t active_tx_urbs;
+	struct usb_anchor tx_submitted;
+	struct ems_tx_urb_context tx_contexts[MAX_TX_URBS];
+
+	struct usb_anchor rx_submitted;
+
+	struct urb *intr_urb;
+
+	u8 *tx_msg_buffer;
+
+	u8 *intr_in_buffer;
+	unsigned int free_slots; /* remember number of available slots */
+
+	struct ems_cpc_msg active_params; /* active controller parameters */
+};
+
+static void ems_usb_read_interrupt_callback(struct urb *urb)
+{
+	struct ems_usb *dev = urb->context;
+	struct net_device *netdev = dev->netdev;
+	int err;
+
+	if (!netif_device_present(netdev))
+		return;
+
+	switch (urb->status) {
+	case 0:
+		dev->free_slots = dev->intr_in_buffer[1];
+		break;
+
+	case -ECONNRESET: /* unlink */
+	case -ENOENT:
+	case -ESHUTDOWN:
+		return;
+
+	default:
+		dev_info(netdev->dev.parent, "Rx interrupt aborted %d\n",
+			 urb->status);
+		break;
+	}
+
+	err = usb_submit_urb(urb, GFP_ATOMIC);
+
+	if (err == -ENODEV)
+		netif_device_detach(netdev);
+	else if (err)
+		dev_err(netdev->dev.parent,
+			"failed resubmitting intr urb: %d\n", err);
+
+	return;
+}
+
+static void ems_usb_rx_can_msg(struct ems_usb *dev, struct ems_cpc_msg *msg)
+{
+	struct can_frame *cf;
+	struct sk_buff *skb;
+	int i;
+	struct net_device_stats *stats = &dev->netdev->stats;
+
+	skb = netdev_alloc_skb(dev->netdev, sizeof(struct can_frame));
+	if (skb == NULL)
+		return;
+
+	skb->protocol = htons(ETH_P_CAN);
+
+	cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
+
+	cf->can_id = msg->msg.can_msg.id;
+	cf->can_dlc = min_t(u8, msg->msg.can_msg.length, 8);
+
+	if (msg->type == CPC_MSG_TYPE_EXT_CAN_FRAME
+	    || msg->type == CPC_MSG_TYPE_EXT_RTR_FRAME)
+		cf->can_id |= CAN_EFF_FLAG;
+
+	if (msg->type == CPC_MSG_TYPE_RTR_FRAME
+	    || msg->type == CPC_MSG_TYPE_EXT_RTR_FRAME) {
+		cf->can_id |= CAN_RTR_FLAG;
+	} else {
+		for (i = 0; i < cf->can_dlc; i++)
+			cf->data[i] = msg->msg.can_msg.msg[i];
+	}
+
+	netif_rx(skb);
+
+	stats->rx_packets++;
+	stats->rx_bytes += cf->can_dlc;
+}
+
+static void ems_usb_rx_err(struct ems_usb *dev, struct ems_cpc_msg *msg)
+{
+	struct can_frame *cf;
+	struct sk_buff *skb;
+	struct net_device_stats *stats = &dev->netdev->stats;
+
+	skb = netdev_alloc_skb(dev->netdev, sizeof(struct can_frame));
+	if (skb == NULL)
+		return;
+
+	skb->protocol = htons(ETH_P_CAN);
+
+	cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
+	memset(cf, 0, sizeof(struct can_frame));
+
+	cf->can_id = CAN_ERR_FLAG;
+	cf->can_dlc = CAN_ERR_DLC;
+
+	if (msg->type == CPC_MSG_TYPE_CAN_STATE) {
+		u8 state = msg->msg.can_state;
+
+		if (state & SJA1000_SR_BS) {
+			dev->can.state = CAN_STATE_BUS_OFF;
+			cf->can_id |= CAN_ERR_BUSOFF;
+
+			can_bus_off(dev->netdev);
+		} else if (state & SJA1000_SR_ES) {
+			dev->can.state = CAN_STATE_ERROR_WARNING;
+			dev->can.can_stats.error_warning++;
+		} else {
+			dev->can.state = CAN_STATE_ERROR_ACTIVE;
+			dev->can.can_stats.error_passive++;
+		}
+	} else if (msg->type == CPC_MSG_TYPE_CAN_FRAME_ERROR) {
+		u8 ecc = msg->msg.error.cc.regs.sja1000.ecc;
+		u8 txerr = msg->msg.error.cc.regs.sja1000.txerr;
+		u8 rxerr = msg->msg.error.cc.regs.sja1000.rxerr;
+
+		/* bus error interrupt */
+		dev->can.can_stats.bus_error++;
+		stats->rx_errors++;
+
+		cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR;
+
+		switch (ecc & SJA1000_ECC_MASK) {
+		case SJA1000_ECC_BIT:
+			cf->data[2] |= CAN_ERR_PROT_BIT;
+			break;
+		case SJA1000_ECC_FORM:
+			cf->data[2] |= CAN_ERR_PROT_FORM;
+			break;
+		case SJA1000_ECC_STUFF:
+			cf->data[2] |= CAN_ERR_PROT_STUFF;
+			break;
+		default:
+			cf->data[2] |= CAN_ERR_PROT_UNSPEC;
+			cf->data[3] = ecc & SJA1000_ECC_SEG;
+			break;
+		}
+
+		/* Error occured during transmission? */
+		if ((ecc & SJA1000_ECC_DIR) == 0)
+			cf->data[2] |= CAN_ERR_PROT_TX;
+
+		if (dev->can.state == CAN_STATE_ERROR_WARNING ||
+		    dev->can.state == CAN_STATE_ERROR_PASSIVE) {
+			cf->data[1] = (txerr > rxerr) ?
+			    CAN_ERR_CRTL_TX_PASSIVE : CAN_ERR_CRTL_RX_PASSIVE;
+		}
+	} else if (msg->type == CPC_MSG_TYPE_OVERRUN) {
+		cf->can_id |= CAN_ERR_CRTL;
+		cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW;
+
+		stats->rx_over_errors++;
+		stats->rx_errors++;
+	}
+
+	netif_rx(skb);
+
+	stats->rx_packets++;
+	stats->rx_bytes += cf->can_dlc;
+}
+
+/*
+ * callback for bulk IN urb
+ */
+static void ems_usb_read_bulk_callback(struct urb *urb)
+{
+	struct ems_usb *dev = urb->context;
+	struct net_device *netdev;
+	int retval;
+
+	netdev = dev->netdev;
+
+	if (!netif_device_present(netdev))
+		return;
+
+	switch (urb->status) {
+	case 0: /* success */
+		break;
+
+	case -ENOENT:
+		return;
+
+	default:
+		dev_info(netdev->dev.parent, "Rx URB aborted (%d)\n",
+			 urb->status);
+		goto resubmit_urb;
+	}
+
+	if (urb->actual_length > CPC_HEADER_SIZE) {
+		struct ems_cpc_msg *msg;
+		u8 *ibuf = urb->transfer_buffer;
+		u8 msg_count, again, start;
+
+		msg_count = ibuf[0] & ~0x80;
+		again = ibuf[0] & 0x80;
+
+		start = CPC_HEADER_SIZE;
+
+		while (msg_count) {
+			msg = (struct ems_cpc_msg *)&ibuf[start];
+
+			switch (msg->type) {
+			case CPC_MSG_TYPE_CAN_STATE:
+				/* Process CAN state changes */
+				ems_usb_rx_err(dev, msg);
+				break;
+
+			case CPC_MSG_TYPE_CAN_FRAME:
+			case CPC_MSG_TYPE_EXT_CAN_FRAME:
+			case CPC_MSG_TYPE_RTR_FRAME:
+			case CPC_MSG_TYPE_EXT_RTR_FRAME:
+				ems_usb_rx_can_msg(dev, msg);
+				break;
+
+			case CPC_MSG_TYPE_CAN_FRAME_ERROR:
+				/* Process errorframe */
+				ems_usb_rx_err(dev, msg);
+				break;
+
+			case CPC_MSG_TYPE_OVERRUN:
+				/* Message lost while receiving */
+				ems_usb_rx_err(dev, msg);
+				break;
+			}
+
+			start += CPC_MSG_HEADER_LEN + msg->length;
+			msg_count--;
+
+			if (start > urb->transfer_buffer_length) {
+				dev_err(netdev->dev.parent, "format error\n");
+				break;
+			}
+		}
+	}
+
+resubmit_urb:
+	usb_fill_bulk_urb(urb, dev->udev, usb_rcvbulkpipe(dev->udev, 2),
+			  urb->transfer_buffer, RX_BUFFER_SIZE,
+			  ems_usb_read_bulk_callback, dev);
+
+	retval = usb_submit_urb(urb, GFP_ATOMIC);
+
+	if (retval == -ENODEV)
+		netif_device_detach(netdev);
+	else if (retval)
+		dev_err(netdev->dev.parent,
+			"failed resubmitting read bulk urb: %d\n", retval);
+
+	return;
+}
+
+/*
+ * callback for bulk IN urb
+ */
+static void ems_usb_write_bulk_callback(struct urb *urb)
+{
+	struct ems_tx_urb_context *context = urb->context;
+	struct ems_usb *dev;
+	struct net_device *netdev;
+
+	BUG_ON(!context);
+
+	dev = context->dev;
+	netdev = dev->netdev;
+
+	/* free up our allocated buffer */
+	usb_buffer_free(urb->dev, urb->transfer_buffer_length,
+			urb->transfer_buffer, urb->transfer_dma);
+
+	atomic_dec(&dev->active_tx_urbs);
+
+	if (!netif_device_present(netdev))
+		return;
+
+	if (urb->status)
+		dev_info(netdev->dev.parent, "Tx URB aborted (%d)\n",
+			 urb->status);
+
+	netdev->trans_start = jiffies;
+
+	/* transmission complete interrupt */
+	netdev->stats.tx_packets++;
+	netdev->stats.tx_bytes += context->dlc;
+
+	can_get_echo_skb(netdev, context->echo_index);
+
+	/* Release context */
+	context->echo_index = MAX_TX_URBS;
+
+	if (netif_queue_stopped(netdev))
+		netif_wake_queue(netdev);
+}
+
+/*
+ * Send the given CPC command synchronously
+ */
+static int ems_usb_command_msg(struct ems_usb *dev, struct ems_cpc_msg *msg)
+{
+	int actual_length;
+
+	/* Copy payload */
+	memcpy(&dev->tx_msg_buffer[CPC_HEADER_SIZE], msg,
+	       msg->length + CPC_MSG_HEADER_LEN);
+
+	/* Clear header */
+	memset(&dev->tx_msg_buffer[0], 0, CPC_HEADER_SIZE);
+
+	return usb_bulk_msg(dev->udev, usb_sndbulkpipe(dev->udev, 2),
+			    &dev->tx_msg_buffer[0],
+			    msg->length + CPC_MSG_HEADER_LEN + CPC_HEADER_SIZE,
+			    &actual_length, 1000);
+}
+
+/*
+ * Change CAN controllers' mode register
+ */
+static int ems_usb_write_mode(struct ems_usb *dev, u8 mode)
+{
+	dev->active_params.msg.can_params.cc_params.sja1000.mode = mode;
+
+	return ems_usb_command_msg(dev, &dev->active_params);
+}
+
+/*
+ * Send a CPC_Control command to change behaviour when interface receives a CAN
+ * message, bus error or CAN state changed notifications.
+ */
+static int ems_usb_control_cmd(struct ems_usb *dev, u8 val)
+{
+	struct ems_cpc_msg cmd;
+
+	cmd.type = CPC_CMD_TYPE_CONTROL;
+	cmd.length = CPC_MSG_HEADER_LEN + 1;
+
+	cmd.msgid = 0;
+
+	cmd.msg.generic[0] = val;
+
+	return ems_usb_command_msg(dev, &cmd);
+}
+
+/*
+ * Start interface
+ */
+static int ems_usb_start(struct ems_usb *dev)
+{
+	struct net_device *netdev = dev->netdev;
+	int err, i;
+
+	dev->intr_in_buffer[0] = 0;
+	dev->free_slots = 15; /* initial size */
+
+	for (i = 0; i < MAX_RX_URBS; i++) {
+		struct urb *urb = NULL;
+		u8 *buf = NULL;
+
+		/* create a URB, and a buffer for it */
+		urb = usb_alloc_urb(0, GFP_KERNEL);
+		if (!urb) {
+			dev_err(netdev->dev.parent,
+				"No memory left for URBs\n");
+			return -ENOMEM;
+		}
+
+		buf = usb_buffer_alloc(dev->udev, RX_BUFFER_SIZE, GFP_KERNEL,
+				       &urb->transfer_dma);
+		if (!buf) {
+			dev_err(netdev->dev.parent,
+				"No memory left for USB buffer\n");
+			usb_free_urb(urb);
+			return -ENOMEM;
+		}
+
+		usb_fill_bulk_urb(urb, dev->udev, usb_rcvbulkpipe(dev->udev, 2),
+				  buf, RX_BUFFER_SIZE,
+				  ems_usb_read_bulk_callback, dev);
+		urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+		usb_anchor_urb(urb, &dev->rx_submitted);
+
+		err = usb_submit_urb(urb, GFP_KERNEL);
+		if (err) {
+			if (err == -ENODEV)
+				netif_device_detach(dev->netdev);
+
+			usb_unanchor_urb(urb);
+			usb_buffer_free(dev->udev, RX_BUFFER_SIZE, buf,
+					urb->transfer_dma);
+			break;
+		}
+
+		/* Drop reference, USB core will take care of freeing it */
+		usb_free_urb(urb);
+	}
+
+	/* Did we submit any URBs */
+	if (i == 0) {
+		dev_warn(netdev->dev.parent, "couldn't setup read URBs\n");
+		return err;
+	}
+
+	/* Warn if we've couldn't transmit all the URBs */
+	if (i < MAX_RX_URBS)
+		dev_warn(netdev->dev.parent, "rx performance may be slow\n");
+
+	/* Setup and start interrupt URB */
+	usb_fill_int_urb(dev->intr_urb, dev->udev,
+			 usb_rcvintpipe(dev->udev, 1),
+			 dev->intr_in_buffer,
+			 INTR_IN_BUFFER_SIZE,
+			 ems_usb_read_interrupt_callback, dev, 1);
+
+	err = usb_submit_urb(dev->intr_urb, GFP_KERNEL);
+	if (err) {
+		if (err == -ENODEV)
+			netif_device_detach(dev->netdev);
+
+		dev_warn(netdev->dev.parent, "intr URB submit failed: %d\n",
+			 err);
+
+		return err;
+	}
+
+	/* CPC-USB will transfer received message to host */
+	err = ems_usb_control_cmd(dev, CONTR_CAN_MESSAGE | CONTR_CONT_ON);
+	if (err)
+		goto failed;
+
+	/* CPC-USB will transfer CAN state changes to host */
+	err = ems_usb_control_cmd(dev, CONTR_CAN_STATE | CONTR_CONT_ON);
+	if (err)
+		goto failed;
+
+	/* CPC-USB will transfer bus errors to host */
+	err = ems_usb_control_cmd(dev, CONTR_BUS_ERROR | CONTR_CONT_ON);
+	if (err)
+		goto failed;
+
+	err = ems_usb_write_mode(dev, SJA1000_MOD_NORMAL);
+	if (err)
+		goto failed;
+
+	dev->can.state = CAN_STATE_ERROR_ACTIVE;
+
+	return 0;
+
+failed:
+	if (err == -ENODEV)
+		netif_device_detach(dev->netdev);
+
+	dev_warn(netdev->dev.parent, "couldn't submit control: %d\n", err);
+
+	return err;
+}
+
+static void unlink_all_urbs(struct ems_usb *dev)
+{
+	int i;
+
+	usb_unlink_urb(dev->intr_urb);
+
+	usb_kill_anchored_urbs(&dev->rx_submitted);
+
+	usb_kill_anchored_urbs(&dev->tx_submitted);
+	atomic_set(&dev->active_tx_urbs, 0);
+
+	for (i = 0; i < MAX_TX_URBS; i++)
+		dev->tx_contexts[i].echo_index = MAX_TX_URBS;
+}
+
+static int ems_usb_open(struct net_device *netdev)
+{
+	struct ems_usb *dev = netdev_priv(netdev);
+	int err;
+
+	err = ems_usb_write_mode(dev, SJA1000_MOD_RM);
+	if (err)
+		return err;
+
+	/* common open */
+	err = open_candev(netdev);
+	if (err)
+		return err;
+
+	/* finally start device */
+	err = ems_usb_start(dev);
+	if (err) {
+		if (err == -ENODEV)
+			netif_device_detach(dev->netdev);
+
+		dev_warn(netdev->dev.parent, "couldn't start device: %d\n",
+			 err);
+
+		close_candev(netdev);
+
+		return err;
+	}
+
+	dev->open_time = jiffies;
+
+	netif_start_queue(netdev);
+
+	return 0;
+}
+
+static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct ems_usb *dev = netdev_priv(netdev);
+	struct ems_tx_urb_context *context = NULL;
+	struct net_device_stats *stats = &netdev->stats;
+	struct can_frame *cf = (struct can_frame *)skb->data;
+	struct ems_cpc_msg *msg;
+	struct urb *urb;
+	u8 *buf;
+	int i, err;
+	size_t size = CPC_HEADER_SIZE + CPC_MSG_HEADER_LEN
+			+ sizeof(struct cpc_can_msg);
+
+	/* create a URB, and a buffer for it, and copy the data to the URB */
+	urb = usb_alloc_urb(0, GFP_ATOMIC);
+	if (!urb) {
+		dev_err(netdev->dev.parent, "No memory left for URBs\n");
+		goto nomem;
+	}
+
+	buf = usb_buffer_alloc(dev->udev, size, GFP_ATOMIC, &urb->transfer_dma);
+	if (!buf) {
+		dev_err(netdev->dev.parent, "No memory left for USB buffer\n");
+		usb_free_urb(urb);
+		goto nomem;
+	}
+
+	msg = (struct ems_cpc_msg *)&buf[CPC_HEADER_SIZE];
+
+	msg->msg.can_msg.id = cf->can_id & CAN_ERR_MASK;
+	msg->msg.can_msg.length = cf->can_dlc;
+
+	if (cf->can_id & CAN_RTR_FLAG) {
+		msg->type = cf->can_id & CAN_EFF_FLAG ?
+			CPC_CMD_TYPE_EXT_RTR_FRAME : CPC_CMD_TYPE_RTR_FRAME;
+
+		msg->length = CPC_CAN_MSG_MIN_SIZE;
+	} else {
+		msg->type = cf->can_id & CAN_EFF_FLAG ?
+			CPC_CMD_TYPE_EXT_CAN_FRAME : CPC_CMD_TYPE_CAN_FRAME;
+
+		for (i = 0; i < cf->can_dlc; i++)
+			msg->msg.can_msg.msg[i] = cf->data[i];
+
+		msg->length = CPC_CAN_MSG_MIN_SIZE + cf->can_dlc;
+	}
+
+	for (i = 0; i < MAX_TX_URBS; i++) {
+		if (dev->tx_contexts[i].echo_index == MAX_TX_URBS) {
+			context = &dev->tx_contexts[i];
+			break;
+		}
+	}
+
+	/*
+	 * May never happen! When this happens we'd more URBs in flight as
+	 * allowed (MAX_TX_URBS).
+	 */
+	if (!context) {
+		usb_unanchor_urb(urb);
+		usb_buffer_free(dev->udev, size, buf, urb->transfer_dma);
+
+		dev_warn(netdev->dev.parent, "couldn't find free context\n");
+
+		return NETDEV_TX_BUSY;
+	}
+
+	context->dev = dev;
+	context->echo_index = i;
+	context->dlc = cf->can_dlc;
+
+	usb_fill_bulk_urb(urb, dev->udev, usb_sndbulkpipe(dev->udev, 2), buf,
+			  size, ems_usb_write_bulk_callback, context);
+	urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+	usb_anchor_urb(urb, &dev->tx_submitted);
+
+	can_put_echo_skb(skb, netdev, context->echo_index);
+
+	atomic_inc(&dev->active_tx_urbs);
+
+	err = usb_submit_urb(urb, GFP_ATOMIC);
+	if (unlikely(err)) {
+		can_free_echo_skb(netdev, context->echo_index);
+
+		usb_unanchor_urb(urb);
+		usb_buffer_free(dev->udev, size, buf, urb->transfer_dma);
+		dev_kfree_skb(skb);
+
+		atomic_dec(&dev->active_tx_urbs);
+
+		if (err == -ENODEV) {
+			netif_device_detach(netdev);
+		} else {
+			dev_warn(netdev->dev.parent, "failed tx_urb %d\n", err);
+
+			stats->tx_dropped++;
+		}
+	} else {
+		netdev->trans_start = jiffies;
+
+		/* Slow down tx path */
+		if (atomic_read(&dev->active_tx_urbs) >= MAX_TX_URBS ||
+		    dev->free_slots < 5) {
+			netif_stop_queue(netdev);
+		}
+	}
+
+	/*
+	 * Release our reference to this URB, the USB core will eventually free
+	 * it entirely.
+	 */
+	usb_free_urb(urb);
+
+	return NETDEV_TX_OK;
+
+nomem:
+	if (skb)
+		dev_kfree_skb(skb);
+
+	stats->tx_dropped++;
+
+	return NETDEV_TX_OK;
+}
+
+static int ems_usb_close(struct net_device *netdev)
+{
+	struct ems_usb *dev = netdev_priv(netdev);
+
+	/* Stop polling */
+	unlink_all_urbs(dev);
+
+	netif_stop_queue(netdev);
+
+	/* Set CAN controller to reset mode */
+	if (ems_usb_write_mode(dev, SJA1000_MOD_RM))
+		dev_warn(netdev->dev.parent, "couldn't stop device");
+
+	close_candev(netdev);
+
+	dev->open_time = 0;
+
+	return 0;
+}
+
+static const struct net_device_ops ems_usb_netdev_ops = {
+	.ndo_open = ems_usb_open,
+	.ndo_stop = ems_usb_close,
+	.ndo_start_xmit = ems_usb_start_xmit,
+};
+
+static struct can_bittiming_const ems_usb_bittiming_const = {
+	.name = "ems_usb",
+	.tseg1_min = 1,
+	.tseg1_max = 16,
+	.tseg2_min = 1,
+	.tseg2_max = 8,
+	.sjw_max = 4,
+	.brp_min = 1,
+	.brp_max = 64,
+	.brp_inc = 1,
+};
+
+static int ems_usb_set_mode(struct net_device *netdev, enum can_mode mode)
+{
+	struct ems_usb *dev = netdev_priv(netdev);
+
+	if (!dev->open_time)
+		return -EINVAL;
+
+	switch (mode) {
+	case CAN_MODE_START:
+		if (ems_usb_write_mode(dev, SJA1000_MOD_NORMAL))
+			dev_warn(netdev->dev.parent, "couldn't start device");
+
+		if (netif_queue_stopped(netdev))
+			netif_wake_queue(netdev);
+		break;
+
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int ems_usb_set_bittiming(struct net_device *netdev)
+{
+	struct ems_usb *dev = netdev_priv(netdev);
+	struct can_bittiming *bt = &dev->can.bittiming;
+	u8 btr0, btr1;
+
+	btr0 = ((bt->brp - 1) & 0x3f) | (((bt->sjw - 1) & 0x3) << 6);
+	btr1 = ((bt->prop_seg + bt->phase_seg1 - 1) & 0xf) |
+		(((bt->phase_seg2 - 1) & 0x7) << 4);
+	if (dev->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES)
+		btr1 |= 0x80;
+
+	dev_info(netdev->dev.parent, "setting BTR0=0x%02x BTR1=0x%02x\n",
+		 btr0, btr1);
+
+	dev->active_params.msg.can_params.cc_params.sja1000.btr0 = btr0;
+	dev->active_params.msg.can_params.cc_params.sja1000.btr1 = btr1;
+
+	return ems_usb_command_msg(dev, &dev->active_params);
+}
+
+static void init_params_sja1000(struct ems_cpc_msg *msg)
+{
+	struct cpc_sja1000_params *sja1000 =
+		&msg->msg.can_params.cc_params.sja1000;
+
+	msg->type = CPC_CMD_TYPE_CAN_PARAMS;
+	msg->length = sizeof(struct cpc_can_params);
+	msg->msgid = 0;
+
+	msg->msg.can_params.cc_type = CPC_CC_TYPE_SJA1000;
+
+	/* Acceptance filter open */
+	sja1000->acc_code0 = 0x00;
+	sja1000->acc_code1 = 0x00;
+	sja1000->acc_code2 = 0x00;
+	sja1000->acc_code3 = 0x00;
+
+	/* Acceptance filter open */
+	sja1000->acc_mask0 = 0xFF;
+	sja1000->acc_mask1 = 0xFF;
+	sja1000->acc_mask2 = 0xFF;
+	sja1000->acc_mask3 = 0xFF;
+
+	sja1000->btr0 = 0;
+	sja1000->btr1 = 0;
+
+	sja1000->outp_contr = SJA1000_DEFAULT_OUTPUT_CONTROL;
+	sja1000->mode = SJA1000_MOD_RM;
+}
+
+/*
+ * probe function for new CPC-USB devices
+ */
+static int ems_usb_probe(struct usb_interface *intf,
+			 const struct usb_device_id *id)
+{
+	struct net_device *netdev;
+	struct ems_usb *dev;
+	int i, err = -ENOMEM;
+
+	netdev = alloc_candev(sizeof(struct ems_usb));
+	if (!netdev) {
+		dev_err(netdev->dev.parent, "Couldn't alloc candev\n");
+		return -ENOMEM;
+	}
+
+	dev = netdev_priv(netdev);
+
+	dev->udev = interface_to_usbdev(intf);
+	dev->netdev = netdev;
+
+	dev->can.state = CAN_STATE_STOPPED;
+	dev->can.clock.freq = EMS_USB_ARM7_CLOCK;
+	dev->can.bittiming_const = &ems_usb_bittiming_const;
+	dev->can.do_set_bittiming = ems_usb_set_bittiming;
+	dev->can.do_set_mode = ems_usb_set_mode;
+
+	netdev->flags |= IFF_ECHO; /* we support local echo */
+
+	netdev->netdev_ops = &ems_usb_netdev_ops;
+
+	netdev->flags |= IFF_ECHO; /* we support local echo */
+
+	init_usb_anchor(&dev->rx_submitted);
+
+	init_usb_anchor(&dev->tx_submitted);
+	atomic_set(&dev->active_tx_urbs, 0);
+
+	for (i = 0; i < MAX_TX_URBS; i++)
+		dev->tx_contexts[i].echo_index = MAX_TX_URBS;
+
+	dev->intr_urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!dev->intr_urb) {
+		dev_err(netdev->dev.parent, "Couldn't alloc intr URB\n");
+		goto cleanup_candev;
+	}
+
+	dev->intr_in_buffer = kzalloc(INTR_IN_BUFFER_SIZE, GFP_KERNEL);
+	if (!dev->intr_in_buffer) {
+		dev_err(netdev->dev.parent, "Couldn't alloc Intr buffer\n");
+		goto cleanup_intr_urb;
+	}
+
+	dev->tx_msg_buffer = kzalloc(CPC_HEADER_SIZE +
+				     sizeof(struct ems_cpc_msg), GFP_KERNEL);
+	if (!dev->tx_msg_buffer) {
+		dev_err(netdev->dev.parent, "Couldn't alloc Tx buffer\n");
+		goto cleanup_intr_in_buffer;
+	}
+
+	usb_set_intfdata(intf, dev);
+
+	SET_NETDEV_DEV(netdev, &intf->dev);
+
+	init_params_sja1000(&dev->active_params);
+
+	err = ems_usb_command_msg(dev, &dev->active_params);
+	if (err) {
+		dev_err(netdev->dev.parent,
+			"couldn't initialize controller: %d\n", err);
+		goto cleanup_tx_msg_buffer;
+	}
+
+	err = register_candev(netdev);
+	if (err) {
+		dev_err(netdev->dev.parent,
+			"couldn't register CAN device: %d\n", err);
+		goto cleanup_tx_msg_buffer;
+	}
+
+	return 0;
+
+cleanup_tx_msg_buffer:
+	kfree(dev->tx_msg_buffer);
+
+cleanup_intr_in_buffer:
+	kfree(dev->intr_in_buffer);
+
+cleanup_intr_urb:
+	usb_free_urb(dev->intr_urb);
+
+cleanup_candev:
+	free_candev(netdev);
+
+	return err;
+}
+
+/*
+ * called by the usb core when the device is removed from the system
+ */
+static void ems_usb_disconnect(struct usb_interface *intf)
+{
+	struct ems_usb *dev = usb_get_intfdata(intf);
+
+	usb_set_intfdata(intf, NULL);
+
+	if (dev) {
+		unregister_netdev(dev->netdev);
+		free_candev(dev->netdev);
+
+		unlink_all_urbs(dev);
+
+		usb_free_urb(dev->intr_urb);
+
+		kfree(dev->intr_in_buffer);
+	}
+}
+
+/* usb specific object needed to register this driver with the usb subsystem */
+static struct usb_driver ems_usb_driver = {
+	.name = "ems_usb",
+	.probe = ems_usb_probe,
+	.disconnect = ems_usb_disconnect,
+	.id_table = ems_usb_table,
+};
+
+static int __init ems_usb_init(void)
+{
+	int err;
+
+	printk(KERN_INFO "CPC-USB kernel driver loaded\n");
+
+	/* register this driver with the USB subsystem */
+	err = usb_register(&ems_usb_driver);
+
+	if (err) {
+		err("usb_register failed. Error number %d\n", err);
+		return err;
+	}
+
+	return 0;
+}
+
+static void __exit ems_usb_exit(void)
+{
+	/* deregister this driver with the USB subsystem */
+	usb_deregister(&ems_usb_driver);
+}
+
+module_init(ems_usb_init);
+module_exit(ems_usb_exit);
diff --git a/drivers/net/cnic.c b/drivers/net/cnic.c
index d45eacb7670..211c8e9182f 100644
--- a/drivers/net/cnic.c
+++ b/drivers/net/cnic.c
@@ -85,8 +85,6 @@ static int cnic_uio_open(struct uio_info *uinfo, struct inode *inode)
 
 	cp->uio_dev = iminor(inode);
 
-	cnic_shutdown_bnx2_rx_ring(dev);
-
 	cnic_init_bnx2_tx_ring(dev);
 	cnic_init_bnx2_rx_ring(dev);
 
@@ -98,6 +96,8 @@ static int cnic_uio_close(struct uio_info *uinfo, struct inode *inode)
 	struct cnic_dev *dev = uinfo->priv;
 	struct cnic_local *cp = dev->cnic_priv;
 
+	cnic_shutdown_bnx2_rx_ring(dev);
+
 	cp->uio_dev = -1;
 	return 0;
 }
diff --git a/drivers/net/cpmac.c b/drivers/net/cpmac.c
index 3e3fab8afb1..61f9da2b494 100644
--- a/drivers/net/cpmac.c
+++ b/drivers/net/cpmac.c
@@ -1109,7 +1109,7 @@ static int external_switch;
 static int __devinit cpmac_probe(struct platform_device *pdev)
 {
 	int rc, phy_id;
-	char mdio_bus_id[BUS_ID_SIZE];
+	char mdio_bus_id[MII_BUS_ID_SIZE];
 	struct resource *mem;
 	struct cpmac_priv *priv;
 	struct net_device *dev;
@@ -1118,7 +1118,7 @@ static int __devinit cpmac_probe(struct platform_device *pdev)
 	pdata = pdev->dev.platform_data;
 
 	if (external_switch || dumb_switch) {
-		strncpy(mdio_bus_id, "0", BUS_ID_SIZE); /* fixed phys bus */
+		strncpy(mdio_bus_id, "0", MII_BUS_ID_SIZE); /* fixed phys bus */
 		phy_id = pdev->id;
 	} else {
 		for (phy_id = 0; phy_id < PHY_MAX_ADDR; phy_id++) {
@@ -1126,7 +1126,7 @@ static int __devinit cpmac_probe(struct platform_device *pdev)
 				continue;
 			if (!cpmac_mii->phy_map[phy_id])
 				continue;
-			strncpy(mdio_bus_id, cpmac_mii->id, BUS_ID_SIZE);
+			strncpy(mdio_bus_id, cpmac_mii->id, MII_BUS_ID_SIZE);
 			break;
 		}
 	}
@@ -1167,7 +1167,7 @@ static int __devinit cpmac_probe(struct platform_device *pdev)
 	priv->msg_enable = netif_msg_init(debug_level, 0xff);
 	memcpy(dev->dev_addr, pdata->dev_addr, sizeof(dev->dev_addr));
 
-	snprintf(priv->phy_name, BUS_ID_SIZE, PHY_ID_FMT, mdio_bus_id, phy_id);
+	snprintf(priv->phy_name, MII_BUS_ID_SIZE, PHY_ID_FMT, mdio_bus_id, phy_id);
 
 	priv->phy = phy_connect(dev, priv->phy_name, &cpmac_adjust_link, 0,
 						PHY_INTERFACE_MODE_MII);
diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
index 977c3d35827..41bd7aeafd8 100644
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -3083,7 +3083,6 @@ static const struct net_device_ops ehea_netdev_ops = {
 	.ndo_poll_controller	= ehea_netpoll,
 #endif
 	.ndo_get_stats		= ehea_get_stats,
-	.ndo_change_mtu		= eth_change_mtu,
 	.ndo_set_mac_address	= ehea_set_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_multicast_list	= ehea_set_multicast_list,
diff --git a/drivers/net/igb/e1000_mac.c b/drivers/net/igb/e1000_mac.c
index a0231cd079f..7d76bb085e1 100644
--- a/drivers/net/igb/e1000_mac.c
+++ b/drivers/net/igb/e1000_mac.c
@@ -286,41 +286,6 @@ void igb_mta_set(struct e1000_hw *hw, u32 hash_value)
 }
 
 /**
- *  igb_update_mc_addr_list - Update Multicast addresses
- *  @hw: pointer to the HW structure
- *  @mc_addr_list: array of multicast addresses to program
- *  @mc_addr_count: number of multicast addresses to program
- *
- *  Updates entire Multicast Table Array.
- *  The caller must have a packed mc_addr_list of multicast addresses.
- **/
-void igb_update_mc_addr_list(struct e1000_hw *hw,
-                             u8 *mc_addr_list, u32 mc_addr_count)
-{
-	u32 hash_value, hash_bit, hash_reg;
-	int i;
-
-	/* clear mta_shadow */
-	memset(&hw->mac.mta_shadow, 0, sizeof(hw->mac.mta_shadow));
-
-	/* update mta_shadow from mc_addr_list */
-	for (i = 0; (u32) i < mc_addr_count; i++) {
-		hash_value = igb_hash_mc_addr(hw, mc_addr_list);
-
-		hash_reg = (hash_value >> 5) & (hw->mac.mta_reg_count - 1);
-		hash_bit = hash_value & 0x1F;
-
-		hw->mac.mta_shadow[hash_reg] |= (1 << hash_bit);
-		mc_addr_list += (ETH_ALEN);
-	}
-
-	/* replace the entire MTA table */
-	for (i = hw->mac.mta_reg_count - 1; i >= 0; i--)
-		array_wr32(E1000_MTA, i, hw->mac.mta_shadow[i]);
-	wrfl();
-}
-
-/**
  *  igb_hash_mc_addr - Generate a multicast hash value
  *  @hw: pointer to the HW structure
  *  @mc_addr: pointer to a multicast address
@@ -329,7 +294,7 @@ void igb_update_mc_addr_list(struct e1000_hw *hw,
  *  the multicast filter table array address and new table value.  See
  *  igb_mta_set()
  **/
-u32 igb_hash_mc_addr(struct e1000_hw *hw, u8 *mc_addr)
+static u32 igb_hash_mc_addr(struct e1000_hw *hw, u8 *mc_addr)
 {
 	u32 hash_value, hash_mask;
 	u8 bit_shift = 0;
@@ -392,6 +357,41 @@ u32 igb_hash_mc_addr(struct e1000_hw *hw, u8 *mc_addr)
 }
 
 /**
+ *  igb_update_mc_addr_list - Update Multicast addresses
+ *  @hw: pointer to the HW structure
+ *  @mc_addr_list: array of multicast addresses to program
+ *  @mc_addr_count: number of multicast addresses to program
+ *
+ *  Updates entire Multicast Table Array.
+ *  The caller must have a packed mc_addr_list of multicast addresses.
+ **/
+void igb_update_mc_addr_list(struct e1000_hw *hw,
+                             u8 *mc_addr_list, u32 mc_addr_count)
+{
+	u32 hash_value, hash_bit, hash_reg;
+	int i;
+
+	/* clear mta_shadow */
+	memset(&hw->mac.mta_shadow, 0, sizeof(hw->mac.mta_shadow));
+
+	/* update mta_shadow from mc_addr_list */
+	for (i = 0; (u32) i < mc_addr_count; i++) {
+		hash_value = igb_hash_mc_addr(hw, mc_addr_list);
+
+		hash_reg = (hash_value >> 5) & (hw->mac.mta_reg_count - 1);
+		hash_bit = hash_value & 0x1F;
+
+		hw->mac.mta_shadow[hash_reg] |= (1 << hash_bit);
+		mc_addr_list += (ETH_ALEN);
+	}
+
+	/* replace the entire MTA table */
+	for (i = hw->mac.mta_reg_count - 1; i >= 0; i--)
+		array_wr32(E1000_MTA, i, hw->mac.mta_shadow[i]);
+	wrfl();
+}
+
+/**
  *  igb_clear_hw_cntrs_base - Clear base hardware counters
  *  @hw: pointer to the HW structure
  *
diff --git a/drivers/net/igb/e1000_mac.h b/drivers/net/igb/e1000_mac.h
index 7518af8cbbf..bca17d88241 100644
--- a/drivers/net/igb/e1000_mac.h
+++ b/drivers/net/igb/e1000_mac.h
@@ -88,6 +88,5 @@ enum e1000_mng_mode {
 #define E1000_MNG_DHCP_COOKIE_STATUS_VLAN    0x2
 
 extern void e1000_init_function_pointers_82575(struct e1000_hw *hw);
-extern u32 igb_hash_mc_addr(struct e1000_hw *hw, u8 *mc_addr);
 
 #endif
diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index dd688d45e9c..385be601666 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -267,7 +267,8 @@ struct ixgbe_adapter {
 	enum ixgbe_fc_mode last_lfc_mode;
 
 	/* Interrupt Throttle Rate */
-	u32 itr_setting;
+	u32 rx_itr_setting;
+	u32 tx_itr_setting;
 	u16 eitr_low;
 	u16 eitr_high;
 
@@ -351,7 +352,8 @@ struct ixgbe_adapter {
 	struct ixgbe_hw_stats stats;
 
 	/* Interrupt Throttle Rate */
-	u32 eitr_param;
+	u32 rx_eitr_param;
+	u32 tx_eitr_param;
 
 	unsigned long state;
 	u64 tx_busy;
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index 026e94a9984..53b0a668025 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -1929,7 +1929,7 @@ static int ixgbe_get_coalesce(struct net_device *netdev,
 	ec->tx_max_coalesced_frames_irq = adapter->tx_ring[0].work_limit;
 
 	/* only valid if in constant ITR mode */
-	switch (adapter->itr_setting) {
+	switch (adapter->rx_itr_setting) {
 	case 0:
 		/* throttling disabled */
 		ec->rx_coalesce_usecs = 0;
@@ -1940,9 +1940,25 @@ static int ixgbe_get_coalesce(struct net_device *netdev,
 		break;
 	default:
 		/* fixed interrupt rate mode */
-		ec->rx_coalesce_usecs = 1000000/adapter->eitr_param;
+		ec->rx_coalesce_usecs = 1000000/adapter->rx_eitr_param;
 		break;
 	}
+
+	/* only valid if in constant ITR mode */
+	switch (adapter->tx_itr_setting) {
+	case 0:
+		/* throttling disabled */
+		ec->tx_coalesce_usecs = 0;
+		break;
+	case 1:
+		/* dynamic ITR mode */
+		ec->tx_coalesce_usecs = 1;
+		break;
+	default:
+		ec->tx_coalesce_usecs = 1000000/adapter->tx_eitr_param;
+		break;
+	}
+
 	return 0;
 }
 
@@ -1953,6 +1969,14 @@ static int ixgbe_set_coalesce(struct net_device *netdev,
 	struct ixgbe_q_vector *q_vector;
 	int i;
 
+	/*
+	 * don't accept tx specific changes if we've got mixed RxTx vectors
+	 * test and jump out here if needed before changing the rx numbers
+	 */
+	if ((1000000/ec->tx_coalesce_usecs) != adapter->tx_eitr_param &&
+	    adapter->q_vector[0]->txr_count && adapter->q_vector[0]->rxr_count)
+		return -EINVAL;
+
 	if (ec->tx_max_coalesced_frames_irq)
 		adapter->tx_ring[0].work_limit = ec->tx_max_coalesced_frames_irq;
 
@@ -1963,26 +1987,49 @@ static int ixgbe_set_coalesce(struct net_device *netdev,
 			return -EINVAL;
 
 		/* store the value in ints/second */
-		adapter->eitr_param = 1000000/ec->rx_coalesce_usecs;
+		adapter->rx_eitr_param = 1000000/ec->rx_coalesce_usecs;
 
 		/* static value of interrupt rate */
-		adapter->itr_setting = adapter->eitr_param;
+		adapter->rx_itr_setting = adapter->rx_eitr_param;
 		/* clear the lower bit as its used for dynamic state */
-		adapter->itr_setting &= ~1;
+		adapter->rx_itr_setting &= ~1;
 	} else if (ec->rx_coalesce_usecs == 1) {
 		/* 1 means dynamic mode */
-		adapter->eitr_param = 20000;
-		adapter->itr_setting = 1;
+		adapter->rx_eitr_param = 20000;
+		adapter->rx_itr_setting = 1;
 	} else {
 		/*
 		 * any other value means disable eitr, which is best
 		 * served by setting the interrupt rate very high
 		 */
 		if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)
-			adapter->eitr_param = IXGBE_MAX_RSC_INT_RATE;
+			adapter->rx_eitr_param = IXGBE_MAX_RSC_INT_RATE;
 		else
-			adapter->eitr_param = IXGBE_MAX_INT_RATE;
-		adapter->itr_setting = 0;
+			adapter->rx_eitr_param = IXGBE_MAX_INT_RATE;
+		adapter->rx_itr_setting = 0;
+	}
+
+	if (ec->tx_coalesce_usecs > 1) {
+		/* check the limits */
+		if ((1000000/ec->tx_coalesce_usecs > IXGBE_MAX_INT_RATE) ||
+		    (1000000/ec->tx_coalesce_usecs < IXGBE_MIN_INT_RATE))
+			return -EINVAL;
+
+		/* store the value in ints/second */
+		adapter->tx_eitr_param = 1000000/ec->tx_coalesce_usecs;
+
+		/* static value of interrupt rate */
+		adapter->tx_itr_setting = adapter->tx_eitr_param;
+
+		/* clear the lower bit as its used for dynamic state */
+		adapter->tx_itr_setting &= ~1;
+	} else if (ec->tx_coalesce_usecs == 1) {
+		/* 1 means dynamic mode */
+		adapter->tx_eitr_param = 10000;
+		adapter->tx_itr_setting = 1;
+	} else {
+		adapter->tx_eitr_param = IXGBE_MAX_INT_RATE;
+		adapter->tx_itr_setting = 0;
 	}
 
 	/* MSI/MSIx Interrupt Mode */
@@ -1992,17 +2039,17 @@ static int ixgbe_set_coalesce(struct net_device *netdev,
 		for (i = 0; i < num_vectors; i++) {
 			q_vector = adapter->q_vector[i];
 			if (q_vector->txr_count && !q_vector->rxr_count)
-				/* tx vector gets half the rate */
-				q_vector->eitr = (adapter->eitr_param >> 1);
+				/* tx only */
+				q_vector->eitr = adapter->tx_eitr_param;
 			else
 				/* rx only or mixed */
-				q_vector->eitr = adapter->eitr_param;
+				q_vector->eitr = adapter->rx_eitr_param;
 			ixgbe_write_eitr(q_vector);
 		}
 	/* Legacy Interrupt Mode */
 	} else {
 		q_vector = adapter->q_vector[0];
-		q_vector->eitr = adapter->eitr_param;
+		q_vector->eitr = adapter->rx_eitr_param;
 		ixgbe_write_eitr(q_vector);
 	}
 
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 59ad9590e70..c407bd9de0d 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -926,12 +926,12 @@ static void ixgbe_configure_msix(struct ixgbe_adapter *adapter)
 			                      r_idx + 1);
 		}
 
-		/* if this is a tx only vector halve the interrupt rate */
 		if (q_vector->txr_count && !q_vector->rxr_count)
-			q_vector->eitr = (adapter->eitr_param >> 1);
+			/* tx only */
+			q_vector->eitr = adapter->tx_eitr_param;
 		else if (q_vector->rxr_count)
-			/* rx only */
-			q_vector->eitr = adapter->eitr_param;
+			/* rx or mixed */
+			q_vector->eitr = adapter->rx_eitr_param;
 
 		ixgbe_write_eitr(q_vector);
 	}
@@ -1359,7 +1359,7 @@ static int ixgbe_clean_rxonly(struct napi_struct *napi, int budget)
 	/* If all Rx work done, exit the polling mode */
 	if (work_done < budget) {
 		napi_complete(napi);
-		if (adapter->itr_setting & 1)
+		if (adapter->rx_itr_setting & 1)
 			ixgbe_set_itr_msix(q_vector);
 		if (!test_bit(__IXGBE_DOWN, &adapter->state))
 			ixgbe_irq_enable_queues(adapter,
@@ -1420,7 +1420,7 @@ static int ixgbe_clean_rxtx_many(struct napi_struct *napi, int budget)
 	/* If all Rx work done, exit the polling mode */
 	if (work_done < budget) {
 		napi_complete(napi);
-		if (adapter->itr_setting & 1)
+		if (adapter->rx_itr_setting & 1)
 			ixgbe_set_itr_msix(q_vector);
 		if (!test_bit(__IXGBE_DOWN, &adapter->state))
 			ixgbe_irq_enable_queues(adapter,
@@ -1458,10 +1458,10 @@ static int ixgbe_clean_txonly(struct napi_struct *napi, int budget)
 	if (!ixgbe_clean_tx_irq(q_vector, tx_ring))
 		work_done = budget;
 
-	/* If all Rx work done, exit the polling mode */
+	/* If all Tx work done, exit the polling mode */
 	if (work_done < budget) {
 		napi_complete(napi);
-		if (adapter->itr_setting & 1)
+		if (adapter->tx_itr_setting & 1)
 			ixgbe_set_itr_msix(q_vector);
 		if (!test_bit(__IXGBE_DOWN, &adapter->state))
 			ixgbe_irq_enable_queues(adapter, ((u64)1 << q_vector->v_idx));
@@ -1848,7 +1848,7 @@ static void ixgbe_configure_msi_and_legacy(struct ixgbe_adapter *adapter)
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	IXGBE_WRITE_REG(hw, IXGBE_EITR(0),
-	                EITR_INTS_PER_SEC_TO_REG(adapter->eitr_param));
+	                EITR_INTS_PER_SEC_TO_REG(adapter->rx_eitr_param));
 
 	ixgbe_set_ivar(adapter, 0, 0, 0);
 	ixgbe_set_ivar(adapter, 1, 0, 0);
@@ -1970,6 +1970,50 @@ static u32 ixgbe_setup_mrqc(struct ixgbe_adapter *adapter)
 }
 
 /**
+ * ixgbe_configure_rscctl - enable RSC for the indicated ring
+ * @adapter:    address of board private structure
+ * @index:      index of ring to set
+ * @rx_buf_len: rx buffer length
+ **/
+static void ixgbe_configure_rscctl(struct ixgbe_adapter *adapter, int index,
+                                   int rx_buf_len)
+{
+	struct ixgbe_ring *rx_ring;
+	struct ixgbe_hw *hw = &adapter->hw;
+	int j;
+	u32 rscctrl;
+
+	rx_ring = &adapter->rx_ring[index];
+	j = rx_ring->reg_idx;
+	rscctrl = IXGBE_READ_REG(hw, IXGBE_RSCCTL(j));
+	rscctrl |= IXGBE_RSCCTL_RSCEN;
+	/*
+	 * we must limit the number of descriptors so that the
+	 * total size of max desc * buf_len is not greater
+	 * than 65535
+	 */
+	if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) {
+#if (MAX_SKB_FRAGS > 16)
+		rscctrl |= IXGBE_RSCCTL_MAXDESC_16;
+#elif (MAX_SKB_FRAGS > 8)
+		rscctrl |= IXGBE_RSCCTL_MAXDESC_8;
+#elif (MAX_SKB_FRAGS > 4)
+		rscctrl |= IXGBE_RSCCTL_MAXDESC_4;
+#else
+		rscctrl |= IXGBE_RSCCTL_MAXDESC_1;
+#endif
+	} else {
+		if (rx_buf_len < IXGBE_RXBUFFER_4096)
+			rscctrl |= IXGBE_RSCCTL_MAXDESC_16;
+		else if (rx_buf_len < IXGBE_RXBUFFER_8192)
+			rscctrl |= IXGBE_RSCCTL_MAXDESC_8;
+		else
+			rscctrl |= IXGBE_RSCCTL_MAXDESC_4;
+	}
+	IXGBE_WRITE_REG(hw, IXGBE_RSCCTL(j), rscctrl);
+}
+
+/**
  * ixgbe_configure_rx - Configure 8259x Receive Unit after Reset
  * @adapter: board private structure
  *
@@ -1990,7 +2034,6 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter)
 	u32 fctrl, hlreg0;
 	u32 reta = 0, mrqc = 0;
 	u32 rdrxctl;
-	u32 rscctrl;
 	int rx_buf_len;
 
 	/* Decide whether to use packet split mode or not */
@@ -2148,36 +2191,9 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter)
 
 	if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) {
 		/* Enable 82599 HW-RSC */
-		for (i = 0; i < adapter->num_rx_queues; i++) {
-			rx_ring = &adapter->rx_ring[i];
-			j = rx_ring->reg_idx;
-			rscctrl = IXGBE_READ_REG(hw, IXGBE_RSCCTL(j));
-			rscctrl |= IXGBE_RSCCTL_RSCEN;
-			/*
-			 * we must limit the number of descriptors so that the
-			 * total size of max desc * buf_len is not greater
-			 * than 65535
-			 */
-			if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) {
-#if (MAX_SKB_FRAGS > 16)
-				rscctrl |= IXGBE_RSCCTL_MAXDESC_16;
-#elif (MAX_SKB_FRAGS > 8)
-				rscctrl |= IXGBE_RSCCTL_MAXDESC_8;
-#elif (MAX_SKB_FRAGS > 4)
-				rscctrl |= IXGBE_RSCCTL_MAXDESC_4;
-#else
-				rscctrl |= IXGBE_RSCCTL_MAXDESC_1;
-#endif
-			} else {
-				if (rx_buf_len < IXGBE_RXBUFFER_4096)
-					rscctrl |= IXGBE_RSCCTL_MAXDESC_16;
-				else if (rx_buf_len < IXGBE_RXBUFFER_8192)
-					rscctrl |= IXGBE_RSCCTL_MAXDESC_8;
-				else
-					rscctrl |= IXGBE_RSCCTL_MAXDESC_4;
-			}
-			IXGBE_WRITE_REG(hw, IXGBE_RSCCTL(j), rscctrl);
-		}
+		for (i = 0; i < adapter->num_rx_queues; i++)
+			ixgbe_configure_rscctl(adapter, i, rx_buf_len);
+
 		/* Disable RSC for ACK packets */
 		IXGBE_WRITE_REG(hw, IXGBE_RSCDBU,
 		   (IXGBE_RSCDBU_RSCACKDIS | IXGBE_READ_REG(hw, IXGBE_RSCDBU)));
@@ -2926,6 +2942,8 @@ void ixgbe_down(struct ixgbe_adapter *adapter)
 
 	ixgbe_napi_disable_all(adapter);
 
+	clear_bit(__IXGBE_SFP_MODULE_NOT_FOUND, &adapter->state);
+	del_timer_sync(&adapter->sfp_timer);
 	del_timer_sync(&adapter->watchdog_timer);
 	cancel_work_sync(&adapter->watchdog_task);
 
@@ -2989,7 +3007,7 @@ static int ixgbe_poll(struct napi_struct *napi, int budget)
 	/* If budget not fully consumed, exit the polling mode */
 	if (work_done < budget) {
 		napi_complete(napi);
-		if (adapter->itr_setting & 1)
+		if (adapter->rx_itr_setting & 1)
 			ixgbe_set_itr(adapter);
 		if (!test_bit(__IXGBE_DOWN, &adapter->state))
 			ixgbe_irq_enable_queues(adapter, IXGBE_EIMS_RTX_QUEUE);
@@ -3599,7 +3617,10 @@ static int ixgbe_alloc_q_vectors(struct ixgbe_adapter *adapter)
 		if (!q_vector)
 			goto err_out;
 		q_vector->adapter = adapter;
-		q_vector->eitr = adapter->eitr_param;
+		if (q_vector->txr_count && !q_vector->rxr_count)
+			q_vector->eitr = adapter->tx_eitr_param;
+		else
+			q_vector->eitr = adapter->rx_eitr_param;
 		q_vector->v_idx = q_idx;
 		netif_napi_add(adapter->netdev, &q_vector->napi, (*poll), 64);
 		adapter->q_vector[q_idx] = q_vector;
@@ -3868,8 +3889,10 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
 	hw->fc.disable_fc_autoneg = false;
 
 	/* enable itr by default in dynamic mode */
-	adapter->itr_setting = 1;
-	adapter->eitr_param = 20000;
+	adapter->rx_itr_setting = 1;
+	adapter->rx_eitr_param = 20000;
+	adapter->tx_itr_setting = 1;
+	adapter->tx_eitr_param = 10000;
 
 	/* set defaults for eitr in MegaBytes */
 	adapter->eitr_low = 10;
diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c
index f7bdde111df..b5aa974827e 100644
--- a/drivers/net/netxen/netxen_nic_main.c
+++ b/drivers/net/netxen/netxen_nic_main.c
@@ -1469,6 +1469,7 @@ netxen_nic_resume(struct pci_dev *pdev)
 	}
 
 	netxen_schedule_work(adapter, netxen_fw_poll_work, FW_POLL_DELAY);
+	return 0;
 
 err_out_detach:
 	netxen_nic_detach(adapter);
@@ -1903,12 +1904,13 @@ static void netxen_tx_timeout_task(struct work_struct *work)
 
 		netif_wake_queue(adapter->netdev);
 
-		goto done;
+		clear_bit(__NX_RESETTING, &adapter->state);
 
 	} else {
+		clear_bit(__NX_RESETTING, &adapter->state);
 		if (!netxen_nic_reset_context(adapter)) {
 			adapter->netdev->trans_start = jiffies;
-			goto done;
+			return;
 		}
 
 		/* context reset failed, fall through for fw reset */
@@ -1916,8 +1918,6 @@ static void netxen_tx_timeout_task(struct work_struct *work)
 
 request_reset:
 	adapter->need_fw_reset = 1;
-done:
-	clear_bit(__NX_RESETTING, &adapter->state);
 }
 
 struct net_device_stats *netxen_nic_get_stats(struct net_device *netdev)
diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c
index 97db1c73234..474876c879c 100644
--- a/drivers/net/pcmcia/pcnet_cs.c
+++ b/drivers/net/pcmcia/pcnet_cs.c
@@ -340,12 +340,11 @@ static hw_info_t *get_hwinfo(struct pcmcia_device *link)
 	base = &virt[hw_info[i].offset & (req.Size-1)];
 	if ((readb(base+0) == hw_info[i].a0) &&
 	    (readb(base+2) == hw_info[i].a1) &&
-	    (readb(base+4) == hw_info[i].a2))
-	    break;
-    }
-    if (i < NR_INFO) {
-	for (j = 0; j < 6; j++)
-	    dev->dev_addr[j] = readb(base + (j<<1));
+	    (readb(base+4) == hw_info[i].a2)) {
+		for (j = 0; j < 6; j++)
+		    dev->dev_addr[j] = readb(base + (j<<1));
+		break;
+	}
     }
 
     iounmap(virt);
diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index 07a7e4b8f8f..cc4b2f99989 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -884,13 +884,12 @@ static int efx_wanted_rx_queues(void)
 	int count;
 	int cpu;
 
-	if (unlikely(!alloc_cpumask_var(&core_mask, GFP_KERNEL))) {
+	if (unlikely(!zalloc_cpumask_var(&core_mask, GFP_KERNEL))) {
 		printk(KERN_WARNING
 		       "sfc: RSS disabled due to allocation failure\n");
 		return 1;
 	}
 
-	cpumask_clear(core_mask);
 	count = 0;
 	for_each_online_cpu(cpu) {
 		if (!cpumask_test_cpu(cpu, core_mask)) {
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 15140f9f2e9..ef1165718dd 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -1497,7 +1497,6 @@ static int sky2_up(struct net_device *dev)
 	if (ramsize > 0) {
 		u32 rxspace;
 
-		hw->flags |= SKY2_HW_RAM_BUFFER;
 		pr_debug(PFX "%s: ram buffer %dK\n", dev->name, ramsize);
 		if (ramsize < 16)
 			rxspace = ramsize / 2;
@@ -2926,6 +2925,9 @@ static int __devinit sky2_init(struct sky2_hw *hw)
 			++hw->ports;
 	}
 
+	if (sky2_read8(hw, B2_E_0))
+		hw->flags |= SKY2_HW_RAM_BUFFER;
+
 	return 0;
 }
 
diff --git a/drivers/net/sunvnet.c b/drivers/net/sunvnet.c
index f1e5e4542c2..bc74db0d12f 100644
--- a/drivers/net/sunvnet.c
+++ b/drivers/net/sunvnet.c
@@ -1016,7 +1016,6 @@ static const struct net_device_ops vnet_ops = {
 	.ndo_open		= vnet_open,
 	.ndo_stop		= vnet_close,
 	.ndo_set_multicast_list	= vnet_set_rx_mode,
-	.ndo_change_mtu		= eth_change_mtu,
 	.ndo_set_mac_address	= vnet_set_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_tx_timeout		= vnet_tx_timeout,
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d3ee1994b02..4fdfa2ae541 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -946,8 +946,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		char *name;
 		unsigned long flags = 0;
 
-		err = -EINVAL;
-
 		if (!capable(CAP_NET_ADMIN))
 			return -EPERM;
 		err = security_tun_dev_create();
@@ -964,7 +962,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 			flags |= TUN_TAP_DEV;
 			name = "tap%d";
 		} else
-			goto failed;
+			return -EINVAL;
 
 		if (*ifr->ifr_name)
 			name = ifr->ifr_name;
diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c
index e2a39b9be96..e391ef969c2 100644
--- a/drivers/net/usb/kaweth.c
+++ b/drivers/net/usb/kaweth.c
@@ -263,6 +263,7 @@ static int kaweth_control(struct kaweth_device *kaweth,
 			  int timeout)
 {
 	struct usb_ctrlrequest *dr;
+	int retval;
 
 	dbg("kaweth_control()");
 
@@ -278,18 +279,21 @@ static int kaweth_control(struct kaweth_device *kaweth,
 		return -ENOMEM;
 	}
 
-	dr->bRequestType= requesttype;
+	dr->bRequestType = requesttype;
 	dr->bRequest = request;
 	dr->wValue = cpu_to_le16(value);
 	dr->wIndex = cpu_to_le16(index);
 	dr->wLength = cpu_to_le16(size);
 
-	return kaweth_internal_control_msg(kaweth->dev,
-					pipe,
-					dr,
-					data,
-					size,
-					timeout);
+	retval = kaweth_internal_control_msg(kaweth->dev,
+					     pipe,
+					     dr,
+					     data,
+					     size,
+					     timeout);
+
+	kfree(dr);
+	return retval;
 }
 
 /****************************************************************
diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
index 938fb3530a7..c6c922247d0 100644
--- a/drivers/net/usb/smsc95xx.c
+++ b/drivers/net/usb/smsc95xx.c
@@ -1227,7 +1227,7 @@ static const struct driver_info smsc95xx_info = {
 	.rx_fixup	= smsc95xx_rx_fixup,
 	.tx_fixup	= smsc95xx_tx_fixup,
 	.status		= smsc95xx_status,
-	.flags		= FLAG_ETHER,
+	.flags		= FLAG_ETHER | FLAG_SEND_ZLP,
 };
 
 static const struct usb_device_id products[] = {
@@ -1237,10 +1237,75 @@ static const struct usb_device_id products[] = {
 		.driver_info = (unsigned long) &smsc95xx_info,
 	},
 	{
+		/* SMSC9505 USB Ethernet Device */
+		USB_DEVICE(0x0424, 0x9505),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
+	{
+		/* SMSC9500A USB Ethernet Device */
+		USB_DEVICE(0x0424, 0x9E00),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
+	{
+		/* SMSC9505A USB Ethernet Device */
+		USB_DEVICE(0x0424, 0x9E01),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
+	{
 		/* SMSC9512/9514 USB Hub & Ethernet Device */
 		USB_DEVICE(0x0424, 0xec00),
 		.driver_info = (unsigned long) &smsc95xx_info,
 	},
+	{
+		/* SMSC9500 USB Ethernet Device (SAL10) */
+		USB_DEVICE(0x0424, 0x9900),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
+	{
+		/* SMSC9505 USB Ethernet Device (SAL10) */
+		USB_DEVICE(0x0424, 0x9901),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
+	{
+		/* SMSC9500A USB Ethernet Device (SAL10) */
+		USB_DEVICE(0x0424, 0x9902),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
+	{
+		/* SMSC9505A USB Ethernet Device (SAL10) */
+		USB_DEVICE(0x0424, 0x9903),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
+	{
+		/* SMSC9512/9514 USB Hub & Ethernet Device (SAL10) */
+		USB_DEVICE(0x0424, 0x9904),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
+	{
+		/* SMSC9500A USB Ethernet Device (HAL) */
+		USB_DEVICE(0x0424, 0x9905),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
+	{
+		/* SMSC9505A USB Ethernet Device (HAL) */
+		USB_DEVICE(0x0424, 0x9906),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
+	{
+		/* SMSC9500 USB Ethernet Device (Alternate ID) */
+		USB_DEVICE(0x0424, 0x9907),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
+	{
+		/* SMSC9500A USB Ethernet Device (Alternate ID) */
+		USB_DEVICE(0x0424, 0x9908),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
+	{
+		/* SMSC9512/9514 USB Hub & Ethernet Device (Alternate ID) */
+		USB_DEVICE(0x0424, 0x9909),
+		.driver_info = (unsigned long) &smsc95xx_info,
+	},
 	{ },		/* END */
 };
 MODULE_DEVICE_TABLE(usb, products);
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 24b36f79515..ca5ca5ae061 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1049,7 +1049,7 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
 	 * NOTE:  strictly conforming cdc-ether devices should expect
 	 * the ZLP here, but ignore the one-byte packet.
 	 */
-	if ((length % dev->maxpacket) == 0) {
+	if (!(info->flags & FLAG_SEND_ZLP) && (length % dev->maxpacket) == 0) {
 		urb->transfer_buffer_length++;
 		if (skb_tailroom(skb)) {
 			skb->data[skb->len] = 0;
diff --git a/drivers/net/wireless/arlan-proc.c b/drivers/net/wireless/arlan-proc.c
index 2ab1d59870f..a8b689635a3 100644
--- a/drivers/net/wireless/arlan-proc.c
+++ b/drivers/net/wireless/arlan-proc.c
@@ -402,7 +402,7 @@ static int arlan_setup_card_by_book(struct net_device *dev)
 
 static char arlan_drive_info[ARLAN_STR_SIZE] = "A655\n\0";
 
-static int arlan_sysctl_info(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_info(ctl_table * ctl, int write,
 		      void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int i;
@@ -629,7 +629,7 @@ final:
 	*lenp = pos;
 
 	if (!write)
-		retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+		retv = proc_dostring(ctl, write, buffer, lenp, ppos);
 	else
 	{
 		*lenp = 0;
@@ -639,7 +639,7 @@ final:
 }
 
 
-static int arlan_sysctl_info161719(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_info161719(ctl_table * ctl, int write,
 			    void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int i;
@@ -669,11 +669,11 @@ static int arlan_sysctl_info161719(ctl_table * ctl, int write, struct file *filp
 
 final:
 	*lenp = pos;
-	retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	retv = proc_dostring(ctl, write, buffer, lenp, ppos);
 	return retv;
 }
 
-static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_infotxRing(ctl_table * ctl, int write,
 			    void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int i;
@@ -698,11 +698,11 @@ static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, struct file *filp
 	SARLBNpln(u_char, txBuffer, 0x800);
 final:
 	*lenp = pos;
-	retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	retv = proc_dostring(ctl, write, buffer, lenp, ppos);
 	return retv;
 }
 
-static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_inforxRing(ctl_table * ctl, int write,
 			    void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int i;
@@ -726,11 +726,11 @@ static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, struct file *filp
 	SARLBNpln(u_char, rxBuffer, 0x800);
 final:
 	*lenp = pos;
-	retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	retv = proc_dostring(ctl, write, buffer, lenp, ppos);
 	return retv;
 }
 
-static int arlan_sysctl_info18(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_info18(ctl_table * ctl, int write,
 			void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int i;
@@ -756,7 +756,7 @@ static int arlan_sysctl_info18(ctl_table * ctl, int write, struct file *filp,
 
 final:
 	*lenp = pos;
-	retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	retv = proc_dostring(ctl, write, buffer, lenp, ppos);
 	return retv;
 }
 
@@ -766,7 +766,7 @@ final:
 
 static char conf_reset_result[200];
 
-static int arlan_configure(ctl_table * ctl, int write, struct file *filp,
+static int arlan_configure(ctl_table * ctl, int write,
 		    void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int pos = 0;
@@ -788,10 +788,10 @@ static int arlan_configure(ctl_table * ctl, int write, struct file *filp,
 		return -1;
 
 	*lenp = pos;
-	return proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	return proc_dostring(ctl, write, buffer, lenp, ppos);
 }
 
-static int arlan_sysctl_reset(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_reset(ctl_table * ctl, int write,
 		       void __user *buffer, size_t * lenp, loff_t *ppos)
 {
 	int pos = 0;
@@ -811,7 +811,7 @@ static int arlan_sysctl_reset(ctl_table * ctl, int write, struct file *filp,
 	} else
 		return -1;
 	*lenp = pos + 3;
-	return proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+	return proc_dostring(ctl, write, buffer, lenp, ppos);
 }
 
 
diff --git a/drivers/net/wireless/ath/ar9170/usb.c b/drivers/net/wireless/ath/ar9170/usb.c
index e0138ac8bf5..e974e5829e1 100644
--- a/drivers/net/wireless/ath/ar9170/usb.c
+++ b/drivers/net/wireless/ath/ar9170/usb.c
@@ -64,6 +64,8 @@ static struct usb_device_id ar9170_usb_ids[] = {
 	{ USB_DEVICE(0x0cf3, 0x9170) },
 	/* Atheros TG121N */
 	{ USB_DEVICE(0x0cf3, 0x1001) },
+	/* TP-Link TL-WN821N v2 */
+	{ USB_DEVICE(0x0cf3, 0x1002) },
 	/* Cace Airpcap NX */
 	{ USB_DEVICE(0xcace, 0x0300) },
 	/* D-Link DWA 160A */
diff --git a/drivers/net/wireless/ath/ath9k/calib.c b/drivers/net/wireless/ath/ath9k/calib.c
index 3234995e888..0ad6d0b76e9 100644
--- a/drivers/net/wireless/ath/ath9k/calib.c
+++ b/drivers/net/wireless/ath/ath9k/calib.c
@@ -609,14 +609,24 @@ void ath9k_hw_loadnf(struct ath_hw *ah, struct ath9k_channel *chan)
 		AR_PHY_CH1_EXT_CCA,
 		AR_PHY_CH2_EXT_CCA
 	};
-	u8 chainmask;
+	u8 chainmask, rx_chain_status;
 
+	rx_chain_status = REG_READ(ah, AR_PHY_RX_CHAINMASK);
 	if (AR_SREV_9285(ah))
 		chainmask = 0x9;
-	else if (AR_SREV_9280(ah) || AR_SREV_9287(ah))
-		chainmask = 0x1B;
-	else
-		chainmask = 0x3F;
+	else if (AR_SREV_9280(ah) || AR_SREV_9287(ah)) {
+		if ((rx_chain_status & 0x2) || (rx_chain_status & 0x4))
+			chainmask = 0x1B;
+		else
+			chainmask = 0x09;
+	} else {
+		if (rx_chain_status & 0x4)
+			chainmask = 0x3F;
+		else if (rx_chain_status & 0x2)
+			chainmask = 0x1B;
+		else
+			chainmask = 0x09;
+	}
 
 	h = ah->nfCalHist;
 
@@ -697,6 +707,8 @@ void ath9k_init_nfcal_hist_buffer(struct ath_hw *ah)
 		noise_floor = AR_PHY_CCA_MAX_AR9280_GOOD_VALUE;
 	else if (AR_SREV_9285(ah))
 		noise_floor = AR_PHY_CCA_MAX_AR9285_GOOD_VALUE;
+	else if (AR_SREV_9287(ah))
+		noise_floor = AR_PHY_CCA_MAX_AR9287_GOOD_VALUE;
 	else
 		noise_floor = AR_PHY_CCA_MAX_AR5416_GOOD_VALUE;
 
@@ -924,6 +936,7 @@ static inline void ath9k_hw_9285_pa_cal(struct ath_hw *ah, bool is_reset)
 		regVal |= (1 << (19 + i));
 		REG_WRITE(ah, 0x7834, regVal);
 		udelay(1);
+		regVal = REG_READ(ah, 0x7834);
 		regVal &= (~(0x1 << (19 + i)));
 		reg_field = MS(REG_READ(ah, 0x7840), AR9285_AN_RXTXBB1_SPARE9);
 		regVal |= (reg_field << (19 + i));
diff --git a/drivers/net/wireless/ath/ath9k/calib.h b/drivers/net/wireless/ath/ath9k/calib.h
index 019bcbba40e..9028ab193e4 100644
--- a/drivers/net/wireless/ath/ath9k/calib.h
+++ b/drivers/net/wireless/ath/ath9k/calib.h
@@ -28,6 +28,7 @@ extern const struct ath9k_percal_data adc_init_dc_cal;
 #define AR_PHY_CCA_MAX_AR5416_GOOD_VALUE	-85
 #define AR_PHY_CCA_MAX_AR9280_GOOD_VALUE	-112
 #define AR_PHY_CCA_MAX_AR9285_GOOD_VALUE	-118
+#define AR_PHY_CCA_MAX_AR9287_GOOD_VALUE	-118
 #define AR_PHY_CCA_MAX_HIGH_VALUE      		-62
 #define AR_PHY_CCA_MIN_BAD_VALUE       		-140
 #define AR_PHY_CCA_FILTERWINDOW_LENGTH_INIT     3
diff --git a/drivers/net/wireless/ath/ath9k/eeprom_def.c b/drivers/net/wireless/ath/ath9k/eeprom_def.c
index ae7fb5dcb26..4071fc91da0 100644
--- a/drivers/net/wireless/ath/ath9k/eeprom_def.c
+++ b/drivers/net/wireless/ath/ath9k/eeprom_def.c
@@ -509,6 +509,8 @@ static void ath9k_hw_def_set_board_values(struct ath_hw *ah,
 			REG_RMW_FIELD(ah, AR_AN_TOP1, AR_AN_TOP1_DACIPMODE,
 				      eep->baseEepHeader.dacLpMode);
 
+		udelay(100);
+
 		REG_RMW_FIELD(ah, AR_PHY_FRAME_CTL, AR_PHY_FRAME_CTL_TX_CLIP,
 			      pModal->miscBits >> 2);
 
@@ -902,7 +904,7 @@ static void ath9k_hw_set_def_power_per_rate_table(struct ath_hw *ah,
 						  u16 powerLimit)
 {
 #define REDUCE_SCALED_POWER_BY_TWO_CHAIN     6  /* 10*log10(2)*2 */
-#define REDUCE_SCALED_POWER_BY_THREE_CHAIN   10 /* 10*log10(3)*2 */
+#define REDUCE_SCALED_POWER_BY_THREE_CHAIN   9 /* 10*log10(3)*2 */
 
 	struct ath_regulatory *regulatory = ath9k_hw_regulatory(ah);
 	struct ar5416_eeprom_def *pEepData = &ah->eeprom.def;
diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c
index b6c6cca0781..ca7694caf36 100644
--- a/drivers/net/wireless/ath/ath9k/hw.c
+++ b/drivers/net/wireless/ath/ath9k/hw.c
@@ -842,7 +842,7 @@ static void ath9k_hw_init_mode_regs(struct ath_hw *ah)
 
 static void ath9k_hw_init_mode_gain_regs(struct ath_hw *ah)
 {
-	if (AR_SREV_9287_11(ah))
+	if (AR_SREV_9287_11_OR_LATER(ah))
 		INIT_INI_ARRAY(&ah->iniModesRxGain,
 		ar9287Modes_rx_gain_9287_1_1,
 		ARRAY_SIZE(ar9287Modes_rx_gain_9287_1_1), 6);
@@ -853,7 +853,7 @@ static void ath9k_hw_init_mode_gain_regs(struct ath_hw *ah)
 	else if (AR_SREV_9280_20(ah))
 		ath9k_hw_init_rxgain_ini(ah);
 
-	if (AR_SREV_9287_11(ah)) {
+	if (AR_SREV_9287_11_OR_LATER(ah)) {
 		INIT_INI_ARRAY(&ah->iniModesTxGain,
 		ar9287Modes_tx_gain_9287_1_1,
 		ARRAY_SIZE(ar9287Modes_tx_gain_9287_1_1), 6);
@@ -965,7 +965,7 @@ int ath9k_hw_init(struct ath_hw *ah)
 	ath9k_hw_init_mode_regs(ah);
 
 	if (ah->is_pciexpress)
-		ath9k_hw_configpcipowersave(ah, 0);
+		ath9k_hw_configpcipowersave(ah, 0, 0);
 	else
 		ath9k_hw_disablepcie(ah);
 
@@ -1273,6 +1273,15 @@ static void ath9k_hw_override_ini(struct ath_hw *ah,
 	 */
 	REG_SET_BIT(ah, AR_DIAG_SW, (AR_DIAG_RX_DIS | AR_DIAG_RX_ABORT));
 
+	if (AR_SREV_9280_10_OR_LATER(ah)) {
+		val = REG_READ(ah, AR_PCU_MISC_MODE2) &
+			       (~AR_PCU_MISC_MODE2_HWWAR1);
+
+		if (AR_SREV_9287_10_OR_LATER(ah))
+			val = val & (~AR_PCU_MISC_MODE2_HWWAR2);
+
+		REG_WRITE(ah, AR_PCU_MISC_MODE2, val);
+	}
 
 	if (!AR_SREV_5416_20_OR_LATER(ah) ||
 	    AR_SREV_9280_10_OR_LATER(ah))
@@ -1784,7 +1793,7 @@ static void ath9k_hw_set_regs(struct ath_hw *ah, struct ath9k_channel *chan,
 static bool ath9k_hw_chip_reset(struct ath_hw *ah,
 				struct ath9k_channel *chan)
 {
-	if (OLC_FOR_AR9280_20_LATER) {
+	if (AR_SREV_9280(ah) && ah->eep_ops->get_eeprom(ah, EEP_OL_PWRCTRL)) {
 		if (!ath9k_hw_set_reset_reg(ah, ATH9K_RESET_POWER_ON))
 			return false;
 	} else if (!ath9k_hw_set_reset_reg(ah, ATH9K_RESET_WARM))
@@ -2338,6 +2347,7 @@ int ath9k_hw_reset(struct ath_hw *ah, struct ath9k_channel *chan,
 	struct ath9k_channel *curchan = ah->curchan;
 	u32 saveDefAntenna;
 	u32 macStaId1;
+	u64 tsf = 0;
 	int i, rx_chainmask, r;
 
 	ah->extprotspacing = sc->ht_extprotspacing;
@@ -2347,7 +2357,7 @@ int ath9k_hw_reset(struct ath_hw *ah, struct ath9k_channel *chan,
 	if (!ath9k_hw_setpower(ah, ATH9K_PM_AWAKE))
 		return -EIO;
 
-	if (curchan)
+	if (curchan && !ah->chip_fullsleep)
 		ath9k_hw_getnf(ah, curchan);
 
 	if (bChannelChange &&
@@ -2356,8 +2366,8 @@ int ath9k_hw_reset(struct ath_hw *ah, struct ath9k_channel *chan,
 	    (chan->channel != ah->curchan->channel) &&
 	    ((chan->channelFlags & CHANNEL_ALL) ==
 	     (ah->curchan->channelFlags & CHANNEL_ALL)) &&
-	    (!AR_SREV_9280(ah) || (!IS_CHAN_A_5MHZ_SPACED(chan) &&
-				   !IS_CHAN_A_5MHZ_SPACED(ah->curchan)))) {
+	     !(AR_SREV_9280(ah) || IS_CHAN_A_5MHZ_SPACED(chan) ||
+	     IS_CHAN_A_5MHZ_SPACED(ah->curchan))) {
 
 		if (ath9k_hw_channel_change(ah, chan, sc->tx_chan_width)) {
 			ath9k_hw_loadnf(ah, ah->curchan);
@@ -2372,6 +2382,10 @@ int ath9k_hw_reset(struct ath_hw *ah, struct ath9k_channel *chan,
 
 	macStaId1 = REG_READ(ah, AR_STA_ID1) & AR_STA_ID1_BASE_RATE_11B;
 
+	/* For chips on which RTC reset is done, save TSF before it gets cleared */
+	if (AR_SREV_9280(ah) && ah->eep_ops->get_eeprom(ah, EEP_OL_PWRCTRL))
+		tsf = ath9k_hw_gettsf64(ah);
+
 	saveLedState = REG_READ(ah, AR_CFG_LED) &
 		(AR_CFG_LED_ASSOC_CTL | AR_CFG_LED_MODE_SEL |
 		 AR_CFG_LED_BLINK_THRESH_SEL | AR_CFG_LED_BLINK_SLOW);
@@ -2398,6 +2412,10 @@ int ath9k_hw_reset(struct ath_hw *ah, struct ath9k_channel *chan,
 		udelay(50);
 	}
 
+	/* Restore TSF */
+	if (tsf && AR_SREV_9280(ah) && ah->eep_ops->get_eeprom(ah, EEP_OL_PWRCTRL))
+		ath9k_hw_settsf64(ah, tsf);
+
 	if (AR_SREV_9280_10_OR_LATER(ah))
 		REG_SET_BIT(ah, AR_GPIO_INPUT_EN_VAL, AR_GPIO_JTAG_DISABLE);
 
@@ -3005,9 +3023,10 @@ void ath9k_ps_restore(struct ath_softc *sc)
  * Programming the SerDes must go through the same 288 bit serial shift
  * register as the other analog registers.  Hence the 9 writes.
  */
-void ath9k_hw_configpcipowersave(struct ath_hw *ah, int restore)
+void ath9k_hw_configpcipowersave(struct ath_hw *ah, int restore, int power_off)
 {
 	u8 i;
+	u32 val;
 
 	if (ah->is_pciexpress != true)
 		return;
@@ -3017,84 +3036,113 @@ void ath9k_hw_configpcipowersave(struct ath_hw *ah, int restore)
 		return;
 
 	/* Nothing to do on restore for 11N */
-	if (restore)
-		return;
+	if (!restore) {
+		if (AR_SREV_9280_20_OR_LATER(ah)) {
+			/*
+			 * AR9280 2.0 or later chips use SerDes values from the
+			 * initvals.h initialized depending on chipset during
+			 * ath9k_hw_init()
+			 */
+			for (i = 0; i < ah->iniPcieSerdes.ia_rows; i++) {
+				REG_WRITE(ah, INI_RA(&ah->iniPcieSerdes, i, 0),
+					  INI_RA(&ah->iniPcieSerdes, i, 1));
+			}
+		} else if (AR_SREV_9280(ah) &&
+			   (ah->hw_version.macRev == AR_SREV_REVISION_9280_10)) {
+			REG_WRITE(ah, AR_PCIE_SERDES, 0x9248fd00);
+			REG_WRITE(ah, AR_PCIE_SERDES, 0x24924924);
+
+			/* RX shut off when elecidle is asserted */
+			REG_WRITE(ah, AR_PCIE_SERDES, 0xa8000019);
+			REG_WRITE(ah, AR_PCIE_SERDES, 0x13160820);
+			REG_WRITE(ah, AR_PCIE_SERDES, 0xe5980560);
+
+			/* Shut off CLKREQ active in L1 */
+			if (ah->config.pcie_clock_req)
+				REG_WRITE(ah, AR_PCIE_SERDES, 0x401deffc);
+			else
+				REG_WRITE(ah, AR_PCIE_SERDES, 0x401deffd);
 
-	if (AR_SREV_9280_20_OR_LATER(ah)) {
-		/*
-		 * AR9280 2.0 or later chips use SerDes values from the
-		 * initvals.h initialized depending on chipset during
-		 * ath9k_hw_init()
-		 */
-		for (i = 0; i < ah->iniPcieSerdes.ia_rows; i++) {
-			REG_WRITE(ah, INI_RA(&ah->iniPcieSerdes, i, 0),
-				  INI_RA(&ah->iniPcieSerdes, i, 1));
-		}
-	} else if (AR_SREV_9280(ah) &&
-		   (ah->hw_version.macRev == AR_SREV_REVISION_9280_10)) {
-		REG_WRITE(ah, AR_PCIE_SERDES, 0x9248fd00);
-		REG_WRITE(ah, AR_PCIE_SERDES, 0x24924924);
+			REG_WRITE(ah, AR_PCIE_SERDES, 0x1aaabe40);
+			REG_WRITE(ah, AR_PCIE_SERDES, 0xbe105554);
+			REG_WRITE(ah, AR_PCIE_SERDES, 0x00043007);
 
-		/* RX shut off when elecidle is asserted */
-		REG_WRITE(ah, AR_PCIE_SERDES, 0xa8000019);
-		REG_WRITE(ah, AR_PCIE_SERDES, 0x13160820);
-		REG_WRITE(ah, AR_PCIE_SERDES, 0xe5980560);
+			/* Load the new settings */
+			REG_WRITE(ah, AR_PCIE_SERDES2, 0x00000000);
 
-		/* Shut off CLKREQ active in L1 */
-		if (ah->config.pcie_clock_req)
-			REG_WRITE(ah, AR_PCIE_SERDES, 0x401deffc);
-		else
-			REG_WRITE(ah, AR_PCIE_SERDES, 0x401deffd);
-
-		REG_WRITE(ah, AR_PCIE_SERDES, 0x1aaabe40);
-		REG_WRITE(ah, AR_PCIE_SERDES, 0xbe105554);
-		REG_WRITE(ah, AR_PCIE_SERDES, 0x00043007);
+		} else {
+			REG_WRITE(ah, AR_PCIE_SERDES, 0x9248fc00);
+			REG_WRITE(ah, AR_PCIE_SERDES, 0x24924924);
 
-		/* Load the new settings */
-		REG_WRITE(ah, AR_PCIE_SERDES2, 0x00000000);
+			/* RX shut off when elecidle is asserted */
+			REG_WRITE(ah, AR_PCIE_SERDES, 0x28000039);
+			REG_WRITE(ah, AR_PCIE_SERDES, 0x53160824);
+			REG_WRITE(ah, AR_PCIE_SERDES, 0xe5980579);
 
-	} else {
-		REG_WRITE(ah, AR_PCIE_SERDES, 0x9248fc00);
-		REG_WRITE(ah, AR_PCIE_SERDES, 0x24924924);
+			/*
+			 * Ignore ah->ah_config.pcie_clock_req setting for
+			 * pre-AR9280 11n
+			 */
+			REG_WRITE(ah, AR_PCIE_SERDES, 0x001defff);
 
-		/* RX shut off when elecidle is asserted */
-		REG_WRITE(ah, AR_PCIE_SERDES, 0x28000039);
-		REG_WRITE(ah, AR_PCIE_SERDES, 0x53160824);
-		REG_WRITE(ah, AR_PCIE_SERDES, 0xe5980579);
+			REG_WRITE(ah, AR_PCIE_SERDES, 0x1aaabe40);
+			REG_WRITE(ah, AR_PCIE_SERDES, 0xbe105554);
+			REG_WRITE(ah, AR_PCIE_SERDES, 0x000e3007);
 
-		/*
-		 * Ignore ah->ah_config.pcie_clock_req setting for
-		 * pre-AR9280 11n
-		 */
-		REG_WRITE(ah, AR_PCIE_SERDES, 0x001defff);
+			/* Load the new settings */
+			REG_WRITE(ah, AR_PCIE_SERDES2, 0x00000000);
+		}
 
-		REG_WRITE(ah, AR_PCIE_SERDES, 0x1aaabe40);
-		REG_WRITE(ah, AR_PCIE_SERDES, 0xbe105554);
-		REG_WRITE(ah, AR_PCIE_SERDES, 0x000e3007);
+		udelay(1000);
 
-		/* Load the new settings */
-		REG_WRITE(ah, AR_PCIE_SERDES2, 0x00000000);
-	}
+		/* set bit 19 to allow forcing of pcie core into L1 state */
+		REG_SET_BIT(ah, AR_PCIE_PM_CTRL, AR_PCIE_PM_CTRL_ENA);
 
-	udelay(1000);
+		/* Several PCIe massages to ensure proper behaviour */
+		if (ah->config.pcie_waen) {
+			val = ah->config.pcie_waen;
+			if (!power_off)
+				val &= (~AR_WA_D3_L1_DISABLE);
+		} else {
+			if (AR_SREV_9285(ah) || AR_SREV_9271(ah) ||
+			    AR_SREV_9287(ah)) {
+				val = AR9285_WA_DEFAULT;
+				if (!power_off)
+					val &= (~AR_WA_D3_L1_DISABLE);
+			} else if (AR_SREV_9280(ah)) {
+				/*
+				 * On AR9280 chips bit 22 of 0x4004 needs to be
+				 * set otherwise card may disappear.
+				 */
+				val = AR9280_WA_DEFAULT;
+				if (!power_off)
+					val &= (~AR_WA_D3_L1_DISABLE);
+			} else
+				val = AR_WA_DEFAULT;
+		}
 
-	/* set bit 19 to allow forcing of pcie core into L1 state */
-	REG_SET_BIT(ah, AR_PCIE_PM_CTRL, AR_PCIE_PM_CTRL_ENA);
+		REG_WRITE(ah, AR_WA, val);
+	}
 
-	/* Several PCIe massages to ensure proper behaviour */
-	if (ah->config.pcie_waen) {
-		REG_WRITE(ah, AR_WA, ah->config.pcie_waen);
-	} else {
-		if (AR_SREV_9285(ah) || AR_SREV_9271(ah) || AR_SREV_9287(ah))
-			REG_WRITE(ah, AR_WA, AR9285_WA_DEFAULT);
+	if (power_off) {
 		/*
-		 * On AR9280 chips bit 22 of 0x4004 needs to be set to
-		 * otherwise card may disappear.
+		 * Set PCIe workaround bits
+		 * bit 14 in WA register (disable L1) should only
+		 * be set when device enters D3 and be cleared
+		 * when device comes back to D0.
 		 */
-		else if (AR_SREV_9280(ah))
-			REG_WRITE(ah, AR_WA, AR9280_WA_DEFAULT);
-		else
-			REG_WRITE(ah, AR_WA, AR_WA_DEFAULT);
+		if (ah->config.pcie_waen) {
+			if (ah->config.pcie_waen & AR_WA_D3_L1_DISABLE)
+				REG_SET_BIT(ah, AR_WA, AR_WA_D3_L1_DISABLE);
+		} else {
+			if (((AR_SREV_9285(ah) || AR_SREV_9271(ah) ||
+			      AR_SREV_9287(ah)) &&
+			     (AR9285_WA_DEFAULT & AR_WA_D3_L1_DISABLE)) ||
+			    (AR_SREV_9280(ah) &&
+			     (AR9280_WA_DEFAULT & AR_WA_D3_L1_DISABLE))) {
+				REG_SET_BIT(ah, AR_WA, AR_WA_D3_L1_DISABLE);
+			}
+		}
 	}
 }
 
@@ -3652,15 +3700,7 @@ void ath9k_hw_fill_cap_info(struct ath_hw *ah)
 	}
 #endif
 
-	if ((ah->hw_version.macVersion == AR_SREV_VERSION_5416_PCI) ||
-	    (ah->hw_version.macVersion == AR_SREV_VERSION_5416_PCIE) ||
-	    (ah->hw_version.macVersion == AR_SREV_VERSION_9160) ||
-	    (ah->hw_version.macVersion == AR_SREV_VERSION_9100) ||
-	    (ah->hw_version.macVersion == AR_SREV_VERSION_9280) ||
-	    (ah->hw_version.macVersion == AR_SREV_VERSION_9285))
-		pCap->hw_caps &= ~ATH9K_HW_CAP_AUTOSLEEP;
-	else
-		pCap->hw_caps |= ATH9K_HW_CAP_AUTOSLEEP;
+	pCap->hw_caps &= ~ATH9K_HW_CAP_AUTOSLEEP;
 
 	if (AR_SREV_9280(ah) || AR_SREV_9285(ah))
 		pCap->hw_caps &= ~ATH9K_HW_CAP_4KB_SPLITTRANS;
diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h
index 9106a0b537d..b8923457182 100644
--- a/drivers/net/wireless/ath/ath9k/hw.h
+++ b/drivers/net/wireless/ath/ath9k/hw.h
@@ -106,7 +106,7 @@
 #define AH_TSF_WRITE_TIMEOUT        100    /* (us) */
 #define AH_TIME_QUANTUM             10
 #define AR_KEYTABLE_SIZE            128
-#define POWER_UP_TIME               200000
+#define POWER_UP_TIME               10000
 #define SPUR_RSSI_THRESH            40
 
 #define CAB_TIMEOUT_VAL             10
@@ -650,7 +650,7 @@ void ath9k_hw_set_sta_beacon_timers(struct ath_hw *ah,
 				    const struct ath9k_beacon_state *bs);
 bool ath9k_hw_setpower(struct ath_hw *ah,
 		       enum ath9k_power_mode mode);
-void ath9k_hw_configpcipowersave(struct ath_hw *ah, int restore);
+void ath9k_hw_configpcipowersave(struct ath_hw *ah, int restore, int power_off);
 
 /* Interrupt Handling */
 bool ath9k_hw_intrpend(struct ath_hw *ah);
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index 3dc7b5a13e6..52bed89063d 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1131,7 +1131,7 @@ void ath_radio_enable(struct ath_softc *sc)
 	int r;
 
 	ath9k_ps_wakeup(sc);
-	ath9k_hw_configpcipowersave(ah, 0);
+	ath9k_hw_configpcipowersave(ah, 0, 0);
 
 	if (!ah->curchan)
 		ah->curchan = ath_get_curchannel(sc, sc->hw);
@@ -1202,7 +1202,7 @@ void ath_radio_disable(struct ath_softc *sc)
 	spin_unlock_bh(&sc->sc_resetlock);
 
 	ath9k_hw_phy_disable(ah);
-	ath9k_hw_configpcipowersave(ah, 1);
+	ath9k_hw_configpcipowersave(ah, 1, 1);
 	ath9k_ps_restore(sc);
 	ath9k_hw_setpower(ah, ATH9K_PM_FULL_SLEEP);
 }
@@ -1226,11 +1226,6 @@ static void ath9k_rfkill_poll_state(struct ieee80211_hw *hw)
 	bool blocked = !!ath_is_rfkill_set(sc);
 
 	wiphy_rfkill_set_hw_state(hw->wiphy, blocked);
-
-	if (blocked)
-		ath_radio_disable(sc);
-	else
-		ath_radio_enable(sc);
 }
 
 static void ath_start_rfkill_poll(struct ath_softc *sc)
@@ -1260,6 +1255,7 @@ void ath_detach(struct ath_softc *sc)
 	DPRINTF(sc, ATH_DBG_CONFIG, "Detach ATH hw\n");
 
 	ath_deinit_leds(sc);
+	wiphy_rfkill_stop_polling(sc->hw->wiphy);
 
 	for (i = 0; i < sc->num_sec_wiphy; i++) {
 		struct ath_wiphy *aphy = sc->sec_wiphy[i];
@@ -1942,7 +1938,7 @@ static int ath9k_start(struct ieee80211_hw *hw)
 	init_channel = ath_get_curchannel(sc, hw);
 
 	/* Reset SERDES registers */
-	ath9k_hw_configpcipowersave(sc->sc_ah, 0);
+	ath9k_hw_configpcipowersave(sc->sc_ah, 0, 0);
 
 	/*
 	 * The basic interface to setting the hardware in a good
@@ -2166,11 +2162,9 @@ static void ath9k_stop(struct ieee80211_hw *hw)
 	} else
 		sc->rx.rxlink = NULL;
 
-	wiphy_rfkill_stop_polling(sc->hw->wiphy);
-
 	/* disable HAL and put h/w to sleep */
 	ath9k_hw_disable(sc->sc_ah);
-	ath9k_hw_configpcipowersave(sc->sc_ah, 1);
+	ath9k_hw_configpcipowersave(sc->sc_ah, 1, 1);
 	ath9k_hw_setpower(sc->sc_ah, ATH9K_PM_FULL_SLEEP);
 
 	sc->sc_flags |= SC_OP_INVALID;
diff --git a/drivers/net/wireless/ath/ath9k/reg.h b/drivers/net/wireless/ath/ath9k/reg.h
index e5c29eb86e8..d83b77f821e 100644
--- a/drivers/net/wireless/ath/ath9k/reg.h
+++ b/drivers/net/wireless/ath/ath9k/reg.h
@@ -676,8 +676,9 @@
 #define AR_RC_HOSTIF         0x00000100
 
 #define AR_WA                		0x4004
+#define AR_WA_D3_L1_DISABLE		(1 << 14)
 #define AR9285_WA_DEFAULT 		0x004a05cb
-#define AR9280_WA_DEFAULT           	0x0040073f
+#define AR9280_WA_DEFAULT           	0x0040073b
 #define AR_WA_DEFAULT               	0x0000073f
 
 
diff --git a/drivers/net/wireless/b43/Kconfig b/drivers/net/wireless/b43/Kconfig
index 83e38134acc..54ea61c15d8 100644
--- a/drivers/net/wireless/b43/Kconfig
+++ b/drivers/net/wireless/b43/Kconfig
@@ -61,11 +61,28 @@ config B43_PCMCIA
 
 	  If unsure, say N.
 
+config B43_SDIO
+	bool "Broadcom 43xx SDIO device support (EXPERIMENTAL)"
+	depends on B43 && SSB_SDIOHOST_POSSIBLE && EXPERIMENTAL
+	select SSB_SDIOHOST
+	---help---
+	  Broadcom 43xx device support for Soft-MAC SDIO devices.
+
+	  With this config option you can drive Soft-MAC b43 cards with a
+	  Secure Digital I/O interface.
+	  This includes the WLAN daughter card found on the Nintendo Wii
+	  video game console.
+	  Note that this does not support Broadcom 43xx Full-MAC devices.
+
+	  It's safe to select Y here, even if you don't have a B43 SDIO device.
+
+	  If unsure, say N.
+
 # Data transfers to the device via PIO
-# This is only needed on PCMCIA devices. All others can do DMA properly.
+# This is only needed on PCMCIA and SDIO devices. All others can do DMA properly.
 config B43_PIO
 	bool
-	depends on B43 && (B43_PCMCIA || B43_FORCE_PIO)
+	depends on B43 && (B43_SDIO || B43_PCMCIA || B43_FORCE_PIO)
 	select SSB_BLOCKIO
 	default y
 
diff --git a/drivers/net/wireless/b43/Makefile b/drivers/net/wireless/b43/Makefile
index da379f4b0c3..84772a2542d 100644
--- a/drivers/net/wireless/b43/Makefile
+++ b/drivers/net/wireless/b43/Makefile
@@ -16,6 +16,7 @@ b43-$(CONFIG_B43_PIO)		+= pio.o
 b43-y				+= rfkill.o
 b43-$(CONFIG_B43_LEDS)		+= leds.o
 b43-$(CONFIG_B43_PCMCIA)	+= pcmcia.o
+b43-$(CONFIG_B43_SDIO)		+= sdio.o
 b43-$(CONFIG_B43_DEBUG)		+= debugfs.o
 
 obj-$(CONFIG_B43)		+= b43.o
diff --git a/drivers/net/wireless/b43/b43.h b/drivers/net/wireless/b43/b43.h
index 09cfe68537b..fa1549a03c7 100644
--- a/drivers/net/wireless/b43/b43.h
+++ b/drivers/net/wireless/b43/b43.h
@@ -629,13 +629,6 @@ struct b43_wl {
 	 * from the mac80211 subsystem. */
 	u16 mac80211_initially_registered_queues;
 
-	/* R/W lock for data transmission.
-	 * Transmissions on 2+ queues can run concurrently, but somebody else
-	 * might sync with TX by write_lock_irqsave()'ing. */
-	rwlock_t tx_lock;
-	/* Lock for LEDs access. */
-	spinlock_t leds_lock;
-
 	/* We can only have one operating interface (802.11 core)
 	 * at a time. General information about this interface follows.
 	 */
@@ -686,6 +679,9 @@ struct b43_wl {
 	struct work_struct tx_work;
 	/* Queue of packets to be transmitted. */
 	struct sk_buff_head tx_queue;
+
+	/* The device LEDs. */
+	struct b43_leds leds;
 };
 
 /* The type of the firmware file. */
@@ -768,13 +764,10 @@ struct b43_wldev {
 	/* The device initialization status.
 	 * Use b43_status() to query. */
 	atomic_t __init_status;
-	/* Saved init status for handling suspend. */
-	int suspend_init_status;
 
 	bool bad_frames_preempt;	/* Use "Bad Frames Preemption" (default off) */
 	bool dfq_valid;		/* Directed frame queue valid (IBSS PS mode, ATIM) */
 	bool radio_hw_enable;	/* saved state of radio hardware enabled state */
-	bool suspend_in_progress;	/* TRUE, if we are in a suspend/resume cycle */
 	bool qos_enabled;		/* TRUE, if QoS is used. */
 	bool hwcrypto_enabled;		/* TRUE, if HW crypto acceleration is enabled. */
 
@@ -794,12 +787,6 @@ struct b43_wldev {
 	/* Various statistics about the physical device. */
 	struct b43_stats stats;
 
-	/* The device LEDs. */
-	struct b43_led led_tx;
-	struct b43_led led_rx;
-	struct b43_led led_assoc;
-	struct b43_led led_radio;
-
 	/* Reason code of the last interrupt. */
 	u32 irq_reason;
 	u32 dma_reason[6];
@@ -830,6 +817,10 @@ struct b43_wldev {
 	/* Debugging stuff follows. */
 #ifdef CONFIG_B43_DEBUG
 	struct b43_dfsentry *dfsentry;
+	unsigned int irq_count;
+	unsigned int irq_bit_count[32];
+	unsigned int tx_count;
+	unsigned int rx_count;
 #endif
 };
 
diff --git a/drivers/net/wireless/b43/debugfs.c b/drivers/net/wireless/b43/debugfs.c
index 8f64943e3f6..80b19a44a40 100644
--- a/drivers/net/wireless/b43/debugfs.c
+++ b/drivers/net/wireless/b43/debugfs.c
@@ -689,6 +689,7 @@ static void b43_add_dynamic_debug(struct b43_wldev *dev)
 	add_dyn_dbg("debug_lo", B43_DBG_LO, 0);
 	add_dyn_dbg("debug_firmware", B43_DBG_FIRMWARE, 0);
 	add_dyn_dbg("debug_keys", B43_DBG_KEYS, 0);
+	add_dyn_dbg("debug_verbose_stats", B43_DBG_VERBOSESTATS, 0);
 
 #undef add_dyn_dbg
 }
diff --git a/drivers/net/wireless/b43/debugfs.h b/drivers/net/wireless/b43/debugfs.h
index e47b4b488b0..822aad8842f 100644
--- a/drivers/net/wireless/b43/debugfs.h
+++ b/drivers/net/wireless/b43/debugfs.h
@@ -13,6 +13,7 @@ enum b43_dyndbg {		/* Dynamic debugging features */
 	B43_DBG_LO,
 	B43_DBG_FIRMWARE,
 	B43_DBG_KEYS,
+	B43_DBG_VERBOSESTATS,
 	__B43_NR_DYNDBG,
 };
 
diff --git a/drivers/net/wireless/b43/dma.c b/drivers/net/wireless/b43/dma.c
index a467ee260a1..8701034569f 100644
--- a/drivers/net/wireless/b43/dma.c
+++ b/drivers/net/wireless/b43/dma.c
@@ -1428,9 +1428,9 @@ void b43_dma_handle_txstatus(struct b43_wldev *dev,
 				ring->nr_failed_tx_packets++;
 			ring->nr_total_packet_tries += status->frame_count;
 #endif /* DEBUG */
-			ieee80211_tx_status_irqsafe(dev->wl->hw, meta->skb);
+			ieee80211_tx_status(dev->wl->hw, meta->skb);
 
-			/* skb is freed by ieee80211_tx_status_irqsafe() */
+			/* skb is freed by ieee80211_tx_status() */
 			meta->skb = NULL;
 		} else {
 			/* No need to call free_descriptor_buffer here, as
diff --git a/drivers/net/wireless/b43/leds.c b/drivers/net/wireless/b43/leds.c
index c8b317094c3..fbe3d4f62ce 100644
--- a/drivers/net/wireless/b43/leds.c
+++ b/drivers/net/wireless/b43/leds.c
@@ -34,57 +34,88 @@
 static void b43_led_turn_on(struct b43_wldev *dev, u8 led_index,
 			    bool activelow)
 {
-	struct b43_wl *wl = dev->wl;
-	unsigned long flags;
 	u16 ctl;
 
-	spin_lock_irqsave(&wl->leds_lock, flags);
 	ctl = b43_read16(dev, B43_MMIO_GPIO_CONTROL);
 	if (activelow)
 		ctl &= ~(1 << led_index);
 	else
 		ctl |= (1 << led_index);
 	b43_write16(dev, B43_MMIO_GPIO_CONTROL, ctl);
-	spin_unlock_irqrestore(&wl->leds_lock, flags);
 }
 
 static void b43_led_turn_off(struct b43_wldev *dev, u8 led_index,
 			     bool activelow)
 {
-	struct b43_wl *wl = dev->wl;
-	unsigned long flags;
 	u16 ctl;
 
-	spin_lock_irqsave(&wl->leds_lock, flags);
 	ctl = b43_read16(dev, B43_MMIO_GPIO_CONTROL);
 	if (activelow)
 		ctl |= (1 << led_index);
 	else
 		ctl &= ~(1 << led_index);
 	b43_write16(dev, B43_MMIO_GPIO_CONTROL, ctl);
-	spin_unlock_irqrestore(&wl->leds_lock, flags);
 }
 
-/* Callback from the LED subsystem. */
-static void b43_led_brightness_set(struct led_classdev *led_dev,
-				   enum led_brightness brightness)
+static void b43_led_update(struct b43_wldev *dev,
+			   struct b43_led *led)
 {
-	struct b43_led *led = container_of(led_dev, struct b43_led, led_dev);
-	struct b43_wldev *dev = led->dev;
 	bool radio_enabled;
+	bool turn_on;
 
-	if (unlikely(b43_status(dev) < B43_STAT_INITIALIZED))
+	if (!led->wl)
 		return;
 
-	/* Checking the radio-enabled status here is slightly racy,
-	 * but we want to avoid the locking overhead and we don't care
-	 * whether the LED has the wrong state for a second. */
 	radio_enabled = (dev->phy.radio_on && dev->radio_hw_enable);
 
-	if (brightness == LED_OFF || !radio_enabled)
-		b43_led_turn_off(dev, led->index, led->activelow);
+	/* The led->state read is racy, but we don't care. In case we raced
+	 * with the brightness_set handler, we will be called again soon
+	 * to fixup our state. */
+	if (radio_enabled)
+		turn_on = atomic_read(&led->state) != LED_OFF;
 	else
+		turn_on = 0;
+	if (turn_on == led->hw_state)
+		return;
+	led->hw_state = turn_on;
+
+	if (turn_on)
 		b43_led_turn_on(dev, led->index, led->activelow);
+	else
+		b43_led_turn_off(dev, led->index, led->activelow);
+}
+
+static void b43_leds_work(struct work_struct *work)
+{
+	struct b43_leds *leds = container_of(work, struct b43_leds, work);
+	struct b43_wl *wl = container_of(leds, struct b43_wl, leds);
+	struct b43_wldev *dev;
+
+	mutex_lock(&wl->mutex);
+	dev = wl->current_dev;
+	if (unlikely(!dev || b43_status(dev) < B43_STAT_STARTED))
+		goto out_unlock;
+
+	b43_led_update(dev, &wl->leds.led_tx);
+	b43_led_update(dev, &wl->leds.led_rx);
+	b43_led_update(dev, &wl->leds.led_radio);
+	b43_led_update(dev, &wl->leds.led_assoc);
+
+out_unlock:
+	mutex_unlock(&wl->mutex);
+}
+
+/* Callback from the LED subsystem. */
+static void b43_led_brightness_set(struct led_classdev *led_dev,
+				   enum led_brightness brightness)
+{
+	struct b43_led *led = container_of(led_dev, struct b43_led, led_dev);
+	struct b43_wl *wl = led->wl;
+
+	if (likely(!wl->leds.stop)) {
+		atomic_set(&led->state, brightness);
+		ieee80211_queue_work(wl->hw, &wl->leds.work);
+	}
 }
 
 static int b43_register_led(struct b43_wldev *dev, struct b43_led *led,
@@ -93,15 +124,15 @@ static int b43_register_led(struct b43_wldev *dev, struct b43_led *led,
 {
 	int err;
 
-	b43_led_turn_off(dev, led_index, activelow);
-	if (led->dev)
+	if (led->wl)
 		return -EEXIST;
 	if (!default_trigger)
 		return -EINVAL;
-	led->dev = dev;
+	led->wl = dev->wl;
 	led->index = led_index;
 	led->activelow = activelow;
 	strncpy(led->name, name, sizeof(led->name));
+	atomic_set(&led->state, 0);
 
 	led->led_dev.name = led->name;
 	led->led_dev.default_trigger = default_trigger;
@@ -110,19 +141,19 @@ static int b43_register_led(struct b43_wldev *dev, struct b43_led *led,
 	err = led_classdev_register(dev->dev->dev, &led->led_dev);
 	if (err) {
 		b43warn(dev->wl, "LEDs: Failed to register %s\n", name);
-		led->dev = NULL;
+		led->wl = NULL;
 		return err;
 	}
+
 	return 0;
 }
 
 static void b43_unregister_led(struct b43_led *led)
 {
-	if (!led->dev)
+	if (!led->wl)
 		return;
 	led_classdev_unregister(&led->led_dev);
-	b43_led_turn_off(led->dev, led->index, led->activelow);
-	led->dev = NULL;
+	led->wl = NULL;
 }
 
 static void b43_map_led(struct b43_wldev *dev,
@@ -137,24 +168,20 @@ static void b43_map_led(struct b43_wldev *dev,
 	 * generic LED triggers. */
 	switch (behaviour) {
 	case B43_LED_INACTIVE:
-		break;
 	case B43_LED_OFF:
-		b43_led_turn_off(dev, led_index, activelow);
-		break;
 	case B43_LED_ON:
-		b43_led_turn_on(dev, led_index, activelow);
 		break;
 	case B43_LED_ACTIVITY:
 	case B43_LED_TRANSFER:
 	case B43_LED_APTRANSFER:
 		snprintf(name, sizeof(name),
 			 "b43-%s::tx", wiphy_name(hw->wiphy));
-		b43_register_led(dev, &dev->led_tx, name,
+		b43_register_led(dev, &dev->wl->leds.led_tx, name,
 				 ieee80211_get_tx_led_name(hw),
 				 led_index, activelow);
 		snprintf(name, sizeof(name),
 			 "b43-%s::rx", wiphy_name(hw->wiphy));
-		b43_register_led(dev, &dev->led_rx, name,
+		b43_register_led(dev, &dev->wl->leds.led_rx, name,
 				 ieee80211_get_rx_led_name(hw),
 				 led_index, activelow);
 		break;
@@ -164,18 +191,15 @@ static void b43_map_led(struct b43_wldev *dev,
 	case B43_LED_MODE_BG:
 		snprintf(name, sizeof(name),
 			 "b43-%s::radio", wiphy_name(hw->wiphy));
-		b43_register_led(dev, &dev->led_radio, name,
+		b43_register_led(dev, &dev->wl->leds.led_radio, name,
 				 ieee80211_get_radio_led_name(hw),
 				 led_index, activelow);
-		/* Sync the RF-kill LED state with radio and switch states. */
-		if (dev->phy.radio_on && b43_is_hw_radio_enabled(dev))
-			b43_led_turn_on(dev, led_index, activelow);
 		break;
 	case B43_LED_WEIRD:
 	case B43_LED_ASSOC:
 		snprintf(name, sizeof(name),
 			 "b43-%s::assoc", wiphy_name(hw->wiphy));
-		b43_register_led(dev, &dev->led_assoc, name,
+		b43_register_led(dev, &dev->wl->leds.led_assoc, name,
 				 ieee80211_get_assoc_led_name(hw),
 				 led_index, activelow);
 		break;
@@ -186,58 +210,150 @@ static void b43_map_led(struct b43_wldev *dev,
 	}
 }
 
-void b43_leds_init(struct b43_wldev *dev)
+static void b43_led_get_sprominfo(struct b43_wldev *dev,
+				  unsigned int led_index,
+				  enum b43_led_behaviour *behaviour,
+				  bool *activelow)
 {
 	struct ssb_bus *bus = dev->dev->bus;
 	u8 sprom[4];
-	int i;
-	enum b43_led_behaviour behaviour;
-	bool activelow;
 
 	sprom[0] = bus->sprom.gpio0;
 	sprom[1] = bus->sprom.gpio1;
 	sprom[2] = bus->sprom.gpio2;
 	sprom[3] = bus->sprom.gpio3;
 
-	for (i = 0; i < 4; i++) {
-		if (sprom[i] == 0xFF) {
-			/* There is no LED information in the SPROM
-			 * for this LED. Hardcode it here. */
-			activelow = 0;
-			switch (i) {
-			case 0:
-				behaviour = B43_LED_ACTIVITY;
-				activelow = 1;
-				if (bus->boardinfo.vendor == PCI_VENDOR_ID_COMPAQ)
-					behaviour = B43_LED_RADIO_ALL;
-				break;
-			case 1:
-				behaviour = B43_LED_RADIO_B;
-				if (bus->boardinfo.vendor == PCI_VENDOR_ID_ASUSTEK)
-					behaviour = B43_LED_ASSOC;
-				break;
-			case 2:
-				behaviour = B43_LED_RADIO_A;
-				break;
-			case 3:
-				behaviour = B43_LED_OFF;
-				break;
-			default:
-				B43_WARN_ON(1);
-				return;
-			}
+	if (sprom[led_index] == 0xFF) {
+		/* There is no LED information in the SPROM
+		 * for this LED. Hardcode it here. */
+		*activelow = 0;
+		switch (led_index) {
+		case 0:
+			*behaviour = B43_LED_ACTIVITY;
+			*activelow = 1;
+			if (bus->boardinfo.vendor == PCI_VENDOR_ID_COMPAQ)
+				*behaviour = B43_LED_RADIO_ALL;
+			break;
+		case 1:
+			*behaviour = B43_LED_RADIO_B;
+			if (bus->boardinfo.vendor == PCI_VENDOR_ID_ASUSTEK)
+				*behaviour = B43_LED_ASSOC;
+			break;
+		case 2:
+			*behaviour = B43_LED_RADIO_A;
+			break;
+		case 3:
+			*behaviour = B43_LED_OFF;
+			break;
+		default:
+			B43_WARN_ON(1);
+			return;
+		}
+	} else {
+		*behaviour = sprom[led_index] & B43_LED_BEHAVIOUR;
+		*activelow = !!(sprom[led_index] & B43_LED_ACTIVELOW);
+	}
+}
+
+void b43_leds_init(struct b43_wldev *dev)
+{
+	struct b43_led *led;
+	unsigned int i;
+	enum b43_led_behaviour behaviour;
+	bool activelow;
+
+	/* Sync the RF-kill LED state (if we have one) with radio and switch states. */
+	led = &dev->wl->leds.led_radio;
+	if (led->wl) {
+		if (dev->phy.radio_on && b43_is_hw_radio_enabled(dev)) {
+			b43_led_turn_on(dev, led->index, led->activelow);
+			led->hw_state = 1;
+			atomic_set(&led->state, 1);
 		} else {
-			behaviour = sprom[i] & B43_LED_BEHAVIOUR;
-			activelow = !!(sprom[i] & B43_LED_ACTIVELOW);
+			b43_led_turn_off(dev, led->index, led->activelow);
+			led->hw_state = 0;
+			atomic_set(&led->state, 0);
 		}
-		b43_map_led(dev, i, behaviour, activelow);
 	}
+
+	/* Initialize TX/RX/ASSOC leds */
+	led = &dev->wl->leds.led_tx;
+	if (led->wl) {
+		b43_led_turn_off(dev, led->index, led->activelow);
+		led->hw_state = 0;
+		atomic_set(&led->state, 0);
+	}
+	led = &dev->wl->leds.led_rx;
+	if (led->wl) {
+		b43_led_turn_off(dev, led->index, led->activelow);
+		led->hw_state = 0;
+		atomic_set(&led->state, 0);
+	}
+	led = &dev->wl->leds.led_assoc;
+	if (led->wl) {
+		b43_led_turn_off(dev, led->index, led->activelow);
+		led->hw_state = 0;
+		atomic_set(&led->state, 0);
+	}
+
+	/* Initialize other LED states. */
+	for (i = 0; i < B43_MAX_NR_LEDS; i++) {
+		b43_led_get_sprominfo(dev, i, &behaviour, &activelow);
+		switch (behaviour) {
+		case B43_LED_OFF:
+			b43_led_turn_off(dev, i, activelow);
+			break;
+		case B43_LED_ON:
+			b43_led_turn_on(dev, i, activelow);
+			break;
+		default:
+			/* Leave others as-is. */
+			break;
+		}
+	}
+
+	dev->wl->leds.stop = 0;
 }
 
 void b43_leds_exit(struct b43_wldev *dev)
 {
-	b43_unregister_led(&dev->led_tx);
-	b43_unregister_led(&dev->led_rx);
-	b43_unregister_led(&dev->led_assoc);
-	b43_unregister_led(&dev->led_radio);
+	struct b43_leds *leds = &dev->wl->leds;
+
+	b43_led_turn_off(dev, leds->led_tx.index, leds->led_tx.activelow);
+	b43_led_turn_off(dev, leds->led_rx.index, leds->led_rx.activelow);
+	b43_led_turn_off(dev, leds->led_assoc.index, leds->led_assoc.activelow);
+	b43_led_turn_off(dev, leds->led_radio.index, leds->led_radio.activelow);
+}
+
+void b43_leds_stop(struct b43_wldev *dev)
+{
+	struct b43_leds *leds = &dev->wl->leds;
+
+	leds->stop = 1;
+	cancel_work_sync(&leds->work);
+}
+
+void b43_leds_register(struct b43_wldev *dev)
+{
+	unsigned int i;
+	enum b43_led_behaviour behaviour;
+	bool activelow;
+
+	INIT_WORK(&dev->wl->leds.work, b43_leds_work);
+
+	/* Register the LEDs to the LED subsystem. */
+	for (i = 0; i < B43_MAX_NR_LEDS; i++) {
+		b43_led_get_sprominfo(dev, i, &behaviour, &activelow);
+		b43_map_led(dev, i, behaviour, activelow);
+	}
+}
+
+void b43_leds_unregister(struct b43_wldev *dev)
+{
+	struct b43_leds *leds = &dev->wl->leds;
+
+	b43_unregister_led(&leds->led_tx);
+	b43_unregister_led(&leds->led_rx);
+	b43_unregister_led(&leds->led_assoc);
+	b43_unregister_led(&leds->led_radio);
 }
diff --git a/drivers/net/wireless/b43/leds.h b/drivers/net/wireless/b43/leds.h
index b8b1dd52124..9592e4c5a5f 100644
--- a/drivers/net/wireless/b43/leds.h
+++ b/drivers/net/wireless/b43/leds.h
@@ -7,12 +7,13 @@ struct b43_wldev;
 
 #include <linux/types.h>
 #include <linux/leds.h>
+#include <linux/workqueue.h>
 
 
 #define B43_LED_MAX_NAME_LEN	31
 
 struct b43_led {
-	struct b43_wldev *dev;
+	struct b43_wl *wl;
 	/* The LED class device */
 	struct led_classdev led_dev;
 	/* The index number of the LED. */
@@ -22,8 +23,24 @@ struct b43_led {
 	bool activelow;
 	/* The unique name string for this LED device. */
 	char name[B43_LED_MAX_NAME_LEN + 1];
+	/* The current status of the LED. This is updated locklessly. */
+	atomic_t state;
+	/* The active state in hardware. */
+	bool hw_state;
 };
 
+struct b43_leds {
+	struct b43_led led_tx;
+	struct b43_led led_rx;
+	struct b43_led led_radio;
+	struct b43_led led_assoc;
+
+	bool stop;
+	struct work_struct work;
+};
+
+#define B43_MAX_NR_LEDS			4
+
 #define B43_LED_BEHAVIOUR		0x7F
 #define B43_LED_ACTIVELOW		0x80
 /* LED behaviour values */
@@ -42,23 +59,35 @@ enum b43_led_behaviour {
 	B43_LED_INACTIVE,
 };
 
+void b43_leds_register(struct b43_wldev *dev);
+void b43_leds_unregister(struct b43_wldev *dev);
 void b43_leds_init(struct b43_wldev *dev);
 void b43_leds_exit(struct b43_wldev *dev);
+void b43_leds_stop(struct b43_wldev *dev);
 
 
 #else /* CONFIG_B43_LEDS */
 /* LED support disabled */
 
-struct b43_led {
+struct b43_leds {
 	/* empty */
 };
 
+static inline void b43_leds_register(struct b43_wldev *dev)
+{
+}
+static inline void b43_leds_unregister(struct b43_wldev *dev)
+{
+}
 static inline void b43_leds_init(struct b43_wldev *dev)
 {
 }
 static inline void b43_leds_exit(struct b43_wldev *dev)
 {
 }
+static inline void b43_leds_stop(struct b43_wldev *dev)
+{
+}
 #endif /* CONFIG_B43_LEDS */
 
 #endif /* B43_LEDS_H_ */
diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c
index e789792a36b..9b907a36bb8 100644
--- a/drivers/net/wireless/b43/main.c
+++ b/drivers/net/wireless/b43/main.c
@@ -8,6 +8,9 @@
   Copyright (c) 2005 Danny van Dyk <kugelfang@gentoo.org>
   Copyright (c) 2005 Andreas Jaggi <andreas.jaggi@waterwave.ch>
 
+  SDIO support
+  Copyright (c) 2009 Albert Herranz <albert_herranz@yahoo.es>
+
   Some parts of the code in this file are derived from the ipw2200
   driver  Copyright(c) 2003 - 2004 Intel Corporation.
 
@@ -53,6 +56,8 @@
 #include "xmit.h"
 #include "lo.h"
 #include "pcmcia.h"
+#include "sdio.h"
+#include <linux/mmc/sdio_func.h>
 
 MODULE_DESCRIPTION("Broadcom B43 wireless driver");
 MODULE_AUTHOR("Martin Langer");
@@ -1587,7 +1592,7 @@ static void b43_beacon_update_trigger_work(struct work_struct *work)
 	mutex_lock(&wl->mutex);
 	dev = wl->current_dev;
 	if (likely(dev && (b43_status(dev) >= B43_STAT_INITIALIZED))) {
-		if (0 /*FIXME dev->dev->bus->bustype == SSB_BUSTYPE_SDIO*/) {
+		if (dev->dev->bus->bustype == SSB_BUSTYPE_SDIO) {
 			/* wl->mutex is enough. */
 			b43_do_beacon_update_trigger_work(dev);
 			mmiowb();
@@ -1825,6 +1830,16 @@ static void b43_do_interrupt_thread(struct b43_wldev *dev)
 
 	/* Re-enable interrupts on the device by restoring the current interrupt mask. */
 	b43_write32(dev, B43_MMIO_GEN_IRQ_MASK, dev->irq_mask);
+
+#if B43_DEBUG
+	if (b43_debug(dev, B43_DBG_VERBOSESTATS)) {
+		dev->irq_count++;
+		for (i = 0; i < ARRAY_SIZE(dev->irq_bit_count); i++) {
+			if (reason & (1 << i))
+				dev->irq_bit_count[i]++;
+		}
+	}
+#endif
 }
 
 /* Interrupt thread handler. Handles device interrupts in thread context. */
@@ -1905,6 +1920,21 @@ static irqreturn_t b43_interrupt_handler(int irq, void *dev_id)
 	return ret;
 }
 
+/* SDIO interrupt handler. This runs in process context. */
+static void b43_sdio_interrupt_handler(struct b43_wldev *dev)
+{
+	struct b43_wl *wl = dev->wl;
+	irqreturn_t ret;
+
+	mutex_lock(&wl->mutex);
+
+	ret = b43_do_interrupt(dev);
+	if (ret == IRQ_WAKE_THREAD)
+		b43_do_interrupt_thread(dev);
+
+	mutex_unlock(&wl->mutex);
+}
+
 void b43_do_release_fw(struct b43_firmware_file *fw)
 {
 	release_firmware(fw->data);
@@ -2645,6 +2675,20 @@ static void b43_adjust_opmode(struct b43_wldev *dev)
 			cfp_pretbtt = 50;
 	}
 	b43_write16(dev, 0x612, cfp_pretbtt);
+
+	/* FIXME: We don't currently implement the PMQ mechanism,
+	 *        so always disable it. If we want to implement PMQ,
+	 *        we need to enable it here (clear DISCPMQ) in AP mode.
+	 */
+	if (0  /* ctl & B43_MACCTL_AP */) {
+		b43_write32(dev, B43_MMIO_MACCTL,
+			    b43_read32(dev, B43_MMIO_MACCTL)
+			    & ~B43_MACCTL_DISCPMQ);
+	} else {
+		b43_write32(dev, B43_MMIO_MACCTL,
+			    b43_read32(dev, B43_MMIO_MACCTL)
+			    | B43_MACCTL_DISCPMQ);
+	}
 }
 
 static void b43_rate_memory_write(struct b43_wldev *dev, u16 rate, int is_ofdm)
@@ -2873,6 +2917,27 @@ static void b43_periodic_every15sec(struct b43_wldev *dev)
 
 	atomic_set(&phy->txerr_cnt, B43_PHY_TX_BADNESS_LIMIT);
 	wmb();
+
+#if B43_DEBUG
+	if (b43_debug(dev, B43_DBG_VERBOSESTATS)) {
+		unsigned int i;
+
+		b43dbg(dev->wl, "Stats: %7u IRQs/sec, %7u TX/sec, %7u RX/sec\n",
+		       dev->irq_count / 15,
+		       dev->tx_count / 15,
+		       dev->rx_count / 15);
+		dev->irq_count = 0;
+		dev->tx_count = 0;
+		dev->rx_count = 0;
+		for (i = 0; i < ARRAY_SIZE(dev->irq_bit_count); i++) {
+			if (dev->irq_bit_count[i]) {
+				b43dbg(dev->wl, "Stats: %7u IRQ-%02u/sec (0x%08X)\n",
+				       dev->irq_bit_count[i] / 15, i, (1 << i));
+				dev->irq_bit_count[i] = 0;
+			}
+		}
+	}
+#endif
 }
 
 static void do_periodic_work(struct b43_wldev *dev)
@@ -3002,14 +3067,18 @@ static void b43_security_init(struct b43_wldev *dev)
 static int b43_rng_read(struct hwrng *rng, u32 *data)
 {
 	struct b43_wl *wl = (struct b43_wl *)rng->priv;
+	struct b43_wldev *dev;
+	int count = -ENODEV;
 
-	/* FIXME: We need to take wl->mutex here to make sure the device
-	 * is not going away from under our ass. However it could deadlock
-	 * with hwrng internal locking. */
-
-	*data = b43_read16(wl->current_dev, B43_MMIO_RNG);
+	mutex_lock(&wl->mutex);
+	dev = wl->current_dev;
+	if (likely(dev && b43_status(dev) >= B43_STAT_INITIALIZED)) {
+		*data = b43_read16(dev, B43_MMIO_RNG);
+		count = sizeof(u16);
+	}
+	mutex_unlock(&wl->mutex);
 
-	return (sizeof(u16));
+	return count;
 }
 #endif /* CONFIG_B43_HWRNG */
 
@@ -3068,6 +3137,9 @@ static void b43_tx_work(struct work_struct *work)
 			dev_kfree_skb(skb); /* Drop it */
 	}
 
+#if B43_DEBUG
+	dev->tx_count++;
+#endif
 	mutex_unlock(&wl->mutex);
 }
 
@@ -3820,7 +3892,7 @@ redo:
 
 	/* Disable interrupts on the device. */
 	b43_set_status(dev, B43_STAT_INITIALIZED);
-	if (0 /*FIXME dev->dev->bus->bustype == SSB_BUSTYPE_SDIO*/) {
+	if (dev->dev->bus->bustype == SSB_BUSTYPE_SDIO) {
 		/* wl->mutex is locked. That is enough. */
 		b43_write32(dev, B43_MMIO_GEN_IRQ_MASK, 0);
 		b43_read32(dev, B43_MMIO_GEN_IRQ_MASK);	/* Flush */
@@ -3830,10 +3902,15 @@ redo:
 		b43_read32(dev, B43_MMIO_GEN_IRQ_MASK);	/* Flush */
 		spin_unlock_irq(&wl->hardirq_lock);
 	}
-	/* Synchronize the interrupt handlers. Unlock to avoid deadlocks. */
+	/* Synchronize and free the interrupt handlers. Unlock to avoid deadlocks. */
 	orig_dev = dev;
 	mutex_unlock(&wl->mutex);
-	synchronize_irq(dev->dev->irq);
+	if (dev->dev->bus->bustype == SSB_BUSTYPE_SDIO) {
+		b43_sdio_free_irq(dev);
+	} else {
+		synchronize_irq(dev->dev->irq);
+		free_irq(dev->dev->irq, dev);
+	}
 	mutex_lock(&wl->mutex);
 	dev = wl->current_dev;
 	if (!dev)
@@ -3850,7 +3927,7 @@ redo:
 		dev_kfree_skb(skb_dequeue(&wl->tx_queue));
 
 	b43_mac_suspend(dev);
-	free_irq(dev->dev->irq, dev);
+	b43_leds_exit(dev);
 	b43dbg(wl, "Wireless interface stopped\n");
 
 	return dev;
@@ -3864,12 +3941,20 @@ static int b43_wireless_core_start(struct b43_wldev *dev)
 	B43_WARN_ON(b43_status(dev) != B43_STAT_INITIALIZED);
 
 	drain_txstatus_queue(dev);
-	err = request_threaded_irq(dev->dev->irq, b43_interrupt_handler,
-				   b43_interrupt_thread_handler,
-				   IRQF_SHARED, KBUILD_MODNAME, dev);
-	if (err) {
-		b43err(dev->wl, "Cannot request IRQ-%d\n", dev->dev->irq);
-		goto out;
+	if (dev->dev->bus->bustype == SSB_BUSTYPE_SDIO) {
+		err = b43_sdio_request_irq(dev, b43_sdio_interrupt_handler);
+		if (err) {
+			b43err(dev->wl, "Cannot request SDIO IRQ\n");
+			goto out;
+		}
+	} else {
+		err = request_threaded_irq(dev->dev->irq, b43_interrupt_handler,
+					   b43_interrupt_thread_handler,
+					   IRQF_SHARED, KBUILD_MODNAME, dev);
+		if (err) {
+			b43err(dev->wl, "Cannot request IRQ-%d\n", dev->dev->irq);
+			goto out;
+		}
 	}
 
 	/* We are ready to run. */
@@ -3882,8 +3967,10 @@ static int b43_wireless_core_start(struct b43_wldev *dev)
 	/* Start maintainance work */
 	b43_periodic_tasks_setup(dev);
 
+	b43_leds_init(dev);
+
 	b43dbg(dev->wl, "Wireless interface started\n");
-      out:
+out:
 	return err;
 }
 
@@ -4160,10 +4247,6 @@ static void b43_wireless_core_exit(struct b43_wldev *dev)
 	macctl |= B43_MACCTL_PSM_JMP0;
 	b43_write32(dev, B43_MMIO_MACCTL, macctl);
 
-	if (!dev->suspend_in_progress) {
-		b43_leds_exit(dev);
-		b43_rng_exit(dev->wl);
-	}
 	b43_dma_free(dev);
 	b43_pio_free(dev);
 	b43_chip_exit(dev);
@@ -4180,7 +4263,6 @@ static void b43_wireless_core_exit(struct b43_wldev *dev)
 /* Initialize a wireless core */
 static int b43_wireless_core_init(struct b43_wldev *dev)
 {
-	struct b43_wl *wl = dev->wl;
 	struct ssb_bus *bus = dev->dev->bus;
 	struct ssb_sprom *sprom = &bus->sprom;
 	struct b43_phy *phy = &dev->phy;
@@ -4264,7 +4346,9 @@ static int b43_wireless_core_init(struct b43_wldev *dev)
 	/* Maximum Contention Window */
 	b43_shm_write16(dev, B43_SHM_SCRATCH, B43_SHM_SC_MAXCONT, 0x3FF);
 
-	if ((dev->dev->bus->bustype == SSB_BUSTYPE_PCMCIA) || B43_FORCE_PIO) {
+	if ((dev->dev->bus->bustype == SSB_BUSTYPE_PCMCIA) ||
+	    (dev->dev->bus->bustype == SSB_BUSTYPE_SDIO) ||
+	    B43_FORCE_PIO) {
 		dev->__using_pio_transfers = 1;
 		err = b43_pio_init(dev);
 	} else {
@@ -4280,15 +4364,13 @@ static int b43_wireless_core_init(struct b43_wldev *dev)
 	ssb_bus_powerup(bus, !(sprom->boardflags_lo & B43_BFL_XTAL_NOSLOW));
 	b43_upload_card_macaddress(dev);
 	b43_security_init(dev);
-	if (!dev->suspend_in_progress)
-		b43_rng_init(wl);
+
+	ieee80211_wake_queues(dev->wl->hw);
 
 	ieee80211_wake_queues(dev->wl->hw);
 
 	b43_set_status(dev, B43_STAT_INITIALIZED);
 
-	if (!dev->suspend_in_progress)
-		b43_leds_init(dev);
 out:
 	return err;
 
@@ -4837,7 +4919,6 @@ static int b43_wireless_init(struct ssb_device *dev)
 
 	/* Initialize struct b43_wl */
 	wl->hw = hw;
-	spin_lock_init(&wl->leds_lock);
 	mutex_init(&wl->mutex);
 	spin_lock_init(&wl->hardirq_lock);
 	INIT_LIST_HEAD(&wl->devlist);
@@ -4878,6 +4959,8 @@ static int b43_probe(struct ssb_device *dev, const struct ssb_device_id *id)
 		err = ieee80211_register_hw(wl->hw);
 		if (err)
 			goto err_one_core_detach;
+		b43_leds_register(wl->current_dev);
+		b43_rng_init(wl);
 	}
 
       out:
@@ -4906,12 +4989,15 @@ static void b43_remove(struct ssb_device *dev)
 		 * might have modified it. Restoring is important, so the networking
 		 * stack can properly free resources. */
 		wl->hw->queues = wl->mac80211_initially_registered_queues;
+		b43_leds_stop(wldev);
 		ieee80211_unregister_hw(wl->hw);
 	}
 
 	b43_one_core_detach(dev);
 
 	if (list_empty(&wl->devlist)) {
+		b43_rng_exit(wl);
+		b43_leds_unregister(wldev);
 		/* Last core on the chip unregistered.
 		 * We can destroy common struct b43_wl.
 		 */
@@ -4929,80 +5015,17 @@ void b43_controller_restart(struct b43_wldev *dev, const char *reason)
 	ieee80211_queue_work(dev->wl->hw, &dev->restart_work);
 }
 
-#ifdef CONFIG_PM
-
-static int b43_suspend(struct ssb_device *dev, pm_message_t state)
-{
-	struct b43_wldev *wldev = ssb_get_drvdata(dev);
-	struct b43_wl *wl = wldev->wl;
-
-	b43dbg(wl, "Suspending...\n");
-
-	mutex_lock(&wl->mutex);
-	wldev->suspend_in_progress = true;
-	wldev->suspend_init_status = b43_status(wldev);
-	if (wldev->suspend_init_status >= B43_STAT_STARTED)
-		wldev = b43_wireless_core_stop(wldev);
-	if (wldev && wldev->suspend_init_status >= B43_STAT_INITIALIZED)
-		b43_wireless_core_exit(wldev);
-	mutex_unlock(&wl->mutex);
-
-	b43dbg(wl, "Device suspended.\n");
-
-	return 0;
-}
-
-static int b43_resume(struct ssb_device *dev)
-{
-	struct b43_wldev *wldev = ssb_get_drvdata(dev);
-	struct b43_wl *wl = wldev->wl;
-	int err = 0;
-
-	b43dbg(wl, "Resuming...\n");
-
-	mutex_lock(&wl->mutex);
-	if (wldev->suspend_init_status >= B43_STAT_INITIALIZED) {
-		err = b43_wireless_core_init(wldev);
-		if (err) {
-			b43err(wl, "Resume failed at core init\n");
-			goto out;
-		}
-	}
-	if (wldev->suspend_init_status >= B43_STAT_STARTED) {
-		err = b43_wireless_core_start(wldev);
-		if (err) {
-			b43_leds_exit(wldev);
-			b43_rng_exit(wldev->wl);
-			b43_wireless_core_exit(wldev);
-			b43err(wl, "Resume failed at core start\n");
-			goto out;
-		}
-	}
-	b43dbg(wl, "Device resumed.\n");
- out:
-	wldev->suspend_in_progress = false;
-	mutex_unlock(&wl->mutex);
-	return err;
-}
-
-#else /* CONFIG_PM */
-# define b43_suspend	NULL
-# define b43_resume	NULL
-#endif /* CONFIG_PM */
-
 static struct ssb_driver b43_ssb_driver = {
 	.name		= KBUILD_MODNAME,
 	.id_table	= b43_ssb_tbl,
 	.probe		= b43_probe,
 	.remove		= b43_remove,
-	.suspend	= b43_suspend,
-	.resume		= b43_resume,
 };
 
 static void b43_print_driverinfo(void)
 {
 	const char *feat_pci = "", *feat_pcmcia = "", *feat_nphy = "",
-		   *feat_leds = "";
+		   *feat_leds = "", *feat_sdio = "";
 
 #ifdef CONFIG_B43_PCI_AUTOSELECT
 	feat_pci = "P";
@@ -5016,11 +5039,14 @@ static void b43_print_driverinfo(void)
 #ifdef CONFIG_B43_LEDS
 	feat_leds = "L";
 #endif
+#ifdef CONFIG_B43_SDIO
+	feat_sdio = "S";
+#endif
 	printk(KERN_INFO "Broadcom 43xx driver loaded "
-	       "[ Features: %s%s%s%s, Firmware-ID: "
+	       "[ Features: %s%s%s%s%s, Firmware-ID: "
 	       B43_SUPPORTED_FIRMWARE_ID " ]\n",
 	       feat_pci, feat_pcmcia, feat_nphy,
-	       feat_leds);
+	       feat_leds, feat_sdio);
 }
 
 static int __init b43_init(void)
@@ -5031,13 +5057,18 @@ static int __init b43_init(void)
 	err = b43_pcmcia_init();
 	if (err)
 		goto err_dfs_exit;
-	err = ssb_driver_register(&b43_ssb_driver);
+	err = b43_sdio_init();
 	if (err)
 		goto err_pcmcia_exit;
+	err = ssb_driver_register(&b43_ssb_driver);
+	if (err)
+		goto err_sdio_exit;
 	b43_print_driverinfo();
 
 	return err;
 
+err_sdio_exit:
+	b43_sdio_exit();
 err_pcmcia_exit:
 	b43_pcmcia_exit();
 err_dfs_exit:
@@ -5048,6 +5079,7 @@ err_dfs_exit:
 static void __exit b43_exit(void)
 {
 	ssb_driver_unregister(&b43_ssb_driver);
+	b43_sdio_exit();
 	b43_pcmcia_exit();
 	b43_debugfs_exit();
 }
diff --git a/drivers/net/wireless/b43/phy_lp.c b/drivers/net/wireless/b43/phy_lp.c
index 3e02d969f68..1e318d815a5 100644
--- a/drivers/net/wireless/b43/phy_lp.c
+++ b/drivers/net/wireless/b43/phy_lp.c
@@ -2228,6 +2228,16 @@ static enum b43_txpwr_result b43_lpphy_op_recalc_txpower(struct b43_wldev *dev,
 	return B43_TXPWR_RES_DONE;
 }
 
+void b43_lpphy_op_switch_analog(struct b43_wldev *dev, bool on)
+{
+       if (on) {
+               b43_phy_mask(dev, B43_LPPHY_AFE_CTL_OVR, 0xfff8);
+       } else {
+               b43_phy_set(dev, B43_LPPHY_AFE_CTL_OVRVAL, 0x0007);
+               b43_phy_set(dev, B43_LPPHY_AFE_CTL_OVR, 0x0007);
+       }
+}
+
 const struct b43_phy_operations b43_phyops_lp = {
 	.allocate		= b43_lpphy_op_allocate,
 	.free			= b43_lpphy_op_free,
@@ -2239,7 +2249,7 @@ const struct b43_phy_operations b43_phyops_lp = {
 	.radio_read		= b43_lpphy_op_radio_read,
 	.radio_write		= b43_lpphy_op_radio_write,
 	.software_rfkill	= b43_lpphy_op_software_rfkill,
-	.switch_analog		= b43_phyop_switch_analog_generic,
+	.switch_analog		= b43_lpphy_op_switch_analog,
 	.switch_channel		= b43_lpphy_op_switch_channel,
 	.get_default_chan	= b43_lpphy_op_get_default_chan,
 	.set_rx_antenna		= b43_lpphy_op_set_rx_antenna,
diff --git a/drivers/net/wireless/b43/pio.c b/drivers/net/wireless/b43/pio.c
index 3498b68385e..e96091b3149 100644
--- a/drivers/net/wireless/b43/pio.c
+++ b/drivers/net/wireless/b43/pio.c
@@ -574,7 +574,7 @@ void b43_pio_handle_txstatus(struct b43_wldev *dev,
 	q->buffer_used -= total_len;
 	q->free_packet_slots += 1;
 
-	ieee80211_tx_status_irqsafe(dev->wl->hw, pack->skb);
+	ieee80211_tx_status(dev->wl->hw, pack->skb);
 	pack->skb = NULL;
 	list_add(&pack->list, &q->packets_list);
 
diff --git a/drivers/net/wireless/b43/rfkill.c b/drivers/net/wireless/b43/rfkill.c
index 31e55999893..7a3218c5ba7 100644
--- a/drivers/net/wireless/b43/rfkill.c
+++ b/drivers/net/wireless/b43/rfkill.c
@@ -28,7 +28,7 @@
 /* Returns TRUE, if the radio is enabled in hardware. */
 bool b43_is_hw_radio_enabled(struct b43_wldev *dev)
 {
-	if (dev->phy.rev >= 3) {
+	if (dev->phy.rev >= 3 || dev->phy.type == B43_PHYTYPE_LP) {
 		if (!(b43_read32(dev, B43_MMIO_RADIO_HWENABLED_HI)
 		      & B43_MMIO_RADIO_HWENABLED_HI_MASK))
 			return 1;
diff --git a/drivers/net/wireless/b43/sdio.c b/drivers/net/wireless/b43/sdio.c
new file mode 100644
index 00000000000..0d3ac64147a
--- /dev/null
+++ b/drivers/net/wireless/b43/sdio.c
@@ -0,0 +1,202 @@
+/*
+ * Broadcom B43 wireless driver
+ *
+ * SDIO over Sonics Silicon Backplane bus glue for b43.
+ *
+ * Copyright (C) 2009 Albert Herranz
+ * Copyright (C) 2009 Michael Buesch <mb@bu3sch.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mmc/card.h>
+#include <linux/mmc/sdio_func.h>
+#include <linux/mmc/sdio_ids.h>
+#include <linux/ssb/ssb.h>
+
+#include "sdio.h"
+#include "b43.h"
+
+
+#define HNBU_CHIPID		0x01	/* vendor & device id */
+
+#define B43_SDIO_BLOCK_SIZE	64	/* rx fifo max size in bytes */
+
+
+static const struct b43_sdio_quirk {
+	u16 vendor;
+	u16 device;
+	unsigned int quirks;
+} b43_sdio_quirks[] = {
+	{ 0x14E4, 0x4318, SSB_QUIRK_SDIO_READ_AFTER_WRITE32, },
+	{ },
+};
+
+
+static unsigned int b43_sdio_get_quirks(u16 vendor, u16 device)
+{
+	const struct b43_sdio_quirk *q;
+
+	for (q = b43_sdio_quirks; q->quirks; q++) {
+		if (vendor == q->vendor && device == q->device)
+			return q->quirks;
+	}
+
+	return 0;
+}
+
+static void b43_sdio_interrupt_dispatcher(struct sdio_func *func)
+{
+	struct b43_sdio *sdio = sdio_get_drvdata(func);
+	struct b43_wldev *dev = sdio->irq_handler_opaque;
+
+	if (unlikely(b43_status(dev) < B43_STAT_STARTED))
+		return;
+
+	sdio_release_host(func);
+	sdio->irq_handler(dev);
+	sdio_claim_host(func);
+}
+
+int b43_sdio_request_irq(struct b43_wldev *dev,
+			 void (*handler)(struct b43_wldev *dev))
+{
+	struct ssb_bus *bus = dev->dev->bus;
+	struct sdio_func *func = bus->host_sdio;
+	struct b43_sdio *sdio = sdio_get_drvdata(func);
+	int err;
+
+	sdio->irq_handler_opaque = dev;
+	sdio->irq_handler = handler;
+	sdio_claim_host(func);
+	err = sdio_claim_irq(func, b43_sdio_interrupt_dispatcher);
+	sdio_release_host(func);
+
+	return err;
+}
+
+void b43_sdio_free_irq(struct b43_wldev *dev)
+{
+	struct ssb_bus *bus = dev->dev->bus;
+	struct sdio_func *func = bus->host_sdio;
+	struct b43_sdio *sdio = sdio_get_drvdata(func);
+
+	sdio_claim_host(func);
+	sdio_release_irq(func);
+	sdio_release_host(func);
+	sdio->irq_handler_opaque = NULL;
+	sdio->irq_handler = NULL;
+}
+
+static int b43_sdio_probe(struct sdio_func *func,
+			  const struct sdio_device_id *id)
+{
+	struct b43_sdio *sdio;
+	struct sdio_func_tuple *tuple;
+	u16 vendor = 0, device = 0;
+	int error;
+
+	/* Look for the card chip identifier. */
+	tuple = func->tuples;
+	while (tuple) {
+		switch (tuple->code) {
+		case 0x80:
+			switch (tuple->data[0]) {
+			case HNBU_CHIPID:
+				if (tuple->size != 5)
+					break;
+				vendor = tuple->data[1] | (tuple->data[2]<<8);
+				device = tuple->data[3] | (tuple->data[4]<<8);
+				dev_info(&func->dev, "Chip ID %04x:%04x\n",
+					 vendor, device);
+				break;
+			default:
+				break;
+			}
+			break;
+		default:
+			break;
+		}
+		tuple = tuple->next;
+	}
+	if (!vendor || !device) {
+		error = -ENODEV;
+		goto out;
+	}
+
+	sdio_claim_host(func);
+	error = sdio_set_block_size(func, B43_SDIO_BLOCK_SIZE);
+	if (error) {
+		dev_err(&func->dev, "failed to set block size to %u bytes,"
+			" error %d\n", B43_SDIO_BLOCK_SIZE, error);
+		goto err_release_host;
+	}
+	error = sdio_enable_func(func);
+	if (error) {
+		dev_err(&func->dev, "failed to enable func, error %d\n", error);
+		goto err_release_host;
+	}
+	sdio_release_host(func);
+
+	sdio = kzalloc(sizeof(*sdio), GFP_KERNEL);
+	if (!sdio) {
+		error = -ENOMEM;
+		dev_err(&func->dev, "failed to allocate ssb bus\n");
+		goto err_disable_func;
+	}
+	error = ssb_bus_sdiobus_register(&sdio->ssb, func,
+					 b43_sdio_get_quirks(vendor, device));
+	if (error) {
+		dev_err(&func->dev, "failed to register ssb sdio bus,"
+			" error %d\n", error);
+		goto err_free_ssb;
+	}
+	sdio_set_drvdata(func, sdio);
+
+	return 0;
+
+err_free_ssb:
+	kfree(sdio);
+err_disable_func:
+	sdio_disable_func(func);
+err_release_host:
+	sdio_release_host(func);
+out:
+	return error;
+}
+
+static void b43_sdio_remove(struct sdio_func *func)
+{
+	struct b43_sdio *sdio = sdio_get_drvdata(func);
+
+	ssb_bus_unregister(&sdio->ssb);
+	sdio_disable_func(func);
+	kfree(sdio);
+	sdio_set_drvdata(func, NULL);
+}
+
+static const struct sdio_device_id b43_sdio_ids[] = {
+	{ SDIO_DEVICE(0x02d0, 0x044b) }, /* Nintendo Wii WLAN daughter card */
+	{ },
+};
+
+static struct sdio_driver b43_sdio_driver = {
+	.name		= "b43-sdio",
+	.id_table	= b43_sdio_ids,
+	.probe		= b43_sdio_probe,
+	.remove		= b43_sdio_remove,
+};
+
+int b43_sdio_init(void)
+{
+	return sdio_register_driver(&b43_sdio_driver);
+}
+
+void b43_sdio_exit(void)
+{
+	sdio_unregister_driver(&b43_sdio_driver);
+}
diff --git a/drivers/net/wireless/b43/sdio.h b/drivers/net/wireless/b43/sdio.h
new file mode 100644
index 00000000000..fb633094403
--- /dev/null
+++ b/drivers/net/wireless/b43/sdio.h
@@ -0,0 +1,45 @@
+#ifndef B43_SDIO_H_
+#define B43_SDIO_H_
+
+#include <linux/ssb/ssb.h>
+
+struct b43_wldev;
+
+
+#ifdef CONFIG_B43_SDIO
+
+struct b43_sdio {
+	struct ssb_bus ssb;
+	void *irq_handler_opaque;
+	void (*irq_handler)(struct b43_wldev *dev);
+};
+
+int b43_sdio_request_irq(struct b43_wldev *dev,
+			 void (*handler)(struct b43_wldev *dev));
+void b43_sdio_free_irq(struct b43_wldev *dev);
+
+int b43_sdio_init(void);
+void b43_sdio_exit(void);
+
+
+#else /* CONFIG_B43_SDIO */
+
+
+int b43_sdio_request_irq(struct b43_wldev *dev,
+			 void (*handler)(struct b43_wldev *dev))
+{
+	return -ENODEV;
+}
+void b43_sdio_free_irq(struct b43_wldev *dev)
+{
+}
+static inline int b43_sdio_init(void)
+{
+	return 0;
+}
+static inline void b43_sdio_exit(void)
+{
+}
+
+#endif /* CONFIG_B43_SDIO */
+#endif /* B43_SDIO_H_ */
diff --git a/drivers/net/wireless/b43/xmit.c b/drivers/net/wireless/b43/xmit.c
index 14f541248b5..ac9f600995e 100644
--- a/drivers/net/wireless/b43/xmit.c
+++ b/drivers/net/wireless/b43/xmit.c
@@ -690,8 +690,11 @@ void b43_rx(struct b43_wldev *dev, struct sk_buff *skb, const void *_rxhdr)
 	}
 
 	memcpy(IEEE80211_SKB_RXCB(skb), &status, sizeof(status));
-	ieee80211_rx_irqsafe(dev->wl->hw, skb);
+	ieee80211_rx(dev->wl->hw, skb);
 
+#if B43_DEBUG
+	dev->rx_count++;
+#endif
 	return;
 drop:
 	b43dbg(dev->wl, "RX: Packet dropped\n");
diff --git a/drivers/net/wireless/iwlwifi/iwl-4965.c b/drivers/net/wireless/iwlwifi/iwl-4965.c
index ca61d3796ce..3259b884154 100644
--- a/drivers/net/wireless/iwlwifi/iwl-4965.c
+++ b/drivers/net/wireless/iwlwifi/iwl-4965.c
@@ -2021,6 +2021,12 @@ static int iwl4965_tx_status_reply_tx(struct iwl_priv *priv,
 					   agg->frame_count, txq_id, idx);
 
 			hdr = iwl_tx_queue_get_hdr(priv, txq_id, idx);
+			if (!hdr) {
+				IWL_ERR(priv,
+					"BUG_ON idx doesn't point to valid skb"
+					" idx=%d, txq_id=%d\n", idx, txq_id);
+				return -1;
+			}
 
 			sc = le16_to_cpu(hdr->seq_ctrl);
 			if (idx != (SEQ_TO_SN(sc) & 0xff)) {
diff --git a/drivers/net/wireless/iwlwifi/iwl-5000.c b/drivers/net/wireless/iwlwifi/iwl-5000.c
index 1d539e3b8db..a6391c7fea5 100644
--- a/drivers/net/wireless/iwlwifi/iwl-5000.c
+++ b/drivers/net/wireless/iwlwifi/iwl-5000.c
@@ -1163,6 +1163,12 @@ static int iwl5000_tx_status_reply_tx(struct iwl_priv *priv,
 					   agg->frame_count, txq_id, idx);
 
 			hdr = iwl_tx_queue_get_hdr(priv, txq_id, idx);
+			if (!hdr) {
+				IWL_ERR(priv,
+					"BUG_ON idx doesn't point to valid skb"
+					" idx=%d, txq_id=%d\n", idx, txq_id);
+				return -1;
+			}
 
 			sc = le16_to_cpu(hdr->seq_ctrl);
 			if (idx != (SEQ_TO_SN(sc) & 0xff)) {
diff --git a/drivers/net/wireless/iwlwifi/iwl-rx.c b/drivers/net/wireless/iwlwifi/iwl-rx.c
index b90adcb73b0..8e1bb53c0aa 100644
--- a/drivers/net/wireless/iwlwifi/iwl-rx.c
+++ b/drivers/net/wireless/iwlwifi/iwl-rx.c
@@ -250,12 +250,20 @@ void iwl_rx_allocate(struct iwl_priv *priv, gfp_t priority)
 		}
 		spin_unlock_irqrestore(&rxq->lock, flags);
 
+		if (rxq->free_count > RX_LOW_WATERMARK)
+			priority |= __GFP_NOWARN;
 		/* Alloc a new receive buffer */
 		skb = alloc_skb(priv->hw_params.rx_buf_size + 256,
 						priority);
 
 		if (!skb) {
-			IWL_CRIT(priv, "Can not allocate SKB buffers\n");
+			if (net_ratelimit())
+				IWL_DEBUG_INFO(priv, "Failed to allocate SKB buffer.\n");
+			if ((rxq->free_count <= RX_LOW_WATERMARK) &&
+			    net_ratelimit())
+				IWL_CRIT(priv, "Failed to allocate SKB buffer with %s. Only %u free buffers remaining.\n",
+					 priority == GFP_ATOMIC ?  "GFP_ATOMIC" : "GFP_KERNEL",
+					 rxq->free_count);
 			/* We don't reschedule replenish work here -- we will
 			 * call the restock method and if it still needs
 			 * more buffers it will schedule replenish */
diff --git a/drivers/net/wireless/iwlwifi/iwl-sta.c b/drivers/net/wireless/iwlwifi/iwl-sta.c
index a2b9ec82b96..c6633fec821 100644
--- a/drivers/net/wireless/iwlwifi/iwl-sta.c
+++ b/drivers/net/wireless/iwlwifi/iwl-sta.c
@@ -520,7 +520,7 @@ int iwl_send_static_wepkey_cmd(struct iwl_priv *priv, u8 send_if_empty)
 	struct iwl_host_cmd cmd = {
 		.id = REPLY_WEPKEY,
 		.data = wep_cmd,
-		.flags = CMD_SYNC,
+		.flags = CMD_ASYNC,
 	};
 
 	memset(wep_cmd, 0, cmd_size +
diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c
index 090966837f3..4f2d4393728 100644
--- a/drivers/net/wireless/iwlwifi/iwl3945-base.c
+++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c
@@ -1146,11 +1146,18 @@ static void iwl3945_rx_allocate(struct iwl_priv *priv, gfp_t priority)
 		}
 		spin_unlock_irqrestore(&rxq->lock, flags);
 
+		if (rxq->free_count > RX_LOW_WATERMARK)
+			priority |= __GFP_NOWARN;
 		/* Alloc a new receive buffer */
 		skb = alloc_skb(priv->hw_params.rx_buf_size, priority);
 		if (!skb) {
 			if (net_ratelimit())
-				IWL_CRIT(priv, ": Can not allocate SKB buffers\n");
+				IWL_DEBUG_INFO(priv, "Failed to allocate SKB buffer.\n");
+			if ((rxq->free_count <= RX_LOW_WATERMARK) &&
+			    net_ratelimit())
+				IWL_CRIT(priv, "Failed to allocate SKB buffer with %s. Only %u free buffers remaining.\n",
+					 priority == GFP_ATOMIC ?  "GFP_ATOMIC" : "GFP_KERNEL",
+					 rxq->free_count);
 			/* We don't reschedule replenish work here -- we will
 			 * call the restock method and if it still needs
 			 * more buffers it will schedule replenish */
diff --git a/drivers/net/wireless/rt2x00/rt2x00lib.h b/drivers/net/wireless/rt2x00/rt2x00lib.h
index 5462cb5ad99..567f029a8cd 100644
--- a/drivers/net/wireless/rt2x00/rt2x00lib.h
+++ b/drivers/net/wireless/rt2x00/rt2x00lib.h
@@ -380,7 +380,7 @@ static inline void rt2x00crypto_tx_insert_iv(struct sk_buff *skb,
 {
 }
 
-static inline void rt2x00crypto_rx_insert_iv(struct sk_buff *skb, bool l2pad,
+static inline void rt2x00crypto_rx_insert_iv(struct sk_buff *skb,
 					     unsigned int header_length,
 					     struct rxdone_entry_desc *rxdesc)
 {
diff --git a/drivers/net/wireless/wl12xx/Kconfig b/drivers/net/wireless/wl12xx/Kconfig
index 7b14d5bc63d..88060e11754 100644
--- a/drivers/net/wireless/wl12xx/Kconfig
+++ b/drivers/net/wireless/wl12xx/Kconfig
@@ -1,5 +1,5 @@
 menuconfig WL12XX
-	boolean "TI wl12xx driver support"
+	tristate "TI wl12xx driver support"
 	depends on MAC80211 && WLAN_80211 && EXPERIMENTAL
 	---help---
 	  This will enable TI wl12xx driver support. The drivers make
diff --git a/drivers/net/wireless/zd1211rw/zd_usb.c b/drivers/net/wireless/zd1211rw/zd_usb.c
index 38688847d56..23a6a6d4863 100644
--- a/drivers/net/wireless/zd1211rw/zd_usb.c
+++ b/drivers/net/wireless/zd1211rw/zd_usb.c
@@ -1070,7 +1070,7 @@ static int eject_installer(struct usb_interface *intf)
 
 	/* Find bulk out endpoint */
 	endpoint = &iface_desc->endpoint[1].desc;
-	if ((endpoint->bEndpointAddress & USB_TYPE_MASK) == USB_DIR_OUT &&
+	if (usb_endpoint_dir_out(endpoint) &&
 	    usb_endpoint_xfer_bulk(endpoint)) {
 		bulk_out_ep = endpoint->bEndpointAddress;
 	} else {
diff --git a/drivers/net/xilinx_emaclite.c b/drivers/net/xilinx_emaclite.c
index dc22782633a..83a044dbd1d 100644
--- a/drivers/net/xilinx_emaclite.c
+++ b/drivers/net/xilinx_emaclite.c
@@ -134,18 +134,15 @@ static void xemaclite_enable_interrupts(struct net_local *drvdata)
 	}
 
 	/* Enable the Rx interrupts for the first buffer */
-	reg_data = in_be32(drvdata->base_addr + XEL_RSR_OFFSET);
 	out_be32(drvdata->base_addr + XEL_RSR_OFFSET,
-		 reg_data | XEL_RSR_RECV_IE_MASK);
+		 XEL_RSR_RECV_IE_MASK);
 
 	/* Enable the Rx interrupts for the second Buffer if
 	 * configured in HW */
 	if (drvdata->rx_ping_pong != 0) {
-		reg_data = in_be32(drvdata->base_addr + XEL_BUFFER_OFFSET +
-				   XEL_RSR_OFFSET);
 		out_be32(drvdata->base_addr + XEL_BUFFER_OFFSET +
 			 XEL_RSR_OFFSET,
-			 reg_data | XEL_RSR_RECV_IE_MASK);
+			 XEL_RSR_RECV_IE_MASK);
 	}
 
 	/* Enable the Global Interrupt Enable */
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index 8574622e36a..c9e2ae90f19 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -154,9 +154,8 @@ int sync_start(void)
 {
 	int err;
 
-	if (!alloc_cpumask_var(&marked_cpus, GFP_KERNEL))
+	if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL))
 		return -ENOMEM;
-	cpumask_clear(marked_cpus);
 
 	start_cpu_work();
 
diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c
index 554e11f9e1c..8eefe56f1cb 100644
--- a/drivers/parport/procfs.c
+++ b/drivers/parport/procfs.c
@@ -31,7 +31,7 @@
 #define PARPORT_MIN_SPINTIME_VALUE 1
 #define PARPORT_MAX_SPINTIME_VALUE 1000
 
-static int do_active_device(ctl_table *table, int write, struct file *filp,
+static int do_active_device(ctl_table *table, int write,
 		      void __user *result, size_t *lenp, loff_t *ppos)
 {
 	struct parport *port = (struct parport *)table->extra1;
@@ -68,7 +68,7 @@ static int do_active_device(ctl_table *table, int write, struct file *filp,
 }
 
 #ifdef CONFIG_PARPORT_1284
-static int do_autoprobe(ctl_table *table, int write, struct file *filp,
+static int do_autoprobe(ctl_table *table, int write,
 			void __user *result, size_t *lenp, loff_t *ppos)
 {
 	struct parport_device_info *info = table->extra2;
@@ -111,7 +111,7 @@ static int do_autoprobe(ctl_table *table, int write, struct file *filp,
 #endif /* IEEE1284.3 support. */
 
 static int do_hardware_base_addr (ctl_table *table, int write,
-				  struct file *filp, void __user *result,
+				  void __user *result,
 				  size_t *lenp, loff_t *ppos)
 {
 	struct parport *port = (struct parport *)table->extra1;
@@ -139,7 +139,7 @@ static int do_hardware_base_addr (ctl_table *table, int write,
 }
 
 static int do_hardware_irq (ctl_table *table, int write,
-			    struct file *filp, void __user *result,
+			    void __user *result,
 			    size_t *lenp, loff_t *ppos)
 {
 	struct parport *port = (struct parport *)table->extra1;
@@ -167,7 +167,7 @@ static int do_hardware_irq (ctl_table *table, int write,
 }
 
 static int do_hardware_dma (ctl_table *table, int write,
-			    struct file *filp, void __user *result,
+			    void __user *result,
 			    size_t *lenp, loff_t *ppos)
 {
 	struct parport *port = (struct parport *)table->extra1;
@@ -195,7 +195,7 @@ static int do_hardware_dma (ctl_table *table, int write,
 }
 
 static int do_hardware_modes (ctl_table *table, int write,
-			      struct file *filp, void __user *result,
+			      void __user *result,
 			      size_t *lenp, loff_t *ppos)
 {
 	struct parport *port = (struct parport *)table->extra1;
diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index 36faa9a8e18..3070f77eb56 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -72,15 +72,9 @@ do {									\
 
 #define SLOT_NAME_SIZE 10
 struct slot {
-	u8 bus;
-	u8 device;
 	u8 state;
-	u8 hp_slot;
-	u32 number;
 	struct controller *ctrl;
-	struct hpc_ops *hpc_ops;
 	struct hotplug_slot *hotplug_slot;
-	struct list_head	slot_list;
 	struct delayed_work work;	/* work for button event */
 	struct mutex lock;
 };
@@ -92,18 +86,10 @@ struct event_info {
 };
 
 struct controller {
-	struct mutex crit_sect;		/* critical section mutex */
 	struct mutex ctrl_lock;		/* controller lock */
-	int num_slots;			/* Number of slots on ctlr */
-	int slot_num_inc;		/* 1 or -1 */
-	struct pci_dev *pci_dev;
 	struct pcie_device *pcie;	/* PCI Express port service */
-	struct list_head slot_list;
-	struct hpc_ops *hpc_ops;
+	struct slot *slot;
 	wait_queue_head_t queue;	/* sleep & wake process */
-	u8 slot_device_offset;
-	u32 first_slot;		/* First physical slot number */  /* PCIE only has 1 slot */
-	u8 slot_bus;		/* Bus where the slots handled by this controller sit */
 	u32 slot_cap;
 	u8 cap_base;
 	struct timer_list poll_timer;
@@ -131,40 +117,20 @@ struct controller {
 #define POWERON_STATE			3
 #define POWEROFF_STATE			4
 
-/* Error messages */
-#define INTERLOCK_OPEN			0x00000002
-#define ADD_NOT_SUPPORTED		0x00000003
-#define CARD_FUNCTIONING		0x00000005
-#define ADAPTER_NOT_SAME		0x00000006
-#define NO_ADAPTER_PRESENT		0x00000009
-#define NOT_ENOUGH_RESOURCES		0x0000000B
-#define DEVICE_TYPE_NOT_SUPPORTED	0x0000000C
-#define WRONG_BUS_FREQUENCY		0x0000000D
-#define POWER_FAILURE			0x0000000E
-
-/* Field definitions in Slot Capabilities Register */
-#define ATTN_BUTTN_PRSN	0x00000001
-#define	PWR_CTRL_PRSN	0x00000002
-#define MRL_SENS_PRSN	0x00000004
-#define ATTN_LED_PRSN	0x00000008
-#define PWR_LED_PRSN	0x00000010
-#define HP_SUPR_RM_SUP	0x00000020
-#define EMI_PRSN	0x00020000
-#define NO_CMD_CMPL_SUP	0x00040000
-
-#define ATTN_BUTTN(ctrl)	((ctrl)->slot_cap & ATTN_BUTTN_PRSN)
-#define POWER_CTRL(ctrl)	((ctrl)->slot_cap & PWR_CTRL_PRSN)
-#define MRL_SENS(ctrl)		((ctrl)->slot_cap & MRL_SENS_PRSN)
-#define ATTN_LED(ctrl)		((ctrl)->slot_cap & ATTN_LED_PRSN)
-#define PWR_LED(ctrl)		((ctrl)->slot_cap & PWR_LED_PRSN)
-#define HP_SUPR_RM(ctrl)	((ctrl)->slot_cap & HP_SUPR_RM_SUP)
-#define EMI(ctrl)		((ctrl)->slot_cap & EMI_PRSN)
-#define NO_CMD_CMPL(ctrl)	((ctrl)->slot_cap & NO_CMD_CMPL_SUP)
+#define ATTN_BUTTN(ctrl)	((ctrl)->slot_cap & PCI_EXP_SLTCAP_ABP)
+#define POWER_CTRL(ctrl)	((ctrl)->slot_cap & PCI_EXP_SLTCAP_PCP)
+#define MRL_SENS(ctrl)		((ctrl)->slot_cap & PCI_EXP_SLTCAP_MRLSP)
+#define ATTN_LED(ctrl)		((ctrl)->slot_cap & PCI_EXP_SLTCAP_AIP)
+#define PWR_LED(ctrl)		((ctrl)->slot_cap & PCI_EXP_SLTCAP_PIP)
+#define HP_SUPR_RM(ctrl)	((ctrl)->slot_cap & PCI_EXP_SLTCAP_HPS)
+#define EMI(ctrl)		((ctrl)->slot_cap & PCI_EXP_SLTCAP_EIP)
+#define NO_CMD_CMPL(ctrl)	((ctrl)->slot_cap & PCI_EXP_SLTCAP_NCCS)
+#define PSN(ctrl)		((ctrl)->slot_cap >> 19)
 
 extern int pciehp_sysfs_enable_slot(struct slot *slot);
 extern int pciehp_sysfs_disable_slot(struct slot *slot);
 extern u8 pciehp_handle_attention_button(struct slot *p_slot);
-  extern u8 pciehp_handle_switch_change(struct slot *p_slot);
+extern u8 pciehp_handle_switch_change(struct slot *p_slot);
 extern u8 pciehp_handle_presence_change(struct slot *p_slot);
 extern u8 pciehp_handle_power_fault(struct slot *p_slot);
 extern int pciehp_configure_device(struct slot *p_slot);
@@ -175,45 +141,30 @@ int pcie_init_notification(struct controller *ctrl);
 int pciehp_enable_slot(struct slot *p_slot);
 int pciehp_disable_slot(struct slot *p_slot);
 int pcie_enable_notification(struct controller *ctrl);
+int pciehp_power_on_slot(struct slot *slot);
+int pciehp_power_off_slot(struct slot *slot);
+int pciehp_get_power_status(struct slot *slot, u8 *status);
+int pciehp_get_attention_status(struct slot *slot, u8 *status);
+
+int pciehp_set_attention_status(struct slot *slot, u8 status);
+int pciehp_get_latch_status(struct slot *slot, u8 *status);
+int pciehp_get_adapter_status(struct slot *slot, u8 *status);
+int pciehp_get_max_link_speed(struct slot *slot, enum pci_bus_speed *speed);
+int pciehp_get_max_link_width(struct slot *slot, enum pcie_link_width *val);
+int pciehp_get_cur_link_speed(struct slot *slot, enum pci_bus_speed *speed);
+int pciehp_get_cur_link_width(struct slot *slot, enum pcie_link_width *val);
+int pciehp_query_power_fault(struct slot *slot);
+void pciehp_green_led_on(struct slot *slot);
+void pciehp_green_led_off(struct slot *slot);
+void pciehp_green_led_blink(struct slot *slot);
+int pciehp_check_link_status(struct controller *ctrl);
+void pciehp_release_ctrl(struct controller *ctrl);
 
 static inline const char *slot_name(struct slot *slot)
 {
 	return hotplug_slot_name(slot->hotplug_slot);
 }
 
-static inline struct slot *pciehp_find_slot(struct controller *ctrl, u8 device)
-{
-	struct slot *slot;
-
-	list_for_each_entry(slot, &ctrl->slot_list, slot_list) {
-		if (slot->device == device)
-			return slot;
-	}
-
-	ctrl_err(ctrl, "Slot (device=0x%02x) not found\n", device);
-	return NULL;
-}
-
-struct hpc_ops {
-	int (*power_on_slot)(struct slot *slot);
-	int (*power_off_slot)(struct slot *slot);
-	int (*get_power_status)(struct slot *slot, u8 *status);
-	int (*get_attention_status)(struct slot *slot, u8 *status);
-	int (*set_attention_status)(struct slot *slot, u8 status);
-	int (*get_latch_status)(struct slot *slot, u8 *status);
-	int (*get_adapter_status)(struct slot *slot, u8 *status);
-	int (*get_max_bus_speed)(struct slot *slot, enum pci_bus_speed *speed);
-	int (*get_cur_bus_speed)(struct slot *slot, enum pci_bus_speed *speed);
-	int (*get_max_lnk_width)(struct slot *slot, enum pcie_link_width *val);
-	int (*get_cur_lnk_width)(struct slot *slot, enum pcie_link_width *val);
-	int (*query_power_fault)(struct slot *slot);
-	void (*green_led_on)(struct slot *slot);
-	void (*green_led_off)(struct slot *slot);
-	void (*green_led_blink)(struct slot *slot);
-	void (*release_ctlr)(struct controller *ctrl);
-	int (*check_lnk_status)(struct controller *ctrl);
-};
-
 #ifdef CONFIG_ACPI
 #include <acpi/acpi.h>
 #include <acpi/acpi_bus.h>
diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c
index 7163e6a6cfa..37c8d3d0323 100644
--- a/drivers/pci/hotplug/pciehp_acpi.c
+++ b/drivers/pci/hotplug/pciehp_acpi.c
@@ -33,6 +33,11 @@
 #define PCIEHP_DETECT_AUTO	(2)
 #define PCIEHP_DETECT_DEFAULT	PCIEHP_DETECT_AUTO
 
+struct dummy_slot {
+	u32 number;
+	struct list_head list;
+};
+
 static int slot_detection_mode;
 static char *pciehp_detect_mode;
 module_param(pciehp_detect_mode, charp, 0444);
@@ -77,7 +82,7 @@ static int __init dummy_probe(struct pcie_device *dev)
 	int pos;
 	u32 slot_cap;
 	acpi_handle handle;
-	struct slot *slot, *tmp;
+	struct dummy_slot *slot, *tmp;
 	struct pci_dev *pdev = dev->port;
 	/* Note: pciehp_detect_mode != PCIEHP_DETECT_ACPI here */
 	if (pciehp_get_hp_hw_control_from_firmware(pdev))
@@ -89,11 +94,11 @@ static int __init dummy_probe(struct pcie_device *dev)
 	if (!slot)
 		return -ENOMEM;
 	slot->number = slot_cap >> 19;
-	list_for_each_entry(tmp, &dummy_slots, slot_list) {
+	list_for_each_entry(tmp, &dummy_slots, list) {
 		if (tmp->number == slot->number)
 			dup_slot_id++;
 	}
-	list_add_tail(&slot->slot_list, &dummy_slots);
+	list_add_tail(&slot->list, &dummy_slots);
 	handle = DEVICE_ACPI_HANDLE(&pdev->dev);
 	if (!acpi_slot_detected && acpi_pci_detect_ejectable(handle))
 		acpi_slot_detected = 1;
@@ -109,11 +114,11 @@ static struct pcie_port_service_driver __initdata dummy_driver = {
 
 static int __init select_detection_mode(void)
 {
-	struct slot *slot, *tmp;
+	struct dummy_slot *slot, *tmp;
 	pcie_port_service_register(&dummy_driver);
 	pcie_port_service_unregister(&dummy_driver);
-	list_for_each_entry_safe(slot, tmp, &dummy_slots, slot_list) {
-		list_del(&slot->slot_list);
+	list_for_each_entry_safe(slot, tmp, &dummy_slots, list) {
+		list_del(&slot->list);
 		kfree(slot);
 	}
 	if (acpi_slot_detected && dup_slot_id)
diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
index 2317557fdee..bc234719b1d 100644
--- a/drivers/pci/hotplug/pciehp_core.c
+++ b/drivers/pci/hotplug/pciehp_core.c
@@ -99,65 +99,55 @@ static void release_slot(struct hotplug_slot *hotplug_slot)
 	kfree(hotplug_slot);
 }
 
-static int init_slots(struct controller *ctrl)
+static int init_slot(struct controller *ctrl)
 {
-	struct slot *slot;
-	struct hotplug_slot *hotplug_slot;
-	struct hotplug_slot_info *info;
+	struct slot *slot = ctrl->slot;
+	struct hotplug_slot *hotplug = NULL;
+	struct hotplug_slot_info *info = NULL;
 	char name[SLOT_NAME_SIZE];
 	int retval = -ENOMEM;
 
-	list_for_each_entry(slot, &ctrl->slot_list, slot_list) {
-		hotplug_slot = kzalloc(sizeof(*hotplug_slot), GFP_KERNEL);
-		if (!hotplug_slot)
-			goto error;
-
-		info = kzalloc(sizeof(*info), GFP_KERNEL);
-		if (!info)
-			goto error_hpslot;
-
-		/* register this slot with the hotplug pci core */
-		hotplug_slot->info = info;
-		hotplug_slot->private = slot;
-		hotplug_slot->release = &release_slot;
-		hotplug_slot->ops = &pciehp_hotplug_slot_ops;
-		slot->hotplug_slot = hotplug_slot;
-		snprintf(name, SLOT_NAME_SIZE, "%u", slot->number);
-
- 		ctrl_dbg(ctrl, "Registering domain:bus:dev=%04x:%02x:%02x "
- 			 "hp_slot=%x sun=%x slot_device_offset=%x\n",
- 			 pci_domain_nr(ctrl->pci_dev->subordinate),
- 			 slot->bus, slot->device, slot->hp_slot, slot->number,
- 			 ctrl->slot_device_offset);
-		retval = pci_hp_register(hotplug_slot,
-					 ctrl->pci_dev->subordinate,
-					 slot->device,
-					 name);
-		if (retval) {
-			ctrl_err(ctrl, "pci_hp_register failed with error %d\n",
-				 retval);
-			goto error_info;
-		}
-		get_power_status(hotplug_slot, &info->power_status);
-		get_attention_status(hotplug_slot, &info->attention_status);
-		get_latch_status(hotplug_slot, &info->latch_status);
-		get_adapter_status(hotplug_slot, &info->adapter_status);
+	hotplug = kzalloc(sizeof(*hotplug), GFP_KERNEL);
+	if (!hotplug)
+		goto out;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		goto out;
+
+	/* register this slot with the hotplug pci core */
+	hotplug->info = info;
+	hotplug->private = slot;
+	hotplug->release = &release_slot;
+	hotplug->ops = &pciehp_hotplug_slot_ops;
+	slot->hotplug_slot = hotplug;
+	snprintf(name, SLOT_NAME_SIZE, "%u", PSN(ctrl));
+
+	ctrl_dbg(ctrl, "Registering domain:bus:dev=%04x:%02x:00 sun=%x\n",
+		 pci_domain_nr(ctrl->pcie->port->subordinate),
+		 ctrl->pcie->port->subordinate->number, PSN(ctrl));
+	retval = pci_hp_register(hotplug,
+				 ctrl->pcie->port->subordinate, 0, name);
+	if (retval) {
+		ctrl_err(ctrl,
+			 "pci_hp_register failed with error %d\n", retval);
+		goto out;
+	}
+	get_power_status(hotplug, &info->power_status);
+	get_attention_status(hotplug, &info->attention_status);
+	get_latch_status(hotplug, &info->latch_status);
+	get_adapter_status(hotplug, &info->adapter_status);
+out:
+	if (retval) {
+		kfree(info);
+		kfree(hotplug);
 	}
-
-	return 0;
-error_info:
-	kfree(info);
-error_hpslot:
-	kfree(hotplug_slot);
-error:
 	return retval;
 }
 
-static void cleanup_slots(struct controller *ctrl)
+static void cleanup_slot(struct controller *ctrl)
 {
-	struct slot *slot;
-	list_for_each_entry(slot, &ctrl->slot_list, slot_list)
-		pci_hp_deregister(slot->hotplug_slot);
+	pci_hp_deregister(ctrl->slot->hotplug_slot);
 }
 
 /*
@@ -173,7 +163,7 @@ static int set_attention_status(struct hotplug_slot *hotplug_slot, u8 status)
 	hotplug_slot->info->attention_status = status;
 
 	if (ATTN_LED(slot->ctrl))
-		slot->hpc_ops->set_attention_status(slot, status);
+		pciehp_set_attention_status(slot, status);
 
 	return 0;
 }
@@ -208,7 +198,7 @@ static int get_power_status(struct hotplug_slot *hotplug_slot, u8 *value)
 	ctrl_dbg(slot->ctrl, "%s: physical_slot = %s\n",
 		  __func__, slot_name(slot));
 
-	retval = slot->hpc_ops->get_power_status(slot, value);
+	retval = pciehp_get_power_status(slot, value);
 	if (retval < 0)
 		*value = hotplug_slot->info->power_status;
 
@@ -223,7 +213,7 @@ static int get_attention_status(struct hotplug_slot *hotplug_slot, u8 *value)
 	ctrl_dbg(slot->ctrl, "%s: physical_slot = %s\n",
 		  __func__, slot_name(slot));
 
-	retval = slot->hpc_ops->get_attention_status(slot, value);
+	retval = pciehp_get_attention_status(slot, value);
 	if (retval < 0)
 		*value = hotplug_slot->info->attention_status;
 
@@ -238,7 +228,7 @@ static int get_latch_status(struct hotplug_slot *hotplug_slot, u8 *value)
 	ctrl_dbg(slot->ctrl, "%s: physical_slot = %s\n",
 		 __func__, slot_name(slot));
 
-	retval = slot->hpc_ops->get_latch_status(slot, value);
+	retval = pciehp_get_latch_status(slot, value);
 	if (retval < 0)
 		*value = hotplug_slot->info->latch_status;
 
@@ -253,7 +243,7 @@ static int get_adapter_status(struct hotplug_slot *hotplug_slot, u8 *value)
 	ctrl_dbg(slot->ctrl, "%s: physical_slot = %s\n",
 		 __func__, slot_name(slot));
 
-	retval = slot->hpc_ops->get_adapter_status(slot, value);
+	retval = pciehp_get_adapter_status(slot, value);
 	if (retval < 0)
 		*value = hotplug_slot->info->adapter_status;
 
@@ -269,7 +259,7 @@ static int get_max_bus_speed(struct hotplug_slot *hotplug_slot,
 	ctrl_dbg(slot->ctrl, "%s: physical_slot = %s\n",
 		 __func__, slot_name(slot));
 
-	retval = slot->hpc_ops->get_max_bus_speed(slot, value);
+	retval = pciehp_get_max_link_speed(slot, value);
 	if (retval < 0)
 		*value = PCI_SPEED_UNKNOWN;
 
@@ -284,7 +274,7 @@ static int get_cur_bus_speed(struct hotplug_slot *hotplug_slot, enum pci_bus_spe
 	ctrl_dbg(slot->ctrl, "%s: physical_slot = %s\n",
 		 __func__, slot_name(slot));
 
-	retval = slot->hpc_ops->get_cur_bus_speed(slot, value);
+	retval = pciehp_get_cur_link_speed(slot, value);
 	if (retval < 0)
 		*value = PCI_SPEED_UNKNOWN;
 
@@ -295,7 +285,7 @@ static int pciehp_probe(struct pcie_device *dev)
 {
 	int rc;
 	struct controller *ctrl;
-	struct slot *t_slot;
+	struct slot *slot;
 	u8 value;
 	struct pci_dev *pdev = dev->port;
 
@@ -314,7 +304,7 @@ static int pciehp_probe(struct pcie_device *dev)
 	set_service_data(dev, ctrl);
 
 	/* Setup the slot information structures */
-	rc = init_slots(ctrl);
+	rc = init_slot(ctrl);
 	if (rc) {
 		if (rc == -EBUSY)
 			ctrl_warn(ctrl, "Slot already registered by another "
@@ -332,15 +322,15 @@ static int pciehp_probe(struct pcie_device *dev)
 	}
 
 	/* Check if slot is occupied */
-	t_slot = pciehp_find_slot(ctrl, ctrl->slot_device_offset);
-	t_slot->hpc_ops->get_adapter_status(t_slot, &value);
+	slot = ctrl->slot;
+	pciehp_get_adapter_status(slot, &value);
 	if (value) {
 		if (pciehp_force)
-			pciehp_enable_slot(t_slot);
+			pciehp_enable_slot(slot);
 	} else {
 		/* Power off slot if not occupied */
 		if (POWER_CTRL(ctrl)) {
-			rc = t_slot->hpc_ops->power_off_slot(t_slot);
+			rc = pciehp_power_off_slot(slot);
 			if (rc)
 				goto err_out_free_ctrl_slot;
 		}
@@ -349,19 +339,19 @@ static int pciehp_probe(struct pcie_device *dev)
 	return 0;
 
 err_out_free_ctrl_slot:
-	cleanup_slots(ctrl);
+	cleanup_slot(ctrl);
 err_out_release_ctlr:
-	ctrl->hpc_ops->release_ctlr(ctrl);
+	pciehp_release_ctrl(ctrl);
 err_out_none:
 	return -ENODEV;
 }
 
-static void pciehp_remove (struct pcie_device *dev)
+static void pciehp_remove(struct pcie_device *dev)
 {
 	struct controller *ctrl = get_service_data(dev);
 
-	cleanup_slots(ctrl);
-	ctrl->hpc_ops->release_ctlr(ctrl);
+	cleanup_slot(ctrl);
+	pciehp_release_ctrl(ctrl);
 }
 
 #ifdef CONFIG_PM
@@ -376,20 +366,20 @@ static int pciehp_resume (struct pcie_device *dev)
 	dev_info(&dev->device, "%s ENTRY\n", __func__);
 	if (pciehp_force) {
 		struct controller *ctrl = get_service_data(dev);
-		struct slot *t_slot;
+		struct slot *slot;
 		u8 status;
 
 		/* reinitialize the chipset's event detection logic */
 		pcie_enable_notification(ctrl);
 
-		t_slot = pciehp_find_slot(ctrl, ctrl->slot_device_offset);
+		slot = ctrl->slot;
 
 		/* Check if slot is occupied */
-		t_slot->hpc_ops->get_adapter_status(t_slot, &status);
+		pciehp_get_adapter_status(slot, &status);
 		if (status)
-			pciehp_enable_slot(t_slot);
+			pciehp_enable_slot(slot);
 		else
-			pciehp_disable_slot(t_slot);
+			pciehp_disable_slot(slot);
 	}
 	return 0;
 }
diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c
index b97cb4c3e0f..84487d126e4 100644
--- a/drivers/pci/hotplug/pciehp_ctrl.c
+++ b/drivers/pci/hotplug/pciehp_ctrl.c
@@ -82,7 +82,7 @@ u8 pciehp_handle_switch_change(struct slot *p_slot)
 	/* Switch Change */
 	ctrl_dbg(ctrl, "Switch interrupt received\n");
 
-	p_slot->hpc_ops->get_latch_status(p_slot, &getstatus);
+	pciehp_get_latch_status(p_slot, &getstatus);
 	if (getstatus) {
 		/*
 		 * Switch opened
@@ -114,7 +114,7 @@ u8 pciehp_handle_presence_change(struct slot *p_slot)
 	/* Switch is open, assume a presence change
 	 * Save the presence state
 	 */
-	p_slot->hpc_ops->get_adapter_status(p_slot, &presence_save);
+	pciehp_get_adapter_status(p_slot, &presence_save);
 	if (presence_save) {
 		/*
 		 * Card Present
@@ -143,7 +143,7 @@ u8 pciehp_handle_power_fault(struct slot *p_slot)
 	/* power fault */
 	ctrl_dbg(ctrl, "Power fault interrupt received\n");
 
-	if ( !(p_slot->hpc_ops->query_power_fault(p_slot))) {
+	if (!pciehp_query_power_fault(p_slot)) {
 		/*
 		 * power fault Cleared
 		 */
@@ -172,7 +172,7 @@ static void set_slot_off(struct controller *ctrl, struct slot * pslot)
 {
 	/* turn off slot, turn on Amber LED, turn off Green LED if supported*/
 	if (POWER_CTRL(ctrl)) {
-		if (pslot->hpc_ops->power_off_slot(pslot)) {
+		if (pciehp_power_off_slot(pslot)) {
 			ctrl_err(ctrl,
 				 "Issue of Slot Power Off command failed\n");
 			return;
@@ -186,10 +186,10 @@ static void set_slot_off(struct controller *ctrl, struct slot * pslot)
 	}
 
 	if (PWR_LED(ctrl))
-		pslot->hpc_ops->green_led_off(pslot);
+		pciehp_green_led_off(pslot);
 
 	if (ATTN_LED(ctrl)) {
-		if (pslot->hpc_ops->set_attention_status(pslot, 1)) {
+		if (pciehp_set_attention_status(pslot, 1)) {
 			ctrl_err(ctrl,
 				 "Issue of Set Attention Led command failed\n");
 			return;
@@ -208,24 +208,20 @@ static int board_added(struct slot *p_slot)
 {
 	int retval = 0;
 	struct controller *ctrl = p_slot->ctrl;
-	struct pci_bus *parent = ctrl->pci_dev->subordinate;
-
-	ctrl_dbg(ctrl, "%s: slot device, slot offset, hp slot = %d, %d, %d\n",
-		 __func__, p_slot->device, ctrl->slot_device_offset,
-		 p_slot->hp_slot);
+	struct pci_bus *parent = ctrl->pcie->port->subordinate;
 
 	if (POWER_CTRL(ctrl)) {
 		/* Power on slot */
-		retval = p_slot->hpc_ops->power_on_slot(p_slot);
+		retval = pciehp_power_on_slot(p_slot);
 		if (retval)
 			return retval;
 	}
 
 	if (PWR_LED(ctrl))
-		p_slot->hpc_ops->green_led_blink(p_slot);
+		pciehp_green_led_blink(p_slot);
 
 	/* Check link training status */
-	retval = p_slot->hpc_ops->check_lnk_status(ctrl);
+	retval = pciehp_check_link_status(ctrl);
 	if (retval) {
 		ctrl_err(ctrl, "Failed to check link status\n");
 		set_slot_off(ctrl, p_slot);
@@ -233,21 +229,21 @@ static int board_added(struct slot *p_slot)
 	}
 
 	/* Check for a power fault */
-	if (p_slot->hpc_ops->query_power_fault(p_slot)) {
+	if (pciehp_query_power_fault(p_slot)) {
 		ctrl_dbg(ctrl, "Power fault detected\n");
-		retval = POWER_FAILURE;
+		retval = -EIO;
 		goto err_exit;
 	}
 
 	retval = pciehp_configure_device(p_slot);
 	if (retval) {
-		ctrl_err(ctrl, "Cannot add device at %04x:%02x:%02x\n",
-			 pci_domain_nr(parent), p_slot->bus, p_slot->device);
+		ctrl_err(ctrl, "Cannot add device at %04x:%02x:00\n",
+			 pci_domain_nr(parent), parent->number);
 		goto err_exit;
 	}
 
 	if (PWR_LED(ctrl))
-  		p_slot->hpc_ops->green_led_on(p_slot);
+		pciehp_green_led_on(p_slot);
 
 	return 0;
 
@@ -269,11 +265,9 @@ static int remove_board(struct slot *p_slot)
 	if (retval)
 		return retval;
 
-	ctrl_dbg(ctrl, "%s: hp_slot = %d\n", __func__, p_slot->hp_slot);
-
 	if (POWER_CTRL(ctrl)) {
 		/* power off slot */
-		retval = p_slot->hpc_ops->power_off_slot(p_slot);
+		retval = pciehp_power_off_slot(p_slot);
 		if (retval) {
 			ctrl_err(ctrl,
 				 "Issue of Slot Disable command failed\n");
@@ -287,9 +281,9 @@ static int remove_board(struct slot *p_slot)
 		msleep(1000);
 	}
 
+	/* turn off Green LED */
 	if (PWR_LED(ctrl))
-		/* turn off Green LED */
-		p_slot->hpc_ops->green_led_off(p_slot);
+		pciehp_green_led_off(p_slot);
 
 	return 0;
 }
@@ -317,18 +311,17 @@ static void pciehp_power_thread(struct work_struct *work)
 	case POWEROFF_STATE:
 		mutex_unlock(&p_slot->lock);
 		ctrl_dbg(p_slot->ctrl,
-			 "Disabling domain:bus:device=%04x:%02x:%02x\n",
-			 pci_domain_nr(p_slot->ctrl->pci_dev->subordinate),
-			 p_slot->bus, p_slot->device);
+			 "Disabling domain:bus:device=%04x:%02x:00\n",
+			 pci_domain_nr(p_slot->ctrl->pcie->port->subordinate),
+			 p_slot->ctrl->pcie->port->subordinate->number);
 		pciehp_disable_slot(p_slot);
 		mutex_lock(&p_slot->lock);
 		p_slot->state = STATIC_STATE;
 		break;
 	case POWERON_STATE:
 		mutex_unlock(&p_slot->lock);
-		if (pciehp_enable_slot(p_slot) &&
-		    PWR_LED(p_slot->ctrl))
-			p_slot->hpc_ops->green_led_off(p_slot);
+		if (pciehp_enable_slot(p_slot) && PWR_LED(p_slot->ctrl))
+			pciehp_green_led_off(p_slot);
 		mutex_lock(&p_slot->lock);
 		p_slot->state = STATIC_STATE;
 		break;
@@ -379,10 +372,10 @@ static int update_slot_info(struct slot *slot)
 	if (!info)
 		return -ENOMEM;
 
-	slot->hpc_ops->get_power_status(slot, &(info->power_status));
-	slot->hpc_ops->get_attention_status(slot, &(info->attention_status));
-	slot->hpc_ops->get_latch_status(slot, &(info->latch_status));
-	slot->hpc_ops->get_adapter_status(slot, &(info->adapter_status));
+	pciehp_get_power_status(slot, &info->power_status);
+	pciehp_get_attention_status(slot, &info->attention_status);
+	pciehp_get_latch_status(slot, &info->latch_status);
+	pciehp_get_adapter_status(slot, &info->adapter_status);
 
 	result = pci_hp_change_slot_info(slot->hotplug_slot, info);
 	kfree (info);
@@ -399,7 +392,7 @@ static void handle_button_press_event(struct slot *p_slot)
 
 	switch (p_slot->state) {
 	case STATIC_STATE:
-		p_slot->hpc_ops->get_power_status(p_slot, &getstatus);
+		pciehp_get_power_status(p_slot, &getstatus);
 		if (getstatus) {
 			p_slot->state = BLINKINGOFF_STATE;
 			ctrl_info(ctrl,
@@ -413,9 +406,9 @@ static void handle_button_press_event(struct slot *p_slot)
 		}
 		/* blink green LED and turn off amber */
 		if (PWR_LED(ctrl))
-			p_slot->hpc_ops->green_led_blink(p_slot);
+			pciehp_green_led_blink(p_slot);
 		if (ATTN_LED(ctrl))
-			p_slot->hpc_ops->set_attention_status(p_slot, 0);
+			pciehp_set_attention_status(p_slot, 0);
 
 		schedule_delayed_work(&p_slot->work, 5*HZ);
 		break;
@@ -430,13 +423,13 @@ static void handle_button_press_event(struct slot *p_slot)
 		cancel_delayed_work(&p_slot->work);
 		if (p_slot->state == BLINKINGOFF_STATE) {
 			if (PWR_LED(ctrl))
-				p_slot->hpc_ops->green_led_on(p_slot);
+				pciehp_green_led_on(p_slot);
 		} else {
 			if (PWR_LED(ctrl))
-				p_slot->hpc_ops->green_led_off(p_slot);
+				pciehp_green_led_off(p_slot);
 		}
 		if (ATTN_LED(ctrl))
-			p_slot->hpc_ops->set_attention_status(p_slot, 0);
+			pciehp_set_attention_status(p_slot, 0);
 		ctrl_info(ctrl, "PCI slot #%s - action canceled "
 			  "due to button press\n", slot_name(p_slot));
 		p_slot->state = STATIC_STATE;
@@ -474,7 +467,7 @@ static void handle_surprise_event(struct slot *p_slot)
 	info->p_slot = p_slot;
 	INIT_WORK(&info->work, pciehp_power_thread);
 
-	p_slot->hpc_ops->get_adapter_status(p_slot, &getstatus);
+	pciehp_get_adapter_status(p_slot, &getstatus);
 	if (!getstatus)
 		p_slot->state = POWEROFF_STATE;
 	else
@@ -498,9 +491,9 @@ static void interrupt_event_handler(struct work_struct *work)
 		if (!POWER_CTRL(ctrl))
 			break;
 		if (ATTN_LED(ctrl))
-			p_slot->hpc_ops->set_attention_status(p_slot, 1);
+			pciehp_set_attention_status(p_slot, 1);
 		if (PWR_LED(ctrl))
-			p_slot->hpc_ops->green_led_off(p_slot);
+			pciehp_green_led_off(p_slot);
 		break;
 	case INT_PRESENCE_ON:
 	case INT_PRESENCE_OFF:
@@ -525,45 +518,38 @@ int pciehp_enable_slot(struct slot *p_slot)
 	int rc;
 	struct controller *ctrl = p_slot->ctrl;
 
-	/* Check to see if (latch closed, card present, power off) */
-	mutex_lock(&p_slot->ctrl->crit_sect);
-
-	rc = p_slot->hpc_ops->get_adapter_status(p_slot, &getstatus);
+	rc = pciehp_get_adapter_status(p_slot, &getstatus);
 	if (rc || !getstatus) {
 		ctrl_info(ctrl, "No adapter on slot(%s)\n", slot_name(p_slot));
-		mutex_unlock(&p_slot->ctrl->crit_sect);
 		return -ENODEV;
 	}
 	if (MRL_SENS(p_slot->ctrl)) {
-		rc = p_slot->hpc_ops->get_latch_status(p_slot, &getstatus);
+		rc = pciehp_get_latch_status(p_slot, &getstatus);
 		if (rc || getstatus) {
 			ctrl_info(ctrl, "Latch open on slot(%s)\n",
 				  slot_name(p_slot));
-			mutex_unlock(&p_slot->ctrl->crit_sect);
 			return -ENODEV;
 		}
 	}
 
 	if (POWER_CTRL(p_slot->ctrl)) {
-		rc = p_slot->hpc_ops->get_power_status(p_slot, &getstatus);
+		rc = pciehp_get_power_status(p_slot, &getstatus);
 		if (rc || getstatus) {
 			ctrl_info(ctrl, "Already enabled on slot(%s)\n",
 				  slot_name(p_slot));
-			mutex_unlock(&p_slot->ctrl->crit_sect);
 			return -EINVAL;
 		}
 	}
 
-	p_slot->hpc_ops->get_latch_status(p_slot, &getstatus);
+	pciehp_get_latch_status(p_slot, &getstatus);
 
 	rc = board_added(p_slot);
 	if (rc) {
-		p_slot->hpc_ops->get_latch_status(p_slot, &getstatus);
+		pciehp_get_latch_status(p_slot, &getstatus);
 	}
 
 	update_slot_info(p_slot);
 
-	mutex_unlock(&p_slot->ctrl->crit_sect);
 	return rc;
 }
 
@@ -577,35 +563,29 @@ int pciehp_disable_slot(struct slot *p_slot)
 	if (!p_slot->ctrl)
 		return 1;
 
-	/* Check to see if (latch closed, card present, power on) */
-	mutex_lock(&p_slot->ctrl->crit_sect);
-
 	if (!HP_SUPR_RM(p_slot->ctrl)) {
-		ret = p_slot->hpc_ops->get_adapter_status(p_slot, &getstatus);
+		ret = pciehp_get_adapter_status(p_slot, &getstatus);
 		if (ret || !getstatus) {
 			ctrl_info(ctrl, "No adapter on slot(%s)\n",
 				  slot_name(p_slot));
-			mutex_unlock(&p_slot->ctrl->crit_sect);
 			return -ENODEV;
 		}
 	}
 
 	if (MRL_SENS(p_slot->ctrl)) {
-		ret = p_slot->hpc_ops->get_latch_status(p_slot, &getstatus);
+		ret = pciehp_get_latch_status(p_slot, &getstatus);
 		if (ret || getstatus) {
 			ctrl_info(ctrl, "Latch open on slot(%s)\n",
 				  slot_name(p_slot));
-			mutex_unlock(&p_slot->ctrl->crit_sect);
 			return -ENODEV;
 		}
 	}
 
 	if (POWER_CTRL(p_slot->ctrl)) {
-		ret = p_slot->hpc_ops->get_power_status(p_slot, &getstatus);
+		ret = pciehp_get_power_status(p_slot, &getstatus);
 		if (ret || !getstatus) {
 			ctrl_info(ctrl, "Already disabled on slot(%s)\n",
 				  slot_name(p_slot));
-			mutex_unlock(&p_slot->ctrl->crit_sect);
 			return -EINVAL;
 		}
 	}
@@ -613,7 +593,6 @@ int pciehp_disable_slot(struct slot *p_slot)
 	ret = remove_board(p_slot);
 	update_slot_info(p_slot);
 
-	mutex_unlock(&p_slot->ctrl->crit_sect);
 	return ret;
 }
 
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 271f917b6f2..9ef4605c1ef 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -44,25 +44,25 @@ static atomic_t pciehp_num_controllers = ATOMIC_INIT(0);
 
 static inline int pciehp_readw(struct controller *ctrl, int reg, u16 *value)
 {
-	struct pci_dev *dev = ctrl->pci_dev;
+	struct pci_dev *dev = ctrl->pcie->port;
 	return pci_read_config_word(dev, ctrl->cap_base + reg, value);
 }
 
 static inline int pciehp_readl(struct controller *ctrl, int reg, u32 *value)
 {
-	struct pci_dev *dev = ctrl->pci_dev;
+	struct pci_dev *dev = ctrl->pcie->port;
 	return pci_read_config_dword(dev, ctrl->cap_base + reg, value);
 }
 
 static inline int pciehp_writew(struct controller *ctrl, int reg, u16 value)
 {
-	struct pci_dev *dev = ctrl->pci_dev;
+	struct pci_dev *dev = ctrl->pcie->port;
 	return pci_write_config_word(dev, ctrl->cap_base + reg, value);
 }
 
 static inline int pciehp_writel(struct controller *ctrl, int reg, u32 value)
 {
-	struct pci_dev *dev = ctrl->pci_dev;
+	struct pci_dev *dev = ctrl->pcie->port;
 	return pci_write_config_dword(dev, ctrl->cap_base + reg, value);
 }
 
@@ -266,7 +266,7 @@ static void pcie_wait_link_active(struct controller *ctrl)
 	ctrl_dbg(ctrl, "Data Link Layer Link Active not set in 1000 msec\n");
 }
 
-static int hpc_check_lnk_status(struct controller *ctrl)
+int pciehp_check_link_status(struct controller *ctrl)
 {
 	u16 lnk_status;
 	int retval = 0;
@@ -305,7 +305,7 @@ static int hpc_check_lnk_status(struct controller *ctrl)
 	return retval;
 }
 
-static int hpc_get_attention_status(struct slot *slot, u8 *status)
+int pciehp_get_attention_status(struct slot *slot, u8 *status)
 {
 	struct controller *ctrl = slot->ctrl;
 	u16 slot_ctrl;
@@ -344,7 +344,7 @@ static int hpc_get_attention_status(struct slot *slot, u8 *status)
 	return 0;
 }
 
-static int hpc_get_power_status(struct slot *slot, u8 *status)
+int pciehp_get_power_status(struct slot *slot, u8 *status)
 {
 	struct controller *ctrl = slot->ctrl;
 	u16 slot_ctrl;
@@ -376,7 +376,7 @@ static int hpc_get_power_status(struct slot *slot, u8 *status)
 	return retval;
 }
 
-static int hpc_get_latch_status(struct slot *slot, u8 *status)
+int pciehp_get_latch_status(struct slot *slot, u8 *status)
 {
 	struct controller *ctrl = slot->ctrl;
 	u16 slot_status;
@@ -392,7 +392,7 @@ static int hpc_get_latch_status(struct slot *slot, u8 *status)
 	return 0;
 }
 
-static int hpc_get_adapter_status(struct slot *slot, u8 *status)
+int pciehp_get_adapter_status(struct slot *slot, u8 *status)
 {
 	struct controller *ctrl = slot->ctrl;
 	u16 slot_status;
@@ -408,7 +408,7 @@ static int hpc_get_adapter_status(struct slot *slot, u8 *status)
 	return 0;
 }
 
-static int hpc_query_power_fault(struct slot *slot)
+int pciehp_query_power_fault(struct slot *slot)
 {
 	struct controller *ctrl = slot->ctrl;
 	u16 slot_status;
@@ -422,7 +422,7 @@ static int hpc_query_power_fault(struct slot *slot)
 	return !!(slot_status & PCI_EXP_SLTSTA_PFD);
 }
 
-static int hpc_set_attention_status(struct slot *slot, u8 value)
+int pciehp_set_attention_status(struct slot *slot, u8 value)
 {
 	struct controller *ctrl = slot->ctrl;
 	u16 slot_cmd;
@@ -450,7 +450,7 @@ static int hpc_set_attention_status(struct slot *slot, u8 value)
 	return rc;
 }
 
-static void hpc_set_green_led_on(struct slot *slot)
+void pciehp_green_led_on(struct slot *slot)
 {
 	struct controller *ctrl = slot->ctrl;
 	u16 slot_cmd;
@@ -463,7 +463,7 @@ static void hpc_set_green_led_on(struct slot *slot)
 		 __func__, ctrl->cap_base + PCI_EXP_SLTCTL, slot_cmd);
 }
 
-static void hpc_set_green_led_off(struct slot *slot)
+void pciehp_green_led_off(struct slot *slot)
 {
 	struct controller *ctrl = slot->ctrl;
 	u16 slot_cmd;
@@ -476,7 +476,7 @@ static void hpc_set_green_led_off(struct slot *slot)
 		 __func__, ctrl->cap_base + PCI_EXP_SLTCTL, slot_cmd);
 }
 
-static void hpc_set_green_led_blink(struct slot *slot)
+void pciehp_green_led_blink(struct slot *slot)
 {
 	struct controller *ctrl = slot->ctrl;
 	u16 slot_cmd;
@@ -489,7 +489,7 @@ static void hpc_set_green_led_blink(struct slot *slot)
 		 __func__, ctrl->cap_base + PCI_EXP_SLTCTL, slot_cmd);
 }
 
-static int hpc_power_on_slot(struct slot * slot)
+int pciehp_power_on_slot(struct slot * slot)
 {
 	struct controller *ctrl = slot->ctrl;
 	u16 slot_cmd;
@@ -497,8 +497,6 @@ static int hpc_power_on_slot(struct slot * slot)
 	u16 slot_status;
 	int retval = 0;
 
-	ctrl_dbg(ctrl, "%s: slot->hp_slot %x\n", __func__, slot->hp_slot);
-
 	/* Clear sticky power-fault bit from previous power failures */
 	retval = pciehp_readw(ctrl, PCI_EXP_SLTSTA, &slot_status);
 	if (retval) {
@@ -539,7 +537,7 @@ static int hpc_power_on_slot(struct slot * slot)
 
 static inline int pcie_mask_bad_dllp(struct controller *ctrl)
 {
-	struct pci_dev *dev = ctrl->pci_dev;
+	struct pci_dev *dev = ctrl->pcie->port;
 	int pos;
 	u32 reg;
 
@@ -556,7 +554,7 @@ static inline int pcie_mask_bad_dllp(struct controller *ctrl)
 
 static inline void pcie_unmask_bad_dllp(struct controller *ctrl)
 {
-	struct pci_dev *dev = ctrl->pci_dev;
+	struct pci_dev *dev = ctrl->pcie->port;
 	u32 reg;
 	int pos;
 
@@ -570,7 +568,7 @@ static inline void pcie_unmask_bad_dllp(struct controller *ctrl)
 	pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg);
 }
 
-static int hpc_power_off_slot(struct slot * slot)
+int pciehp_power_off_slot(struct slot * slot)
 {
 	struct controller *ctrl = slot->ctrl;
 	u16 slot_cmd;
@@ -578,8 +576,6 @@ static int hpc_power_off_slot(struct slot * slot)
 	int retval = 0;
 	int changed;
 
-	ctrl_dbg(ctrl, "%s: slot->hp_slot %x\n", __func__, slot->hp_slot);
-
 	/*
 	 * Set Bad DLLP Mask bit in Correctable Error Mask
 	 * Register. This is the workaround against Bad DLLP error
@@ -614,8 +610,8 @@ static int hpc_power_off_slot(struct slot * slot)
 static irqreturn_t pcie_isr(int irq, void *dev_id)
 {
 	struct controller *ctrl = (struct controller *)dev_id;
+	struct slot *slot = ctrl->slot;
 	u16 detected, intr_loc;
-	struct slot *p_slot;
 
 	/*
 	 * In order to guarantee that all interrupt events are
@@ -656,29 +652,27 @@ static irqreturn_t pcie_isr(int irq, void *dev_id)
 	if (!(intr_loc & ~PCI_EXP_SLTSTA_CC))
 		return IRQ_HANDLED;
 
-	p_slot = pciehp_find_slot(ctrl, ctrl->slot_device_offset);
-
 	/* Check MRL Sensor Changed */
 	if (intr_loc & PCI_EXP_SLTSTA_MRLSC)
-		pciehp_handle_switch_change(p_slot);
+		pciehp_handle_switch_change(slot);
 
 	/* Check Attention Button Pressed */
 	if (intr_loc & PCI_EXP_SLTSTA_ABP)
-		pciehp_handle_attention_button(p_slot);
+		pciehp_handle_attention_button(slot);
 
 	/* Check Presence Detect Changed */
 	if (intr_loc & PCI_EXP_SLTSTA_PDC)
-		pciehp_handle_presence_change(p_slot);
+		pciehp_handle_presence_change(slot);
 
 	/* Check Power Fault Detected */
 	if ((intr_loc & PCI_EXP_SLTSTA_PFD) && !ctrl->power_fault_detected) {
 		ctrl->power_fault_detected = 1;
-		pciehp_handle_power_fault(p_slot);
+		pciehp_handle_power_fault(slot);
 	}
 	return IRQ_HANDLED;
 }
 
-static int hpc_get_max_lnk_speed(struct slot *slot, enum pci_bus_speed *value)
+int pciehp_get_max_link_speed(struct slot *slot, enum pci_bus_speed *value)
 {
 	struct controller *ctrl = slot->ctrl;
 	enum pcie_link_speed lnk_speed;
@@ -709,7 +703,7 @@ static int hpc_get_max_lnk_speed(struct slot *slot, enum pci_bus_speed *value)
 	return retval;
 }
 
-static int hpc_get_max_lnk_width(struct slot *slot,
+int pciehp_get_max_lnk_width(struct slot *slot,
 				 enum pcie_link_width *value)
 {
 	struct controller *ctrl = slot->ctrl;
@@ -759,7 +753,7 @@ static int hpc_get_max_lnk_width(struct slot *slot,
 	return retval;
 }
 
-static int hpc_get_cur_lnk_speed(struct slot *slot, enum pci_bus_speed *value)
+int pciehp_get_cur_link_speed(struct slot *slot, enum pci_bus_speed *value)
 {
 	struct controller *ctrl = slot->ctrl;
 	enum pcie_link_speed lnk_speed = PCI_SPEED_UNKNOWN;
@@ -791,7 +785,7 @@ static int hpc_get_cur_lnk_speed(struct slot *slot, enum pci_bus_speed *value)
 	return retval;
 }
 
-static int hpc_get_cur_lnk_width(struct slot *slot,
+int pciehp_get_cur_lnk_width(struct slot *slot,
 				 enum pcie_link_width *value)
 {
 	struct controller *ctrl = slot->ctrl;
@@ -842,30 +836,6 @@ static int hpc_get_cur_lnk_width(struct slot *slot,
 	return retval;
 }
 
-static void pcie_release_ctrl(struct controller *ctrl);
-static struct hpc_ops pciehp_hpc_ops = {
-	.power_on_slot			= hpc_power_on_slot,
-	.power_off_slot			= hpc_power_off_slot,
-	.set_attention_status		= hpc_set_attention_status,
-	.get_power_status		= hpc_get_power_status,
-	.get_attention_status		= hpc_get_attention_status,
-	.get_latch_status		= hpc_get_latch_status,
-	.get_adapter_status		= hpc_get_adapter_status,
-
-	.get_max_bus_speed		= hpc_get_max_lnk_speed,
-	.get_cur_bus_speed		= hpc_get_cur_lnk_speed,
-	.get_max_lnk_width		= hpc_get_max_lnk_width,
-	.get_cur_lnk_width		= hpc_get_cur_lnk_width,
-
-	.query_power_fault		= hpc_query_power_fault,
-	.green_led_on			= hpc_set_green_led_on,
-	.green_led_off			= hpc_set_green_led_off,
-	.green_led_blink		= hpc_set_green_led_blink,
-
-	.release_ctlr			= pcie_release_ctrl,
-	.check_lnk_status		= hpc_check_lnk_status,
-};
-
 int pcie_enable_notification(struct controller *ctrl)
 {
 	u16 cmd, mask;
@@ -930,23 +900,16 @@ static int pcie_init_slot(struct controller *ctrl)
 	if (!slot)
 		return -ENOMEM;
 
-	slot->hp_slot = 0;
 	slot->ctrl = ctrl;
-	slot->bus = ctrl->pci_dev->subordinate->number;
-	slot->device = ctrl->slot_device_offset + slot->hp_slot;
-	slot->hpc_ops = ctrl->hpc_ops;
-	slot->number = ctrl->first_slot;
 	mutex_init(&slot->lock);
 	INIT_DELAYED_WORK(&slot->work, pciehp_queue_pushbutton_work);
-	list_add(&slot->slot_list, &ctrl->slot_list);
+	ctrl->slot = slot;
 	return 0;
 }
 
 static void pcie_cleanup_slot(struct controller *ctrl)
 {
-	struct slot *slot;
-	slot = list_first_entry(&ctrl->slot_list, struct slot, slot_list);
-	list_del(&slot->slot_list);
+	struct slot *slot = ctrl->slot;
 	cancel_delayed_work(&slot->work);
 	flush_scheduled_work();
 	flush_workqueue(pciehp_wq);
@@ -957,7 +920,7 @@ static inline void dbg_ctrl(struct controller *ctrl)
 {
 	int i;
 	u16 reg16;
-	struct pci_dev *pdev = ctrl->pci_dev;
+	struct pci_dev *pdev = ctrl->pcie->port;
 
 	if (!pciehp_debug)
 		return;
@@ -980,7 +943,7 @@ static inline void dbg_ctrl(struct controller *ctrl)
 			  (unsigned long long)pci_resource_start(pdev, i));
 	}
 	ctrl_info(ctrl, "Slot Capabilities      : 0x%08x\n", ctrl->slot_cap);
-	ctrl_info(ctrl, "  Physical Slot Number : %d\n", ctrl->first_slot);
+	ctrl_info(ctrl, "  Physical Slot Number : %d\n", PSN(ctrl));
 	ctrl_info(ctrl, "  Attention Button     : %3s\n",
 		  ATTN_BUTTN(ctrl) ? "yes" : "no");
 	ctrl_info(ctrl, "  Power Controller     : %3s\n",
@@ -1014,10 +977,7 @@ struct controller *pcie_init(struct pcie_device *dev)
 		dev_err(&dev->device, "%s: Out of memory\n", __func__);
 		goto abort;
 	}
-	INIT_LIST_HEAD(&ctrl->slot_list);
-
 	ctrl->pcie = dev;
-	ctrl->pci_dev = pdev;
 	ctrl->cap_base = pci_find_capability(pdev, PCI_CAP_ID_EXP);
 	if (!ctrl->cap_base) {
 		ctrl_err(ctrl, "Cannot find PCI Express capability\n");
@@ -1029,11 +989,6 @@ struct controller *pcie_init(struct pcie_device *dev)
 	}
 
 	ctrl->slot_cap = slot_cap;
-	ctrl->first_slot = slot_cap >> 19;
-	ctrl->slot_device_offset = 0;
-	ctrl->num_slots = 1;
-	ctrl->hpc_ops = &pciehp_hpc_ops;
-	mutex_init(&ctrl->crit_sect);
 	mutex_init(&ctrl->ctrl_lock);
 	init_waitqueue_head(&ctrl->queue);
 	dbg_ctrl(ctrl);
@@ -1089,7 +1044,7 @@ abort:
 	return NULL;
 }
 
-void pcie_release_ctrl(struct controller *ctrl)
+void pciehp_release_ctrl(struct controller *ctrl)
 {
 	pcie_shutdown_notification(ctrl);
 	pcie_cleanup_slot(ctrl);
diff --git a/drivers/pci/hotplug/pciehp_pci.c b/drivers/pci/hotplug/pciehp_pci.c
index 02e24d63b3e..21733108add 100644
--- a/drivers/pci/hotplug/pciehp_pci.c
+++ b/drivers/pci/hotplug/pciehp_pci.c
@@ -63,27 +63,27 @@ static int __ref pciehp_add_bridge(struct pci_dev *dev)
 int pciehp_configure_device(struct slot *p_slot)
 {
 	struct pci_dev *dev;
-	struct pci_bus *parent = p_slot->ctrl->pci_dev->subordinate;
+	struct pci_bus *parent = p_slot->ctrl->pcie->port->subordinate;
 	int num, fn;
 	struct controller *ctrl = p_slot->ctrl;
 
-	dev = pci_get_slot(parent, PCI_DEVFN(p_slot->device, 0));
+	dev = pci_get_slot(parent, PCI_DEVFN(0, 0));
 	if (dev) {
 		ctrl_err(ctrl, "Device %s already exists "
-			 "at %04x:%02x:%02x, cannot hot-add\n", pci_name(dev),
-			 pci_domain_nr(parent), p_slot->bus, p_slot->device);
+			 "at %04x:%02x:00, cannot hot-add\n", pci_name(dev),
+			 pci_domain_nr(parent), parent->number);
 		pci_dev_put(dev);
 		return -EINVAL;
 	}
 
-	num = pci_scan_slot(parent, PCI_DEVFN(p_slot->device, 0));
+	num = pci_scan_slot(parent, PCI_DEVFN(0, 0));
 	if (num == 0) {
 		ctrl_err(ctrl, "No new device found\n");
 		return -ENODEV;
 	}
 
 	for (fn = 0; fn < 8; fn++) {
-		dev = pci_get_slot(parent, PCI_DEVFN(p_slot->device, fn));
+		dev = pci_get_slot(parent, PCI_DEVFN(0, fn));
 		if (!dev)
 			continue;
 		if ((dev->class >> 16) == PCI_BASE_CLASS_DISPLAY) {
@@ -111,19 +111,18 @@ int pciehp_unconfigure_device(struct slot *p_slot)
 	int j;
 	u8 bctl = 0;
 	u8 presence = 0;
-	struct pci_bus *parent = p_slot->ctrl->pci_dev->subordinate;
+	struct pci_bus *parent = p_slot->ctrl->pcie->port->subordinate;
 	u16 command;
 	struct controller *ctrl = p_slot->ctrl;
 
-	ctrl_dbg(ctrl, "%s: domain:bus:dev = %04x:%02x:%02x\n",
-		 __func__, pci_domain_nr(parent), p_slot->bus, p_slot->device);
-	ret = p_slot->hpc_ops->get_adapter_status(p_slot, &presence);
+	ctrl_dbg(ctrl, "%s: domain:bus:dev = %04x:%02x:00\n",
+		 __func__, pci_domain_nr(parent), parent->number);
+	ret = pciehp_get_adapter_status(p_slot, &presence);
 	if (ret)
 		presence = 0;
 
 	for (j = 0; j < 8; j++) {
-		struct pci_dev* temp = pci_get_slot(parent,
-				(p_slot->device << 3) | j);
+		struct pci_dev* temp = pci_get_slot(parent, PCI_DEVFN(0, j));
 		if (!temp)
 			continue;
 		if ((temp->class >> 16) == PCI_BASE_CLASS_DISPLAY) {
diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 10c0e62bd5a..2ce8f9ccc66 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -318,6 +318,8 @@ static int __init aer_service_init(void)
 {
 	if (pcie_aer_disable)
 		return -ENXIO;
+	if (!pci_msi_enabled())
+		return -ENXIO;
 	return pcie_port_service_register(&aerdriver);
 }
 
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index f289ca9bf18..745402e8e49 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -303,9 +303,6 @@ static void pcie_get_aspm_reg(struct pci_dev *pdev,
 	pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
 	pci_read_config_dword(pdev, pos + PCI_EXP_LNKCAP, &reg32);
 	info->support = (reg32 & PCI_EXP_LNKCAP_ASPMS) >> 10;
-	/* 00b and 10b are defined as "Reserved". */
-	if (info->support == PCIE_LINK_STATE_L1)
-		info->support = 0;
 	info->latency_encoding_l0s = (reg32 & PCI_EXP_LNKCAP_L0SEL) >> 12;
 	info->latency_encoding_l1  = (reg32 & PCI_EXP_LNKCAP_L1EL) >> 15;
 	pci_read_config_word(pdev, pos + PCI_EXP_LNKCTL, &reg16);
diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c
index fb45f5ee8df..454970d2d70 100644
--- a/drivers/platform/x86/acer-wmi.c
+++ b/drivers/platform/x86/acer-wmi.c
@@ -746,7 +746,9 @@ static acpi_status WMID_set_u32(u32 value, u32 cap, struct wmi_interface *iface)
 			return AE_BAD_PARAMETER;
 		if (quirks->mailled == 1) {
 			param = value ? 0x92 : 0x93;
+			i8042_lock_chip();
 			i8042_command(&param, 0x1059);
+			i8042_unlock_chip();
 			return 0;
 		}
 		break;
diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index c431198bdbc..82daa3c1dc9 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -14,7 +14,6 @@
 
 #include <linux/init.h>
 #include <linux/miscdevice.h>
-#include <linux/utsname.h>
 #include <linux/debugfs.h>
 #include <asm/ipl.h>
 #include <asm/sclp.h>
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 82b34893e5b..9a4dd5992f6 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -117,8 +117,6 @@ source "drivers/staging/vt6655/Kconfig"
 
 source "drivers/staging/vt6656/Kconfig"
 
-source "drivers/staging/cpc-usb/Kconfig"
-
 source "drivers/staging/udlfb/Kconfig"
 
 source "drivers/staging/hv/Kconfig"
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index b1cad0d9ba7..104f2f8897e 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -40,7 +40,6 @@ obj-$(CONFIG_USB_SERIAL_QUATECH_USB2)	+= quatech_usb2/
 obj-$(CONFIG_OCTEON_ETHERNET)	+= octeon/
 obj-$(CONFIG_VT6655)		+= vt6655/
 obj-$(CONFIG_VT6656)		+= vt6656/
-obj-$(CONFIG_USB_CPC)		+= cpc-usb/
 obj-$(CONFIG_FB_UDL)		+= udlfb/
 obj-$(CONFIG_HYPERV)		+= hv/
 obj-$(CONFIG_VME_BUS)		+= vme/
diff --git a/drivers/staging/cpc-usb/Kconfig b/drivers/staging/cpc-usb/Kconfig
deleted file mode 100644
index 2be0bc9c39d..00000000000
--- a/drivers/staging/cpc-usb/Kconfig
+++ /dev/null
@@ -1,4 +0,0 @@
-config USB_CPC
-	tristate "CPC CAN USB driver"
-	depends on USB && PROC_FS
-	default n
diff --git a/drivers/staging/cpc-usb/Makefile b/drivers/staging/cpc-usb/Makefile
deleted file mode 100644
index 3f83170a8fa..00000000000
--- a/drivers/staging/cpc-usb/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_USB_CPC)	+= cpc-usb.o
-
-cpc-usb-y := cpc-usb_drv.o sja2m16c_2.o
diff --git a/drivers/staging/cpc-usb/TODO b/drivers/staging/cpc-usb/TODO
deleted file mode 100644
index 9b1752fb9cd..00000000000
--- a/drivers/staging/cpc-usb/TODO
+++ /dev/null
@@ -1,10 +0,0 @@
-Things to do for this driver to get merged into the main portion of the
-kernel:
-	- checkpatch cleanups
-	- sparse clean
-	- remove proc code
-	- tie into CAN socket interfaces if possible
-	- figure out sane userspace api
-	- use linux's error codes
-
-Send patches to Greg Kroah-Hartman <greg@kroah.com>
diff --git a/drivers/staging/cpc-usb/cpc-usb_drv.c b/drivers/staging/cpc-usb/cpc-usb_drv.c
deleted file mode 100644
index c5eca46996f..00000000000
--- a/drivers/staging/cpc-usb/cpc-usb_drv.c
+++ /dev/null
@@ -1,1184 +0,0 @@
-/*
- * CPC-USB CAN Interface Kernel Driver
- *
- * Copyright (C) 2004-2009 EMS Dr. Thomas Wuensche
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published
- * by the Free Software Foundation; version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- */
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/poll.h>
-#include <linux/smp_lock.h>
-#include <linux/completion.h>
-#include <asm/uaccess.h>
-#include <linux/usb.h>
-
-
-#include <linux/proc_fs.h>
-
-#include "cpc.h"
-
-#include "cpc_int.h"
-#include "cpcusb.h"
-
-#include "sja2m16c.h"
-
-/* Version Information */
-#define DRIVER_AUTHOR  "Sebastian Haas <haas@ems-wuensche.com>"
-#define DRIVER_DESC    "CPC-USB Driver for Linux Kernel 2.6"
-#define DRIVER_VERSION CPC_DRIVER_VERSION
-
-MODULE_AUTHOR(DRIVER_AUTHOR);
-MODULE_DESCRIPTION(DRIVER_DESC);
-MODULE_VERSION(DRIVER_VERSION);
-MODULE_LICENSE("GPL v2");
-
-/* Define these values to match your devices */
-#define USB_CPCUSB_VENDOR_ID	0x12D6
-
-#define USB_CPCUSB_M16C_PRODUCT_ID    0x0888
-#define USB_CPCUSB_LPC2119_PRODUCT_ID 0x0444
-
-#define CPC_USB_PROC_DIR     CPC_PROC_DIR "cpc-usb"
-
-static struct proc_dir_entry *procDir;
-static struct proc_dir_entry *procEntry;
-
-/* Module parameters */
-static int debug;
-module_param(debug, int, S_IRUGO);
-
-/* table of devices that work with this driver */
-static struct usb_device_id cpcusb_table[] = {
-	{USB_DEVICE(USB_CPCUSB_VENDOR_ID, USB_CPCUSB_M16C_PRODUCT_ID)},
-	{USB_DEVICE(USB_CPCUSB_VENDOR_ID, USB_CPCUSB_LPC2119_PRODUCT_ID)},
-	{}			/* Terminating entry */
-};
-
-MODULE_DEVICE_TABLE(usb, cpcusb_table);
-
-/* use to prevent kernel panic if driver is unloaded
- * while a programm has still open the device
- */
-DECLARE_WAIT_QUEUE_HEAD(rmmodWq);
-atomic_t useCount;
-
-static CPC_USB_T *CPCUSB_Table[CPC_USB_CARD_CNT] = { 0 };
-static unsigned int CPCUsbCnt;
-
-/* prevent races between open() and disconnect() */
-static DECLARE_MUTEX(disconnect_sem);
-
-/* local function prototypes */
-static ssize_t cpcusb_read(struct file *file, char *buffer, size_t count,
-			   loff_t *ppos);
-static ssize_t cpcusb_write(struct file *file, const char *buffer,
-			    size_t count, loff_t *ppos);
-static unsigned int cpcusb_poll(struct file *file, poll_table * wait);
-static int cpcusb_open(struct inode *inode, struct file *file);
-static int cpcusb_release(struct inode *inode, struct file *file);
-
-static int cpcusb_probe(struct usb_interface *interface,
-			const struct usb_device_id *id);
-static void cpcusb_disconnect(struct usb_interface *interface);
-
-static void cpcusb_read_bulk_callback(struct urb *urb);
-static void cpcusb_write_bulk_callback(struct urb *urb);
-static void cpcusb_read_interrupt_callback(struct urb *urb);
-
-static int cpcusb_setup_intrep(CPC_USB_T *card);
-
-static struct file_operations cpcusb_fops = {
-	/*
-	 * The owner field is part of the module-locking
-	 * mechanism. The idea is that the kernel knows
-	 * which module to increment the use-counter of
-	 * BEFORE it calls the device's open() function.
-	 * This also means that the kernel can decrement
-	 * the use-counter again before calling release()
-	 * or should the open() function fail.
-	 */
-	.owner = THIS_MODULE,
-
-	.read = cpcusb_read,
-	.write = cpcusb_write,
-	.poll = cpcusb_poll,
-	.open = cpcusb_open,
-	.release = cpcusb_release,
-};
-
-/*
- * usb class driver info in order to get a minor number from the usb core,
- * and to have the device registered with devfs and the driver core
- */
-static struct usb_class_driver cpcusb_class = {
-	.name = "usb/cpc_usb%d",
-	.fops = &cpcusb_fops,
-	.minor_base = CPC_USB_BASE_MNR,
-};
-
-/* usb specific object needed to register this driver with the usb subsystem */
-static struct usb_driver cpcusb_driver = {
-	.name = "cpc-usb",
-	.probe = cpcusb_probe,
-	.disconnect = cpcusb_disconnect,
-	.id_table = cpcusb_table,
-};
-
-static int cpcusb_create_info_output(char *buf)
-{
-	int i = 0, j;
-
-	for (j = 0; j < CPC_USB_CARD_CNT; j++) {
-		if (CPCUSB_Table[j]) {
-			CPC_USB_T *card = CPCUSB_Table[j];
-			CPC_CHAN_T *chan = card->chan;
-
-			/* MINOR CHANNELNO BUSNO SLOTNO */
-			i += sprintf(&buf[i], "%d %s\n", chan->minor,
-				     card->serialNumber);
-		}
-	}
-
-	return i;
-}
-
-static int cpcusb_proc_read_info(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len = cpcusb_create_info_output(page);
-
-	if (len <= off + count)
-		*eof = 1;
-	*start = page + off;
-	len -= off;
-	if (len > count)
-		len = count;
-	if (len < 0)
-		len = 0;
-
-	return len;
-}
-
-/*
- * Remove CPC-USB and cleanup
- */
-static inline void cpcusb_delete(CPC_USB_T *card)
-{
-	if (card) {
-		if (card->chan) {
-			if (card->chan->buf)
-				vfree(card->chan->buf);
-
-			if (card->chan->CPCWait_q)
-				kfree(card->chan->CPCWait_q);
-
-			kfree(card->chan);
-		}
-
-		CPCUSB_Table[card->idx] = NULL;
-		kfree(card);
-	}
-}
-
-/*
- * setup the interrupt IN endpoint of a specific CPC-USB device
- */
-static int cpcusb_setup_intrep(CPC_USB_T *card)
-{
-	int retval = 0;
-	struct usb_endpoint_descriptor *ep;
-
-	ep = &card->interface->altsetting[0].endpoint[card->num_intr_in].desc;
-
-	card->intr_in_buffer[0] = 0;
-	card->free_slots = 15;	/* initial size */
-
-	/* setup the urb */
-	usb_fill_int_urb(card->intr_in_urb, card->udev,
-			 usb_rcvintpipe(card->udev, card->num_intr_in),
-			 card->intr_in_buffer,
-			 sizeof(card->intr_in_buffer),
-			 cpcusb_read_interrupt_callback,
-			 card,
-			 ep->bInterval);
-
-	card->intr_in_urb->status = 0;	/* needed! */
-
-	/* submit the urb */
-	retval = usb_submit_urb(card->intr_in_urb, GFP_KERNEL);
-
-	if (retval)
-		err("%s - failed submitting intr urb, error %d", __func__,
-		    retval);
-
-	return retval;
-}
-
-static int cpcusb_open(struct inode *inode, struct file *file)
-{
-	CPC_USB_T *card = NULL;
-	struct usb_interface *interface;
-	int subminor;
-	int j, retval = 0;
-
-	subminor = iminor(inode);
-
-	/* prevent disconnects */
-	down(&disconnect_sem);
-
-	interface = usb_find_interface(&cpcusb_driver, subminor);
-	if (!interface) {
-		err("%s - error, can't find device for minor %d",
-				__func__, subminor);
-		retval = CPC_ERR_NO_INTERFACE_PRESENT;
-		goto exit_no_device;
-	}
-
-	card = usb_get_intfdata(interface);
-	if (!card) {
-		retval = CPC_ERR_NO_INTERFACE_PRESENT;
-		goto exit_no_device;
-	}
-
-	/* lock this device */
-	down(&card->sem);
-
-	/* increment our usage count for the driver */
-	if (card->open) {
-		dbg("device already opened");
-		retval = CPC_ERR_CHANNEL_ALREADY_OPEN;
-		goto exit_on_error;
-	}
-
-	/* save our object in the file's private structure */
-	file->private_data = card;
-	for (j = 0; j < CPC_USB_URB_CNT; j++) {
-		usb_fill_bulk_urb(card->urbs[j].urb, card->udev,
-				  usb_rcvbulkpipe(card->udev, card->num_bulk_in),
-				  card->urbs[j].buffer, card->urbs[j].size,
-				  cpcusb_read_bulk_callback, card);
-
-		retval = usb_submit_urb(card->urbs[j].urb, GFP_KERNEL);
-
-		if (retval) {
-			err("%s - failed submitting read urb, error %d",
-			    __func__, retval);
-			retval = CPC_ERR_TRANSMISSION_FAILED;
-			goto exit_on_error;
-		}
-	}
-
-	info("%s - %d URB's submitted", __func__, j);
-
-	ResetBuffer(card->chan);
-
-	cpcusb_setup_intrep(card);
-	card->open = 1;
-
-	atomic_inc(&useCount);
-
-exit_on_error:
-	/* unlock this device */
-	up(&card->sem);
-
-exit_no_device:
-	up(&disconnect_sem);
-
-	return retval;
-}
-
-static unsigned int cpcusb_poll(struct file *file, poll_table * wait)
-{
-	CPC_USB_T *card = (CPC_USB_T *) file->private_data;
-	unsigned int retval = 0;
-
-	if (!card) {
-		err("%s - device object lost", __func__);
-		return -EIO;
-	}
-
-	poll_wait(file, card->chan->CPCWait_q, wait);
-
-	if (IsBufferNotEmpty(card->chan) || !(card->present))
-		retval |= (POLLIN | POLLRDNORM);
-
-	if (card->free_slots)
-		retval |= (POLLOUT | POLLWRNORM);
-
-	return retval;
-}
-
-static int cpcusb_release(struct inode *inode, struct file *file)
-{
-	CPC_USB_T *card = (CPC_USB_T *) file->private_data;
-	int j, retval = 0;
-
-	if (card == NULL) {
-		dbg("%s - object is NULL", __func__);
-		return CPC_ERR_NO_INTERFACE_PRESENT;
-	}
-
-	/* lock our device */
-	down(&card->sem);
-
-	if (!card->open) {
-		dbg("%s - device not opened", __func__);
-		retval = CPC_ERR_NO_INTERFACE_PRESENT;
-		goto exit_not_opened;
-	}
-
-	/* if device wasn't unplugged kill all urbs */
-	if (card->present) {
-		/* kill read urbs */
-		for (j = 0; j < CPC_USB_URB_CNT; j++) {
-			usb_kill_urb(card->urbs[j].urb);
-		}
-
-		/* kill irq urb */
-		usb_kill_urb(card->intr_in_urb);
-
-		/* kill write urbs */
-		for (j = 0; j < CPC_USB_URB_CNT; j++) {
-			if (atomic_read(&card->wrUrbs[j].busy)) {
-				usb_kill_urb(card->wrUrbs[j].urb);
-				wait_for_completion(&card->wrUrbs[j].finished);
-			}
-		}
-	}
-
-	atomic_dec(&useCount);
-
-	/* last process detached */
-	if (atomic_read(&useCount) == 0) {
-		wake_up(&rmmodWq);
-	}
-
-	if (!card->present && card->open) {
-		/* the device was unplugged before the file was released */
-		up(&card->sem);
-		cpcusb_delete(card);
-		return 0;
-	}
-
-	card->open = 0;
-
-exit_not_opened:
-	up(&card->sem);
-
-	return 0;
-}
-
-static ssize_t cpcusb_read(struct file *file, char *buffer, size_t count,
-			   loff_t *ppos)
-{
-	CPC_USB_T *card = (CPC_USB_T *) file->private_data;
-	CPC_CHAN_T *chan;
-	int retval = 0;
-
-	if (count < sizeof(CPC_MSG_T))
-		return CPC_ERR_UNKNOWN;
-
-	/* check if can read from the given address */
-	if (!access_ok(VERIFY_WRITE, buffer, count))
-		return CPC_ERR_UNKNOWN;
-
-	/* lock this object */
-	down(&card->sem);
-
-	/* verify that the device wasn't unplugged */
-	if (!card->present) {
-		up(&card->sem);
-		return CPC_ERR_NO_INTERFACE_PRESENT;
-	}
-
-	if (IsBufferEmpty(card->chan)) {
-		retval = 0;
-	} else {
-		chan = card->chan;
-
-#if 0
-		/* convert LPC2119 params back to SJA1000 params */
-		if (card->deviceRevision >= 0x0200
-		    && chan->buf[chan->oidx].type == CPC_MSG_T_CAN_PRMS) {
-			LPC2119_TO_SJA1000_Params(&chan->buf[chan->oidx]);
-		}
-#endif
-
-		if (copy_to_user(buffer, &chan->buf[chan->oidx], count) != 0) {
-			retval = CPC_ERR_IO_TRANSFER;
-		} else {
-			chan->oidx = (chan->oidx + 1) % CPC_MSG_BUF_CNT;
-			chan->WnR = 1;
-			retval = sizeof(CPC_MSG_T);
-		}
-	}
-/*	spin_unlock_irqrestore(&card->slock, flags); */
-
-	/* unlock the device */
-	up(&card->sem);
-
-	return retval;
-}
-
-#define SHIFT  1
-static inline void cpcusb_align_buffer_alignment(unsigned char *buf)
-{
-	/* CPC-USB uploads packed bytes. */
-	CPC_MSG_T *cpc = (CPC_MSG_T *) buf;
-	unsigned int i;
-
-	for (i = 0; i < cpc->length + (2 * sizeof(unsigned long)); i++) {
-		((unsigned char *) &cpc->msgid)[1 + i] =
-		    ((unsigned char *) &cpc->msgid)[1 + SHIFT + i];
-	}
-}
-
-static int cpc_get_buffer_count(CPC_CHAN_T *chan)
-{
-	/* check the buffer parameters */
-	if (chan->iidx == chan->oidx)
-		return !chan->WnR ? CPC_MSG_BUF_CNT : 0;
-	else if (chan->iidx >= chan->oidx)
-		return (chan->iidx - chan->oidx) % CPC_MSG_BUF_CNT;
-
-	return (chan->iidx + CPC_MSG_BUF_CNT - chan->oidx) % CPC_MSG_BUF_CNT;
-}
-
-static ssize_t cpcusb_write(struct file *file, const char *buffer,
-			    size_t count, loff_t *ppos)
-{
-	CPC_USB_T *card = (CPC_USB_T *) file->private_data;
-	CPC_USB_WRITE_URB_T *wrUrb = NULL;
-
-	ssize_t bytes_written = 0;
-	int retval = 0;
-	int j;
-
-	unsigned char *obuf = NULL;
-	unsigned char type = 0;
-	CPC_MSG_T *info = NULL;
-
-	dbg("%s - entered minor %d, count = %zu, present = %d",
-	    __func__, card->minor, count, card->present);
-
-	if (count > sizeof(CPC_MSG_T))
-		return CPC_ERR_UNKNOWN;
-
-	/* check if can read from the given address */
-	if (!access_ok(VERIFY_READ, buffer, count))
-		return CPC_ERR_UNKNOWN;
-
-	/* lock this object */
-	down(&card->sem);
-
-	/* verify that the device wasn't unplugged */
-	if (!card->present) {
-		retval = CPC_ERR_NO_INTERFACE_PRESENT;
-		goto exit;
-	}
-
-	/* verify that we actually have some data to write */
-	if (count == 0) {
-		dbg("%s - write request of 0 bytes", __func__);
-		goto exit;
-	}
-
-	if (card->free_slots <= 5) {
-		info = (CPC_MSG_T *) buffer;
-
-		if (info->type != CPC_CMD_T_CLEAR_CMD_QUEUE
-		    || card->free_slots <= 0) {
-			dbg("%s - send buffer full please try again %d",
-			    __func__, card->free_slots);
-			retval = CPC_ERR_CAN_NO_TRANSMIT_BUF;
-			goto exit;
-		}
-	}
-
-	/* Find a free write urb */
-	for (j = 0; j < CPC_USB_URB_CNT; j++) {
-		if (!atomic_read(&card->wrUrbs[j].busy)) {
-			wrUrb = &card->wrUrbs[j];	/* remember found URB */
-			atomic_set(&wrUrb->busy, 1);	/* lock this URB      */
-			init_completion(&wrUrb->finished);	/* init completion    */
-			dbg("WR URB no. %d started", j);
-			break;
-		}
-	}
-
-	/* don't found write urb say error */
-	if (!wrUrb) {
-		dbg("%s - no free send urb available", __func__);
-		retval = CPC_ERR_CAN_NO_TRANSMIT_BUF;
-		goto exit;
-	}
-	dbg("URB write req");
-
-	obuf = (unsigned char *) wrUrb->urb->transfer_buffer;
-
-	/* copy the data from userspace into our transfer buffer;
-	 * this is the only copy required.
-	 */
-	if (copy_from_user(&obuf[4], buffer, count) != 0) {
-		atomic_set(&wrUrb->busy, 0);	/* release urb */
-		retval = CPC_ERR_IO_TRANSFER;
-		goto exit;
-	}
-
-	/* check if it is a DRIVER information message, so we can
-	 * response to that message and not the USB
-	 */
-	info = (CPC_MSG_T *) &obuf[4];
-
-	bytes_written = 11 + info->length;
-	if (bytes_written >= wrUrb->size) {
-		retval = CPC_ERR_IO_TRANSFER;
-		goto exit;
-	}
-
-	switch (info->type) {
-	case CPC_CMD_T_CLEAR_MSG_QUEUE:
-		ResetBuffer(card->chan);
-		break;
-
-	case CPC_CMD_T_INQ_MSG_QUEUE_CNT:
-		retval = cpc_get_buffer_count(card->chan);
-		atomic_set(&wrUrb->busy, 0);
-
-		goto exit;
-
-	case CPC_CMD_T_INQ_INFO:
-		if (info->msg.info.source == CPC_INFOMSG_T_DRIVER) {
-			/* release urb cause we'll use it for driver
-			 * information
-			 */
-			atomic_set(&wrUrb->busy, 0);
-			if (IsBufferFull(card->chan)) {
-				retval = CPC_ERR_IO_TRANSFER;
-				goto exit;
-			}
-
-			/* it is a driver information request message and we have
-			 * free rx slots to store the response
-			 */
-			type = info->msg.info.type;
-			info = &card->chan->buf[card->chan->iidx];
-
-			info->type = CPC_MSG_T_INFO;
-			info->msg.info.source = CPC_INFOMSG_T_DRIVER;
-			info->msg.info.type = type;
-
-			switch (type) {
-			case CPC_INFOMSG_T_VERSION:
-				info->length = strlen(CPC_DRIVER_VERSION) + 2;
-				sprintf(info->msg.info.msg, "%s\n",
-					CPC_DRIVER_VERSION);
-				break;
-
-			case CPC_INFOMSG_T_SERIAL:
-				info->length = strlen(CPC_DRIVER_SERIAL) + 2;
-				sprintf(info->msg.info.msg, "%s\n",
-					CPC_DRIVER_SERIAL);
-				break;
-
-			default:
-				info->length = 2;
-				info->msg.info.type =
-				    CPC_INFOMSG_T_UNKNOWN_TYPE;
-			}
-
-			card->chan->WnR = 0;
-			card->chan->iidx =
-			    (card->chan->iidx + 1) % CPC_MSG_BUF_CNT;
-
-			retval = info->length;
-			goto exit;
-		}
-		break;
-	case CPC_CMD_T_CAN_PRMS:
-		/* Check the controller type. If it's the new CPC-USB, make sure if these are SJA1000 params */
-		if (info->msg.canparams.cc_type != SJA1000
-		    && info->msg.canparams.cc_type != M16C_BASIC
-		    && (card->productId == USB_CPCUSB_LPC2119_PRODUCT_ID
-			&& info->msg.canparams.cc_type != SJA1000)) {
-			/* don't forget to release the urb */
-			atomic_set(&wrUrb->busy, 0);
-			retval = CPC_ERR_WRONG_CONTROLLER_TYPE;
-			goto exit;
-		}
-		break;
-	}
-
-	/* just convert the params if it is an old CPC-USB with M16C controller */
-	if (card->productId == USB_CPCUSB_M16C_PRODUCT_ID) {
-		/* if it is a parameter message convert it from SJA1000 controller
-		 * settings to M16C Basic controller settings
-		 */
-		SJA1000_TO_M16C_BASIC_Params((CPC_MSG_T *) &obuf[4]);
-	}
-
-	/* don't forget the byte alignment */
-	cpcusb_align_buffer_alignment(&obuf[4]);
-
-	/* setup a the 4 byte header */
-	obuf[0] = obuf[1] = obuf[2] = obuf[3] = 0;
-
-	/* this urb was already set up, except for this write size */
-	wrUrb->urb->transfer_buffer_length = bytes_written + 4;
-
-	/* send the data out the bulk port */
-	/* a character device write uses GFP_KERNEL,
-	   unless a spinlock is held */
-	retval = usb_submit_urb(wrUrb->urb, GFP_KERNEL);
-	if (retval) {
-		atomic_set(&wrUrb->busy, 0);	/* release urb */
-		err("%s - failed submitting write urb, error %d",
-		    __func__, retval);
-	} else {
-		retval = bytes_written;
-	}
-
-exit:
-	/* unlock the device */
-	up(&card->sem);
-
-	dbg("%s - leaved", __func__);
-
-	return retval;
-}
-
-/*
- * callback for interrupt IN urb
- */
-static void cpcusb_read_interrupt_callback(struct urb *urb)
-{
-	CPC_USB_T *card = (CPC_USB_T *) urb->context;
-	int retval;
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->slock, flags);
-
-	if (!card->present) {
-		spin_unlock_irqrestore(&card->slock, flags);
-		info("%s - no such device", __func__);
-		return;
-	}
-
-	switch (urb->status) {
-	case 0: /* success */
-		card->free_slots = card->intr_in_buffer[1];
-		break;
-	case -ECONNRESET:
-	case -ENOENT:
-	case -ESHUTDOWN:
-		/* urb was killed */
-		spin_unlock_irqrestore(&card->slock, flags);
-		dbg("%s - intr urb killed", __func__);
-		return;
-	default:
-		info("%s - nonzero urb status %d", __func__, urb->status);
-		break;
-	}
-
-	retval = usb_submit_urb(urb, GFP_ATOMIC);
-	if (retval) {
-		err("%s - failed resubmitting intr urb, error %d",
-		    __func__, retval);
-	}
-
-	spin_unlock_irqrestore(&card->slock, flags);
-	wake_up_interruptible(card->chan->CPCWait_q);
-
-	return;
-}
-
-#define UN_SHIFT  1
-#define CPCMSG_HEADER_LEN_FIRMWARE   11
-static inline int cpcusb_unalign_and_copy_buffy(unsigned char *out,
-						unsigned char *in)
-{
-	unsigned int i, j;
-
-	for (i = 0; i < 3; i++)
-		out[i] = in[i];
-
-	for (j = 0; j < (in[1] + (CPCMSG_HEADER_LEN_FIRMWARE - 3)); j++)
-		out[j + i + UN_SHIFT] = in[j + i];
-
-	return i + j;
-}
-
-/*
- * callback for bulk IN urb
- */
-static void cpcusb_read_bulk_callback(struct urb *urb)
-{
-	CPC_USB_T *card = (CPC_USB_T *) urb->context;
-	CPC_CHAN_T *chan;
-	unsigned char *ibuf = urb->transfer_buffer;
-	int retval, msgCnt, start, again = 0;
-	unsigned long flags;
-
-	if (!card) {
-		err("%s - device object lost", __func__);
-		return;
-	}
-
-	spin_lock_irqsave(&card->slock, flags);
-
-	if (!card->present) {
-		spin_unlock_irqrestore(&card->slock, flags);
-		info("%s - no such device", __func__);
-		return;
-	}
-
-	switch (urb->status) {
-	case 0:		/* success */
-		break;
-	case -ECONNRESET:
-	case -ENOENT:
-	case -ESHUTDOWN:
-		/* urb was killed */
-		spin_unlock_irqrestore(&card->slock, flags);
-		dbg("%s - read urb killed", __func__);
-		return;
-	default:
-		info("%s - nonzero urb status %d", __func__, urb->status);
-		break;
-	}
-
-	if (urb->actual_length) {
-		msgCnt = ibuf[0] & ~0x80;
-		again = ibuf[0] & 0x80;
-
-		/* we have a 4 byte header */
-		start = 4;
-		chan = card->chan;
-		while (msgCnt) {
-			if (!(IsBufferFull(card->chan))) {
-				start +=
-				    cpcusb_unalign_and_copy_buffy((unsigned char *)
-							  &chan->buf[chan->iidx], &ibuf[start]);
-
-				if (start > urb->transfer_buffer_length) {
-					err("%d > %d", start, urb->transfer_buffer_length);
-					break;
-				}
-
-				chan->WnR = 0;
-				chan->iidx = (chan->iidx + 1) % CPC_MSG_BUF_CNT;
-				msgCnt--;
-			} else {
-				break;
-			}
-		}
-	}
-
-	usb_fill_bulk_urb(urb, card->udev,
-			  usb_rcvbulkpipe(card->udev, card->num_bulk_in),
-			  urb->transfer_buffer,
-			  urb->transfer_buffer_length,
-			  cpcusb_read_bulk_callback, card);
-
-	retval = usb_submit_urb(urb, GFP_ATOMIC);
-
-	if (retval) {
-		err("%s - failed resubmitting read urb, error %d", __func__, retval);
-	}
-
-	spin_unlock_irqrestore(&card->slock, flags);
-
-	wake_up_interruptible(card->chan->CPCWait_q);
-}
-
-/*
- * callback for bulk IN urb
- */
-static void cpcusb_write_bulk_callback(struct urb *urb)
-{
-	CPC_USB_T *card = (CPC_USB_T *) urb->context;
-	unsigned long flags;
-	int j;
-
-	spin_lock_irqsave(&card->slock, flags);
-
-	/* find this urb */
-	for (j = 0; j < CPC_USB_URB_CNT; j++) {
-		if (card->wrUrbs[j].urb == urb) {
-			dbg("URB found no. %d", j);
-			/* notify anyone waiting that the write has finished */
-			complete(&card->wrUrbs[j].finished);
-			atomic_set(&card->wrUrbs[j].busy, 0);
-			break;
-		}
-	}
-
-	switch (urb->status) {
-	case 0:		/* success */
-		break;
-	case -ECONNRESET:
-	case -ENOENT:
-	case -ESHUTDOWN:
-		/* urb was killed */
-		spin_unlock_irqrestore(&card->slock, flags);
-		dbg("%s - write urb no. %d killed", __func__, j);
-		return;
-	default:
-		info("%s - nonzero urb status %d", __func__, urb->status);
-		break;
-	}
-
-	spin_unlock_irqrestore(&card->slock, flags);
-
-	wake_up_interruptible(card->chan->CPCWait_q);
-}
-
-static inline int cpcusb_get_free_slot(void)
-{
-	int i;
-
-	for (i = 0; i < CPC_USB_CARD_CNT; i++) {
-		if (!CPCUSB_Table[i])
-			return i;
-	}
-
-	return -1;
-}
-
-/*
- * probe function for new CPC-USB devices
- */
-static int cpcusb_probe(struct usb_interface *interface,
-			const struct usb_device_id *id)
-{
-	CPC_USB_T *card = NULL;
-	CPC_CHAN_T *chan = NULL;
-
-	struct usb_device *udev = interface_to_usbdev(interface);
-	struct usb_host_interface *iface_desc;
-	struct usb_endpoint_descriptor *endpoint;
-
-	int i, j, retval = -ENOMEM, slot;
-
-	slot = cpcusb_get_free_slot();
-	if (slot < 0) {
-		info("No more devices supported");
-		return -ENOMEM;
-	}
-
-	/* allocate memory for our device state and initialize it */
-	card = kzalloc(sizeof(CPC_USB_T), GFP_KERNEL);
-	if (!card) {
-		err("Out of memory");
-		return -ENOMEM;
-	}
-	CPCUSB_Table[slot] = card;
-
-	/* allocate and initialize the channel struct */
-	card->chan = kmalloc(sizeof(CPC_CHAN_T), GFP_KERNEL);
-	if (!card->chan) {
-		kfree(card);
-		err("Out of memory");
-		return -ENOMEM;
-	}
-
-	chan = card->chan;
-	memset(chan, 0, sizeof(CPC_CHAN_T));
-	ResetBuffer(chan);
-
-	init_MUTEX(&card->sem);
-	spin_lock_init(&card->slock);
-
-	card->udev = udev;
-	card->interface = interface;
-	if (udev->descriptor.iSerialNumber) {
-		usb_string(udev, udev->descriptor.iSerialNumber, card->serialNumber,
-				   128);
-		info("Serial %s", card->serialNumber);
-	}
-
-	card->productId = udev->descriptor.idProduct;
-	info("Product %s",
-	     card->productId == USB_CPCUSB_LPC2119_PRODUCT_ID ?
-			 "CPC-USB/ARM7" : "CPC-USB/M16C");
-
-	/* set up the endpoint information */
-	/* check out the endpoints */
-	/* use only the first bulk-in and bulk-out endpoints */
-	iface_desc = &interface->altsetting[0];
-	for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) {
-		endpoint = &iface_desc->endpoint[i].desc;
-
-		if (!card->num_intr_in &&
-		    (endpoint->bEndpointAddress & USB_DIR_IN) &&
-		    ((endpoint->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK)
-		     == USB_ENDPOINT_XFER_INT)) {
-			card->intr_in_urb = usb_alloc_urb(0, GFP_KERNEL);
-			card->num_intr_in = 1;
-
-			if (!card->intr_in_urb) {
-				err("No free urbs available");
-				goto error;
-			}
-
-			dbg("intr_in urb %d", card->num_intr_in);
-		}
-
-		if (!card->num_bulk_in &&
-		    (endpoint->bEndpointAddress & USB_DIR_IN) &&
-		    ((endpoint->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK)
-		     == USB_ENDPOINT_XFER_BULK)) {
-			card->num_bulk_in = 2;
-			for (j = 0; j < CPC_USB_URB_CNT; j++) {
-				card->urbs[j].size = endpoint->wMaxPacketSize;
-				card->urbs[j].urb = usb_alloc_urb(0, GFP_KERNEL);
-				if (!card->urbs[j].urb) {
-					err("No free urbs available");
-					goto error;
-				}
-				card->urbs[j].buffer =
-				    usb_buffer_alloc(udev,
-						     card->urbs[j].size,
-						     GFP_KERNEL,
-						     &card->urbs[j].urb->transfer_dma);
-				if (!card->urbs[j].buffer) {
-					err("Couldn't allocate bulk_in_buffer");
-					goto error;
-				}
-			}
-			info("%s - %d reading URB's allocated",
-			     __func__, CPC_USB_URB_CNT);
-		}
-
-		if (!card->num_bulk_out &&
-		    !(endpoint->bEndpointAddress & USB_DIR_IN) &&
-		    ((endpoint->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK)
-		     == USB_ENDPOINT_XFER_BULK)) {
-
-			card->num_bulk_out = 2;
-
-			for (j = 0; j < CPC_USB_URB_CNT; j++) {
-				card->wrUrbs[j].size =
-				    endpoint->wMaxPacketSize;
-				card->wrUrbs[j].urb =
-				    usb_alloc_urb(0, GFP_KERNEL);
-				if (!card->wrUrbs[j].urb) {
-					err("No free urbs available");
-					goto error;
-				}
-				card->wrUrbs[j].buffer = usb_buffer_alloc(udev,
-							       card->wrUrbs[j].size, GFP_KERNEL,
-							       &card->wrUrbs[j].urb->transfer_dma);
-
-				if (!card->wrUrbs[j].buffer) {
-					err("Couldn't allocate bulk_out_buffer");
-					goto error;
-				}
-
-				usb_fill_bulk_urb(card->wrUrbs[j].urb, udev,
-						usb_sndbulkpipe(udev, endpoint->bEndpointAddress),
-						card->wrUrbs[j].buffer,
-						card->wrUrbs[j].size,
-						cpcusb_write_bulk_callback,
-						card);
-			}
-
-			info("%s - %d writing URB's allocated", __func__, CPC_USB_URB_CNT);
-		}
-	}
-
-	if (!(card->num_bulk_in && card->num_bulk_out)) {
-		err("Couldn't find both bulk-in and bulk-out endpoints");
-		goto error;
-	}
-
-	/* allow device read, write and ioctl */
-	card->present = 1;
-
-	/* we can register the device now, as it is ready */
-	usb_set_intfdata(interface, card);
-	retval = usb_register_dev(interface, &cpcusb_class);
-
-	if (retval) {
-		/* something prevented us from registering this driver */
-		err("Not able to get a minor for this device.");
-		usb_set_intfdata(interface, NULL);
-		goto error;
-	}
-
-	card->chan->minor = card->minor = interface->minor;
-
-	chan->buf = vmalloc(sizeof(CPC_MSG_T) * CPC_MSG_BUF_CNT);
-	if (chan->buf == NULL) {
-		err("Out of memory");
-		retval = -ENOMEM;
-		goto error;
-	}
-	info("Allocated memory for %d messages (%lu kbytes)",
-	     CPC_MSG_BUF_CNT, (long unsigned int)(sizeof(CPC_MSG_T) * CPC_MSG_BUF_CNT) / 1000);
-	memset(chan->buf, 0, sizeof(CPC_MSG_T) * CPC_MSG_BUF_CNT);
-
-	ResetBuffer(chan);
-
-	card->chan->CPCWait_q = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL);
-	if (!card->chan->CPCWait_q) {
-		err("Out of memory");
-		retval = -ENOMEM;
-		goto error;
-	}
-	init_waitqueue_head(card->chan->CPCWait_q);
-
-	CPCUSB_Table[slot] = card;
-	card->idx = slot;
-	CPCUsbCnt++;
-
-	/* let the user know what node this device is now attached to */
-	info("Device now attached to USB-%d", card->minor);
-	return 0;
-
-error:
-	for (j = 0; j < CPC_USB_URB_CNT; j++) {
-		if (card->urbs[j].buffer) {
-			usb_buffer_free(card->udev, card->urbs[j].size,
-					card->urbs[j].buffer,
-					card->urbs[j].urb->transfer_dma);
-			card->urbs[j].buffer = NULL;
-		}
-		if (card->urbs[j].urb) {
-			usb_free_urb(card->urbs[j].urb);
-			card->urbs[j].urb = NULL;
-		}
-	}
-
-	cpcusb_delete(card);
-	return retval;
-}
-
-/*
- * called by the usb core when the device is removed from the system
- */
-static void cpcusb_disconnect(struct usb_interface *interface)
-{
-	CPC_USB_T *card = NULL;
-	int minor, j;
-
-	/* prevent races with open() */
-	down(&disconnect_sem);
-
-	card = usb_get_intfdata(interface);
-	usb_set_intfdata(interface, NULL);
-
-	down(&card->sem);
-
-	/* prevent device read, write and ioctl */
-	card->present = 0;
-
-	minor = card->minor;
-
-	/* free all urbs and their buffers */
-	for (j = 0; j < CPC_USB_URB_CNT; j++) {
-		/* terminate an ongoing write */
-		if (atomic_read(&card->wrUrbs[j].busy)) {
-			usb_kill_urb(card->wrUrbs[j].urb);
-			wait_for_completion(&card->wrUrbs[j].finished);
-		}
-		usb_buffer_free(card->udev, card->wrUrbs[j].size,
-				card->wrUrbs[j].buffer,
-				card->wrUrbs[j].urb->transfer_dma);
-		usb_free_urb(card->wrUrbs[j].urb);
-	}
-	info("%d write URBs freed", CPC_USB_URB_CNT);
-
-	/* free all urbs and their buffers */
-	for (j = 0; j < CPC_USB_URB_CNT; j++) {
-		usb_buffer_free(card->udev, card->urbs[j].size,
-				card->urbs[j].buffer,
-				card->urbs[j].urb->transfer_dma);
-		usb_free_urb(card->urbs[j].urb);
-	}
-	info("%d read URBs freed", CPC_USB_URB_CNT);
-	usb_free_urb(card->intr_in_urb);
-
-	/* give back our minor */
-	usb_deregister_dev(interface, &cpcusb_class);
-
-	up(&card->sem);
-
-	/* if the device is opened, cpcusb_release will clean this up */
-	if (!card->open)
-		cpcusb_delete(card);
-	else
-		wake_up_interruptible(card->chan->CPCWait_q);
-
-	up(&disconnect_sem);
-
-	CPCUsbCnt--;
-	info("USB-%d now disconnected", minor);
-}
-
-static int __init CPCUsb_Init(void)
-{
-	int result, i;
-
-	info(DRIVER_DESC " v" DRIVER_VERSION);
-	info("Build on " __DATE__ " at " __TIME__);
-
-	for (i = 0; i < CPC_USB_CARD_CNT; i++)
-		CPCUSB_Table[i] = 0;
-
-	/* register this driver with the USB subsystem */
-	result = usb_register(&cpcusb_driver);
-	if (result) {
-		err("usb_register failed. Error number %d", result);
-		return result;
-	}
-
-	procDir = proc_mkdir(CPC_USB_PROC_DIR, NULL);
-	if (!procDir) {
-		err("Could not create proc entry");
-	} else {
-		procEntry = create_proc_read_entry("info", 0444, procDir,
-						   cpcusb_proc_read_info,
-						   NULL);
-		if (!procEntry) {
-			err("Could not create proc entry %s", CPC_USB_PROC_DIR "/info");
-			remove_proc_entry(CPC_USB_PROC_DIR, NULL);
-			procDir = NULL;
-		}
-	}
-
-	return 0;
-}
-
-static void __exit CPCUsb_Exit(void)
-{
-	wait_event(rmmodWq, !atomic_read(&useCount));
-
-	/* deregister this driver with the USB subsystem */
-	usb_deregister(&cpcusb_driver);
-
-	if (procDir) {
-		if (procEntry)
-			remove_proc_entry("info", procDir);
-		remove_proc_entry(CPC_USB_PROC_DIR, NULL);
-	}
-}
-
-module_init(CPCUsb_Init);
-module_exit(CPCUsb_Exit);
diff --git a/drivers/staging/cpc-usb/cpc.h b/drivers/staging/cpc-usb/cpc.h
deleted file mode 100644
index b2fda5d14c1..00000000000
--- a/drivers/staging/cpc-usb/cpc.h
+++ /dev/null
@@ -1,417 +0,0 @@
-/*
- * CPC CAN Interface Definitions
- *
- * Copyright (C) 2000-2008 EMS Dr. Thomas Wuensche
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- */
-#ifndef CPC_HEADER
-#define CPC_HEADER
-
-/*
- * the maximum length of the union members within a CPC_MSG
- * this value can be defined by the customer, but has to be
- * >= 64 bytes
- * however, if not defined before, we set a length of 64 byte
- */
-#if !defined(CPC_MSG_LEN) || (CPC_MSG_LEN < 64)
-#undef CPC_MSG_LEN
-#define CPC_MSG_LEN 64
-#endif
-
-/*
- * Transmission of events from CPC interfaces to PC can be individually
- * controlled per event type. Default state is: don't transmit
- * Control values are constructed by bit-or of Subject and Action
- * and passed to CPC_Control()
- */
-
-/* Control-Values for CPC_Control() Command Subject Selection */
-#define CONTR_CAN_Message 0x04
-#define CONTR_Busload	  0x08
-#define	CONTR_CAN_State	  0x0C
-#define	CONTR_SendAck	  0x10
-#define	CONTR_Filter	  0x14
-#define CONTR_CmdQueue    0x18	/* reserved, do not use */
-#define CONTR_BusError    0x1C
-
-/* Control Command Actions */
-#define CONTR_CONT_OFF    0
-#define CONTR_CONT_ON     1
-#define CONTR_SING_ON     2
-/*
- * CONTR_SING_ON doesn't change CONTR_CONT_ON state, so it should be
- * read as: transmit at least once
- */
-
-/* defines for confirmed request */
-#define DO_NOT_CONFIRM 0
-#define DO_CONFIRM     1
-
-/* event flags */
-#define EVENT_READ 0x01
-#define EVENT_WRITE 0x02
-
-/*
- * Messages from CPC to PC contain a message object type field.
- * The following message types are sent by CPC and can be used in
- * handlers, others should be ignored.
- */
-#define CPC_MSG_T_RESYNC        0 /* Normally to be ignored */
-#define CPC_MSG_T_CAN           1 /* CAN data frame */
-#define CPC_MSG_T_BUSLOAD       2 /* Busload message */
-#define CPC_MSG_T_STRING        3 /* Normally to be ignored */
-#define CPC_MSG_T_CONTI         4 /* Normally to be ignored */
-#define CPC_MSG_T_MEM           7 /* Normally not to be handled */
-#define	CPC_MSG_T_RTR           8 /* CAN remote frame */
-#define CPC_MSG_T_TXACK	        9 /* Send acknowledge */
-#define CPC_MSG_T_POWERUP      10 /* Power-up message */
-#define	CPC_MSG_T_CMD_NO       11 /* Normally to be ignored */
-#define	CPC_MSG_T_CAN_PRMS     12 /* Actual CAN parameters */
-#define	CPC_MSG_T_ABORTED      13 /* Command aborted message */
-#define	CPC_MSG_T_CANSTATE     14 /* CAN state message */
-#define CPC_MSG_T_RESET        15 /* used to reset CAN-Controller */
-#define	CPC_MSG_T_XCAN         16 /* XCAN data frame */
-#define CPC_MSG_T_XRTR         17 /* XCAN remote frame */
-#define CPC_MSG_T_INFO         18 /* information strings */
-#define CPC_MSG_T_CONTROL      19 /* used for control of interface/driver behaviour */
-#define CPC_MSG_T_CONFIRM      20 /* response type for confirmed requests */
-#define CPC_MSG_T_OVERRUN      21 /* response type for overrun conditions */
-#define CPC_MSG_T_KEEPALIVE    22 /* response type for keep alive conditions */
-#define CPC_MSG_T_CANERROR     23 /* response type for bus error conditions */
-#define CPC_MSG_T_DISCONNECTED 24 /* response type for a disconnected interface */
-#define CPC_MSG_T_ERR_COUNTER  25 /* RX/TX error counter of CAN controller */
-
-#define CPC_MSG_T_FIRMWARE    100 /* response type for USB firmware download */
-
-/*
- * Messages from the PC to the CPC interface contain a command field
- * Most of the command types are wrapped by the library functions and have therefore
- * normally not to be used.
- * However, programmers who wish to circumvent the library and talk directly
- * to the drivers (mainly Linux programmers) can use the following
- * command types:
- */
-#define CPC_CMD_T_CAN                 1	/* CAN data frame */
-#define CPC_CMD_T_CONTROL             3	/* used for control of interface/driver behaviour */
-#define	CPC_CMD_T_CAN_PRMS            6	/* set CAN parameters */
-#define	CPC_CMD_T_CLEARBUF            8	/* clears input queue; this is depricated, use CPC_CMD_T_CLEAR_MSG_QUEUE instead */
-#define	CPC_CMD_T_INQ_CAN_PARMS      11	/* inquire actual CAN parameters */
-#define	CPC_CMD_T_FILTER_PRMS        12	/* set filter parameter */
-#define	CPC_CMD_T_RTR                13	/* CAN remote frame */
-#define	CPC_CMD_T_CANSTATE           14	/* CAN state message */
-#define	CPC_CMD_T_XCAN               15	/* XCAN data frame */
-#define CPC_CMD_T_XRTR               16	/* XCAN remote frame */
-#define CPC_CMD_T_RESET              17	/* used to reset CAN-Controller */
-#define CPC_CMD_T_INQ_INFO           18	/* miscellanous information strings */
-#define CPC_CMD_T_OPEN_CHAN          19	/* open a channel */
-#define CPC_CMD_T_CLOSE_CHAN         20	/* close a channel */
-#define CPC_CMD_T_CNTBUF             21	/* this is depricated, use CPC_CMD_T_INQ_MSG_QUEUE_CNT instead */
-#define CPC_CMD_T_CAN_EXIT          200 /* exit the CAN (disable interrupts; reset bootrate; reset output_cntr; mode = 1) */
-
-#define CPC_CMD_T_INQ_MSG_QUEUE_CNT  CPC_CMD_T_CNTBUF   /* inquires the count of elements in the message queue */
-#define CPC_CMD_T_INQ_ERR_COUNTER    25	                /* request the CAN controllers error counter */
-#define	CPC_CMD_T_CLEAR_MSG_QUEUE    CPC_CMD_T_CLEARBUF /* clear CPC_MSG queue */
-#define	CPC_CMD_T_CLEAR_CMD_QUEUE    28	                /* clear CPC_CMD queue */
-#define CPC_CMD_T_FIRMWARE          100                 /* reserved, must not be used */
-#define CPC_CMD_T_USB_RESET         101                 /* reserved, must not be used */
-#define CPC_CMD_T_WAIT_NOTIFY       102                 /* reserved, must not be used */
-#define CPC_CMD_T_WAIT_SETUP        103                 /* reserved, must not be used */
-#define	CPC_CMD_T_ABORT             255                 /* Normally not to be used */
-
-/* definitions for CPC_MSG_T_INFO information sources */
-#define CPC_INFOMSG_T_UNKNOWN_SOURCE 0
-#define CPC_INFOMSG_T_INTERFACE      1
-#define CPC_INFOMSG_T_DRIVER         2
-#define CPC_INFOMSG_T_LIBRARY        3
-
-/* information types */
-#define CPC_INFOMSG_T_UNKNOWN_TYPE   0
-#define CPC_INFOMSG_T_VERSION        1
-#define CPC_INFOMSG_T_SERIAL         2
-
-/* definitions for controller types */
-#define PCA82C200   1 /* Philips basic CAN controller, replaced by SJA1000 */
-#define SJA1000     2 /* Philips basic CAN controller */
-#define AN82527     3 /* Intel full CAN controller */
-#define M16C_BASIC  4 /* M16C controller running in basic CAN (not full CAN) mode */
-
-/* channel open error codes */
-#define CPC_ERR_NO_FREE_CHANNEL            -1	/* no more free space within the channel array */
-#define CPC_ERR_CHANNEL_ALREADY_OPEN       -2	/* the channel is already open */
-#define CPC_ERR_CHANNEL_NOT_ACTIVE         -3	/* access to a channel not active failed */
-#define CPC_ERR_NO_DRIVER_PRESENT          -4	/* no driver at the location searched by the library */
-#define CPC_ERR_NO_INIFILE_PRESENT         -5	/* the library could not find the inifile */
-#define CPC_ERR_WRONG_PARAMETERS           -6	/* wrong parameters in the inifile */
-#define CPC_ERR_NO_INTERFACE_PRESENT       -7	/* 1. The specified interface is not connected */
-						/* 2. The interface (mostly CPC-USB) was disconnected upon operation */
-#define CPC_ERR_NO_MATCHING_CHANNEL        -8	/* the driver couldn't find a matching channel */
-#define CPC_ERR_NO_BUFFER_AVAILABLE        -9	/* the driver couldn't allocate buffer for messages */
-#define CPC_ERR_NO_INTERRUPT               -10	/* the requested interrupt couldn't be claimed */
-#define CPC_ERR_NO_MATCHING_INTERFACE      -11	/* no interface type related to this channel was found */
-#define CPC_ERR_NO_RESOURCES               -12	/* the requested resources could not be claimed */
-#define CPC_ERR_SOCKET                     -13	/* error concerning TCP sockets */
-
-/* init error codes */
-#define CPC_ERR_WRONG_CONTROLLER_TYPE      -14	/* wrong CAN controller type within initialization */
-#define CPC_ERR_NO_RESET_MODE              -15	/* the controller could not be set into reset mode */
-#define CPC_ERR_NO_CAN_ACCESS              -16	/* the CAN controller could not be accessed */
-
-/* transmit error codes */
-#define CPC_ERR_CAN_WRONG_ID               -20	/* the provided CAN id is too big */
-#define CPC_ERR_CAN_WRONG_LENGTH           -21	/* the provided CAN length is too long */
-#define CPC_ERR_CAN_NO_TRANSMIT_BUF        -22	/* the transmit buffer was occupied */
-#define CPC_ERR_CAN_TRANSMIT_TIMEOUT       -23	/* The message could not be sent within a */
-						/* specified time */
-
-/* other error codes */
-#define CPC_ERR_SERVICE_NOT_SUPPORTED      -30	/* the requested service is not supported by the interface */
-#define CPC_ERR_IO_TRANSFER                -31	/* a transmission error down to the driver occurred */
-#define CPC_ERR_TRANSMISSION_FAILED        -32	/* a transmission error down to the interface occurred */
-#define CPC_ERR_TRANSMISSION_TIMEOUT       -33	/* a timeout occurred within transmission to the interface */
-#define CPC_ERR_OP_SYS_NOT_SUPPORTED       -35	/* the operating system is not supported */
-#define CPC_ERR_UNKNOWN                    -40	/* an unknown error ocurred (mostly IOCTL errors) */
-
-#define CPC_ERR_LOADING_DLL                -50	/* the library 'cpcwin.dll' could not be loaded */
-#define CPC_ERR_ASSIGNING_FUNCTION         -51	/* the specified function could not be assigned */
-#define CPC_ERR_DLL_INITIALIZATION         -52	/* the DLL was not initialized correctly */
-#define CPC_ERR_MISSING_LICFILE            -55	/* the file containing the licenses does not exist */
-#define CPC_ERR_MISSING_LICENSE            -56	/* a required license was not found */
-
-/* CAN state bit values. Ignore any bits not listed */
-#define CPC_CAN_STATE_BUSOFF     0x80
-#define CPC_CAN_STATE_ERROR      0x40
-
-/* Mask to help ignore undefined bits */
-#define CPC_CAN_STATE_MASK       0xc0
-
-/*
- * CAN-Message representation in a CPC_MS
- * Message object type is CPC_MSG_T_CAN or CPC_MSG_T_RTR
- * or CPC_MSG_T_XCAN or CPC_MSG_T_XRTR
- */
-typedef struct CPC_CAN_MSG {
-	u32 id;
-	u8 length;
-	u8 msg[8];
-} CPC_CAN_MSG_T;
-
-/* representation of the CAN parameters for the PCA82C200 controller */
-typedef struct CPC_PCA82C200_PARAMS {
-	u8 acc_code;	/* Acceptance-code for receive, Standard: 0 */
-	u8 acc_mask;	/* Acceptance-mask for receive, Standard: 0xff (everything) */
-	u8 btr0;	/* Bus-timing register 0 */
-	u8 btr1;	/* Bus-timing register 1 */
-	u8 outp_contr;	/* Output-control register */
-} CPC_PCA82C200_PARAMS_T;
-
-/* representation of the CAN parameters for the SJA1000 controller */
-typedef struct CPC_SJA1000_PARAMS {
-	u8 mode;	/* enables single or dual acceptance filtering */
-	u8 acc_code0;	/* Acceptance-code for receive, Standard: 0 */
-	u8 acc_code1;
-	u8 acc_code2;
-	u8 acc_code3;
-	u8 acc_mask0;	/* Acceptance-mask for receive, Standard: 0xff (everything) */
-	u8 acc_mask1;
-	u8 acc_mask2;
-	u8 acc_mask3;
-	u8 btr0;	/* Bus-timing register 0 */
-	u8 btr1;	/* Bus-timing register 1 */
-	u8 outp_contr;	/* Output-control register */
-} CPC_SJA1000_PARAMS_T;
-
-/*
- * representation of the CAN parameters for the M16C controller
- * in basic CAN mode (means no full CAN)
- */
-typedef struct CPC_M16C_BASIC_PARAMS {
-	u8 con0;
-	u8 con1;
-	u8 ctlr0;
-	u8 ctlr1;
-	u8 clk;
-	u8 acc_std_code0;
-	u8 acc_std_code1;
-	u8 acc_ext_code0;
-	u8 acc_ext_code1;
-	u8 acc_ext_code2;
-	u8 acc_ext_code3;
-	u8 acc_std_mask0;
-	u8 acc_std_mask1;
-	u8 acc_ext_mask0;
-	u8 acc_ext_mask1;
-	u8 acc_ext_mask2;
-	u8 acc_ext_mask3;
-} CPC_M16C_BASIC_PARAMS_T;
-
-/* CAN params message representation */
-typedef struct CPC_CAN_PARAMS {
-	u8 cc_type;	/* represents the controller type */
-	union {
-		CPC_M16C_BASIC_PARAMS_T m16c_basic;
-		CPC_SJA1000_PARAMS_T sja1000;
-		CPC_PCA82C200_PARAMS_T pca82c200;
-	} cc_params;
-} CPC_CAN_PARAMS_T;
-
-/* CHAN init params representation */
-typedef struct CPC_CHAN_PARAMS {
-	int fd;
-} CPC_CHAN_PARAMS_T;
-
-/* CAN init params message representation */
-typedef struct CPC_INIT_PARAMS {
-	CPC_CHAN_PARAMS_T chanparams;
-	CPC_CAN_PARAMS_T canparams;
-} CPC_INIT_PARAMS_T;
-
-/* structure for confirmed message handling */
-typedef struct CPC_CONFIRM {
-	u8 result; /* error code */
-} CPC_CONFIRM_T;
-
-/* structure for information requests */
-typedef struct CPC_INFO {
-	u8 source;                 /* interface, driver or library */
-	u8 type;                   /* version or serial number */
-	char msg[CPC_MSG_LEN - 2]; /* string holding the requested information */
-} CPC_INFO_T;
-
-/*
- * OVERRUN
- * In general two types of overrun may occur.
- * A hardware overrun, where the CAN controller
- * lost a message, because the interrupt was
- * not handled before the next messgae comes in.
- * Or a software overrun, where i.e. a received
- * message could not be stored in the CPC_MSG
- * buffer.
- */
-
-/* After a software overrun has occurred
- * we wait until we have CPC_OVR_GAP slots
- * free in the CPC_MSG buffer.
- */
-#define CPC_OVR_GAP               10
-
-/*
- * Two types of software overrun may occur.
- * A received CAN message or a CAN state event
- * can cause an overrun.
- * Note: A CPC_CMD which would normally store
- * its result immediately in the CPC_MSG
- * queue may fail, because the message queue is full.
- * This will not generate an overrun message, but
- * will halt command execution, until this command
- * is able to store its message in the message queue.
- */
-#define CPC_OVR_EVENT_CAN       0x01
-#define CPC_OVR_EVENT_CANSTATE  0x02
-#define CPC_OVR_EVENT_BUSERROR  0x04
-
-/*
- * If the CAN controller lost a message
- * we indicate it with the highest bit
- * set in the count field.
- */
-#define CPC_OVR_HW              0x80
-
-/* structure for overrun conditions */
-typedef struct {
-	u8 event;
-	u8 count;
-} CPC_OVERRUN_T;
-
-/*
- * CAN errors
- * Each CAN controller type has different
- * registers to record errors.
- * Therefor a structure containing the specific
- * errors is set up for each controller here
- */
-
-/*
- * SJA1000 error structure
- * see the SJA1000 datasheet for detailed
- * explanation of the registers
- */
-typedef struct CPC_SJA1000_CAN_ERROR {
-	u8 ecc;   /* error capture code register */
-	u8 rxerr; /* RX error counter register */
-	u8 txerr; /* TX error counter register */
-} CPC_SJA1000_CAN_ERROR_T;
-
-/*
- * M16C error structure
- * see the M16C datasheet for detailed
- * explanation of the registers
- */
-typedef struct CPC_M16C_CAN_ERROR {
-	u8 tbd;	/* to be defined */
-} CPC_M16C_CAN_ERROR_T;
-
-/* structure for CAN error conditions */
-#define  CPC_CAN_ECODE_ERRFRAME   0x01
-typedef struct CPC_CAN_ERROR {
-	u8 ecode;
-	struct {
-		u8 cc_type; /* CAN controller type */
-		union {
-			CPC_SJA1000_CAN_ERROR_T sja1000;
-			CPC_M16C_CAN_ERROR_T m16c;
-		} regs;
-	} cc;
-} CPC_CAN_ERROR_T;
-
-/*
- * Structure containing RX/TX error counter.
- * This structure is used to request the
- * values of the CAN controllers TX and RX
- * error counter.
- */
-typedef struct CPC_CAN_ERR_COUNTER {
-	u8 rx;
-	u8 tx;
-} CPC_CAN_ERR_COUNTER_T;
-
-/* If this flag is set, transmissions from PC to CPC are protected against loss */
-#define CPC_SECURE_TO_CPC	0x01
-
-/* If this flag is set, transmissions from CPC to PC are protected against loss */
-#define CPC_SECURE_TO_PC	0x02
-
-/* If this flag is set, the CAN-transmit buffer is checked to be free before sending a message */
-#define CPC_SECURE_SEND		0x04
-
-/*
- * If this flag is set, the transmission complete flag is checked
- * after sending a message
- * THIS IS CURRENTLY ONLY IMPLEMENTED IN THE PASSIVE INTERFACE DRIVERS
- */
-#define CPC_SECURE_TRANSMIT	0x08
-
-/* main message type used between library and application */
-typedef struct CPC_MSG {
-	u8 type;	/* type of message */
-	u8 length;	/* length of data within union 'msg' */
-	u8 msgid;	/* confirmation handle */
-	u32 ts_sec;	/* timestamp in seconds */
-	u32 ts_nsec;	/* timestamp in nano seconds */
-	union {
-		u8 generic[CPC_MSG_LEN];
-		CPC_CAN_MSG_T canmsg;
-		CPC_CAN_PARAMS_T canparams;
-		CPC_CONFIRM_T confirmation;
-		CPC_INFO_T info;
-		CPC_OVERRUN_T overrun;
-		CPC_CAN_ERROR_T error;
-		CPC_CAN_ERR_COUNTER_T err_counter;
-		u8 busload;
-		u8 canstate;
-	} msg;
-} CPC_MSG_T;
-
-#endif /* CPC_HEADER */
diff --git a/drivers/staging/cpc-usb/cpc_int.h b/drivers/staging/cpc-usb/cpc_int.h
deleted file mode 100644
index 38674e9690a..00000000000
--- a/drivers/staging/cpc-usb/cpc_int.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * CPCLIB
- *
- * Copyright (C) 2000-2008 EMS Dr. Thomas Wuensche
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-#ifndef CPC_INT_H
-#define CPC_INT_H
-
-#include <linux/wait.h>
-
-#define CPC_MSG_BUF_CNT	1500
-
-#define CPC_PROC_DIR "driver/"
-
-#undef dbg
-#undef err
-#undef info
-
-/* Use our own dbg macro */
-#define dbg(format, arg...) do { if (debug) printk( KERN_INFO format "\n" , ## arg); } while (0)
-#define err(format, arg...) do { printk( KERN_INFO "ERROR " format "\n" , ## arg); } while (0)
-#define info(format, arg...) do { printk( KERN_INFO format "\n" , ## arg); } while (0)
-
-/* Macros help using of our buffers */
-#define IsBufferFull(x)     (!(x)->WnR) && ((x)->iidx == (x)->oidx)
-#define IsBufferEmpty(x)    ((x)->WnR) && ((x)->iidx == (x)->oidx)
-#define IsBufferNotEmpty(x) (!(x)->WnR) || ((x)->iidx != (x)->oidx)
-#define ResetBuffer(x)      do { (x)->oidx = (x)->iidx=0; (x)->WnR = 1; } while(0);
-
-#define CPC_BufWriteAllowed ((chan->oidx != chan->iidx) || chan->WnR)
-
-typedef void (*chan_write_byte_t) (void *chan, unsigned int reg,
-				   unsigned char val);
-typedef unsigned char (*chan_read_byte_t) (void *chan, unsigned int reg);
-
-typedef struct CPC_CHAN {
-	void __iomem * canBase;	/* base address of SJA1000 */
-	chan_read_byte_t read_byte;	/* CAN controller read access routine */
-	chan_write_byte_t write_byte;	/* CAN controller write access routine */
-	CPC_MSG_T *buf;		/* buffer for CPC msg */
-	unsigned int iidx;
-	unsigned int oidx;
-	unsigned int WnR;
-	unsigned int minor;
-	unsigned int locked;
-	unsigned int irqDisabled;
-
-	unsigned char cpcCtrlCANMessage;
-	unsigned char cpcCtrlCANState;
-	unsigned char cpcCtrlBUSState;
-
-	unsigned char controllerType;
-
-	unsigned long ovrTimeSec;
-	unsigned long ovrTimeNSec;
-	unsigned long ovrLockedBuffer;
-	CPC_OVERRUN_T ovr;
-
-	/* for debugging only */
-	unsigned int handledIrqs;
-	unsigned int lostMessages;
-
-	unsigned int sentStdCan;
-	unsigned int sentExtCan;
-	unsigned int sentStdRtr;
-	unsigned int sentExtRtr;
-
-	unsigned int recvStdCan;
-	unsigned int recvExtCan;
-	unsigned int recvStdRtr;
-	unsigned int recvExtRtr;
-
-	wait_queue_head_t *CPCWait_q;
-
-	void *private;
-} CPC_CHAN_T;
-
-#endif
diff --git a/drivers/staging/cpc-usb/cpcusb.h b/drivers/staging/cpc-usb/cpcusb.h
deleted file mode 100644
index 6bdf30be239..00000000000
--- a/drivers/staging/cpc-usb/cpcusb.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Header for CPC-USB Driver ********************
- * Copyright 1999, 2000, 2001
- *
- * Company:  EMS Dr. Thomas Wuensche
- *           Sonnenhang 3
- *           85304 Ilmmuenster
- *           Phone: +49-8441-490260
- *           Fax:   +49-8441-81860
- *           email: support@ems-wuensche.com
- *           WWW:   www.ems-wuensche.com
- */
-
-#ifndef CPCUSB_H
-#define CPCUSB_H
-
-#undef err
-#undef dbg
-#undef info
-
-/* Use our own dbg macro */
-#define dbg(format, arg...) do { if (debug) printk(KERN_INFO "CPC-USB: " format "\n" , ## arg); } while (0)
-#define info(format, arg...) do { printk(KERN_INFO "CPC-USB: " format "\n" , ## arg); } while (0)
-#define err(format, arg...) do { printk(KERN_INFO "CPC-USB(ERROR): " format "\n" , ## arg); } while (0)
-
-#define CPC_USB_CARD_CNT      4
-
-typedef struct CPC_USB_READ_URB {
-	unsigned char *buffer;	/* the buffer to send data */
-	size_t size;		/* the size of the send buffer */
-	struct urb *urb;	/* the urb used to send data */
-} CPC_USB_READ_URB_T;
-
-typedef struct CPC_USB_WRITE_URB {
-	unsigned char *buffer;	/* the buffer to send data */
-	size_t size;		/* the size of the send buffer */
-	struct urb *urb;	/* the urb used to send data */
-	atomic_t busy;		/* true if write urb is busy */
-	struct completion finished;	/* wait for the write to finish */
-} CPC_USB_WRITE_URB_T;
-
-#define CPC_USB_URB_CNT  10
-
-typedef struct CPC_USB {
-	struct usb_device *udev;	/* save off the usb device pointer */
-	struct usb_interface *interface;	/* the interface for this device */
-	unsigned char minor;	/* the starting minor number for this device */
-	unsigned char num_ports;	/* the number of ports this device has */
-	int num_intr_in;	/* number of interrupt in endpoints we have */
-	int num_bulk_in;	/* number of bulk in endpoints we have */
-	int num_bulk_out;	/* number of bulk out endpoints we have */
-
-	CPC_USB_READ_URB_T urbs[CPC_USB_URB_CNT];
-
-	unsigned char intr_in_buffer[4];	/* interrupt transfer buffer */
-	struct urb *intr_in_urb;	/* interrupt transfer urb */
-
-	CPC_USB_WRITE_URB_T wrUrbs[CPC_USB_URB_CNT];
-
-	int open;		/* if the port is open or not */
-	int present;		/* if the device is not disconnected */
-	struct semaphore sem;	/* locks this structure */
-
-	int free_slots;		/* free send slots of CPC-USB */
-	int idx;
-
-	spinlock_t slock;
-
-	char serialNumber[128];	/* serial number */
-	int productId;		/* product id to differ between M16C and LPC2119 */
-	CPC_CHAN_T *chan;
-} CPC_USB_T;
-
-#define CPCTable               CPCUSB_Table
-
-#define CPC_DRIVER_VERSION "0.724"
-#define CPC_DRIVER_SERIAL  "not applicable"
-
-#define OBUF_SIZE 255		/* 4096 */
-
-/* read timeouts -- RD_NAK_TIMEOUT * RD_EXPIRE = Number of seconds */
-#define RD_NAK_TIMEOUT (10*HZ)	/* Default number of X seconds to wait */
-#define RD_EXPIRE 12		/* Number of attempts to wait X seconds */
-
-#define CPC_USB_BASE_MNR 0	/* CPC-USB start at minor 0  */
-
-#endif
diff --git a/drivers/staging/cpc-usb/sja2m16c.h b/drivers/staging/cpc-usb/sja2m16c.h
deleted file mode 100644
index 654bd3fc91d..00000000000
--- a/drivers/staging/cpc-usb/sja2m16c.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef _SJA2M16C_H
-#define _SJA2M16C_H
-
-#include "cpc.h"
-
-#define BAUDRATE_TOLERANCE_PERCENT	1
-#define SAMPLEPOINT_TOLERANCE_PERCENT	5
-#define SAMPLEPOINT_UPPER_LIMIT		88
-
-/* M16C parameters */
-struct FIELD_C0CONR {
-	unsigned int brp:4;
-	unsigned int sam:1;
-	unsigned int pr:3;
-	unsigned int dummy:8;
-};
-struct FIELD_C1CONR {
-	unsigned int ph1:3;
-	unsigned int ph2:3;
-	unsigned int sjw:2;
-	unsigned int dummy:8;
-};
-typedef union C0CONR {
-	unsigned char c0con;
-	struct FIELD_C0CONR bc0con;
-} C0CONR_T;
-typedef union C1CONR {
-	unsigned char c1con;
-	struct FIELD_C1CONR bc1con;
-} C1CONR_T;
-
-#define SJA_TSEG1	((pParams->btr1 & 0x0f)+1)
-#define SJA_TSEG2	(((pParams->btr1 & 0x70)>>4)+1)
-#define SJA_BRP		((pParams->btr0 & 0x3f)+1)
-#define SJA_SJW		((pParams->btr0 & 0xc0)>>6)
-#define SJA_SAM		((pParams->btr1 & 0x80)>>7)
-int baudrate_m16c(int clk, int brp, int pr, int ph1, int ph2);
-int samplepoint_m16c(int brp, int pr, int ph1, int ph2);
-int SJA1000_TO_M16C_BASIC_Params(CPC_MSG_T *pMsg);
-
-#endif
diff --git a/drivers/staging/cpc-usb/sja2m16c_2.c b/drivers/staging/cpc-usb/sja2m16c_2.c
deleted file mode 100644
index bf0230fb778..00000000000
--- a/drivers/staging/cpc-usb/sja2m16c_2.c
+++ /dev/null
@@ -1,452 +0,0 @@
-/****************************************************************************
-*
-*      Copyright (c) 2003,2004 by EMS Dr. Thomas Wuensche
-*
-*                  - All rights reserved -
-*
-* This code is provided "as is" without warranty of any kind, either
-* expressed or implied, including but not limited to the liability
-* concerning the freedom from material defects, the fitness for parti-
-* cular purposes or the freedom of proprietary rights of third parties.
-*
-*****************************************************************************
-* Module name.: cpcusb
-*****************************************************************************
-* Include file: cpc.h
-*****************************************************************************
-* Project.....: Windows Driver Development Kit
-* Filename....: sja2m16c.cpp
-* Authors.....: (GU) Gerhard Uttenthaler
-*               (CS) Christian Schoett
-*****************************************************************************
-* Short descr.: converts baudrate between SJA1000 and M16C
-*****************************************************************************
-* Description.: handles the baudrate conversion from SJA1000 parameters to
-*               M16C parameters
-*****************************************************************************
-* Address     : EMS Dr. Thomas Wuensche
-*               Sonnenhang 3
-*               D-85304 Ilmmuenster
-*               Tel. : +49-8441-490260
-*               Fax. : +49-8441-81860
-*               email: support@ems-wuensche.com
-*****************************************************************************
-*                            History
-*****************************************************************************
-* Version  Date        Auth Remark
-*
-* 01.00    ??          GU   - initial release
-* 01.10    ??????????  CS   - adapted to fit into the USB Windows driver
-* 02.00    18.08.2004  GU   - improved the baudrate calculating algorithm
-*                           - implemented acceptance filtering
-* 02.10    10.09.2004  CS   - adapted to fit into the USB Windows driver
-*****************************************************************************
-*                            ToDo's
-*****************************************************************************
-*/
-
-/****************************************************************************/
-/*     I N C L U D E S
-*/
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/poll.h>
-#include <linux/smp_lock.h>
-#include <linux/completion.h>
-#include <asm/uaccess.h>
-#include <linux/usb.h>
-
-#include "cpc.h"
-#include "cpc_int.h"
-#include "cpcusb.h"
-
-#include "sja2m16c.h"
-
-/*********************************************************************/
-int baudrate_m16c(int clk, int brp, int pr, int ph1, int ph2)
-{
-	return (16000000 / (1 << clk)) / 2 / (brp + 1) / (1 + pr + 1 +
-							    ph1 + 1 + ph2 +
-							    1);
-}
-
-
-/*********************************************************************/
-int samplepoint_m16c(int brp, int pr, int ph1, int ph2)
-{
-	return (100 * (1 + pr + 1 + ph1 + 1)) / (1 + pr + 1 + ph1 + 1 +
-						  ph2 + 1);
-}
-
-
-/****************************************************************************
-* Function.....: SJA1000_TO_M16C_BASIC_Params
-*
-* Task.........: This routine converts SJA1000 CAN btr parameters into M16C
-*                parameters based on the sample point and the error. In
-*                addition it converts the acceptance filter parameters to
-*                suit the M16C parameters
-*
-* Parameters...: None
-*
-* Return values: None
-*
-* Comments.....:
-*****************************************************************************
-*                History
-*****************************************************************************
-* 19.01.2005  CS   - modifed the conversion of SJA1000 filter params into
-*                    M16C params. Due to compatibility reasons with the
-*                    older 82C200 CAN controller the SJA1000
-****************************************************************************/
-int SJA1000_TO_M16C_BASIC_Params(CPC_MSG_T * in)
-{
-	int sjaBaudrate;
-	int sjaSamplepoint;
-	int *baudrate_error;	// BRP[0..15], PR[0..7], PH1[0..7], PH2[0..7]
-	int *samplepoint_error;	// BRP[0..15], PR[0..7], PH1[0..7], PH2[0..7]
-	int baudrate_error_merk;
-	int clk, brp, pr, ph1, ph2;
-	int clk_merk, brp_merk, pr_merk, ph1_merk, ph2_merk;
-	int index;
-	unsigned char acc_code0, acc_code1, acc_code2, acc_code3;
-	unsigned char acc_mask0, acc_mask1, acc_mask2, acc_mask3;
-	CPC_MSG_T * out;
-	C0CONR_T c0con;
-	C1CONR_T c1con;
-	int tmpAccCode;
-	int tmpAccMask;
-
-	    // we have to convert the parameters into M16C parameters
-	    CPC_SJA1000_PARAMS_T * pParams;
-
-	    // check if the type is CAN parameters and if we have to convert the given params
-	    if (in->type != CPC_CMD_T_CAN_PRMS
-		|| in->msg.canparams.cc_type != SJA1000)
-		return 0;
-	pParams =
-	    (CPC_SJA1000_PARAMS_T *) & in->msg.canparams.cc_params.sja1000;
-	acc_code0 = pParams->acc_code0;
-	acc_code1 = pParams->acc_code1;
-	acc_code2 = pParams->acc_code2;
-	acc_code3 = pParams->acc_code3;
-	acc_mask0 = pParams->acc_mask0;
-	acc_mask1 = pParams->acc_mask1;
-	acc_mask2 = pParams->acc_mask2;
-	acc_mask3 = pParams->acc_mask3;
-
-#ifdef _DEBUG_OUTPUT_CAN_PARAMS
-	    info("acc_code0: %2.2Xh\n", acc_code0);
-	info("acc_code1: %2.2Xh\n", acc_code1);
-	info("acc_code2: %2.2Xh\n", acc_code2);
-	info("acc_code3: %2.2Xh\n", acc_code3);
-	info("acc_mask0: %2.2Xh\n", acc_mask0);
-	info("acc_mask1: %2.2Xh\n", acc_mask1);
-	info("acc_mask2: %2.2Xh\n", acc_mask2);
-	info("acc_mask3: %2.2Xh\n", acc_mask3);
-
-#endif	/*  */
-	    if (!
-		 (baudrate_error =
-		  (int *) vmalloc(sizeof(int) * 16 * 8 * 8 * 8 * 5))) {
-		err("Could not allocate memory\n");
-		return -3;
-	}
-	if (!
-	      (samplepoint_error =
-	       (int *) vmalloc(sizeof(int) * 16 * 8 * 8 * 8 * 5))) {
-		err("Could not allocate memory\n");
-		vfree(baudrate_error);
-		return -3;
-	}
-	memset(baudrate_error, 0xff, sizeof(baudrate_error));
-	memset(samplepoint_error, 0xff, sizeof(baudrate_error));
-	sjaBaudrate =
-	    16000000 / 2 / SJA_BRP / (1 + SJA_TSEG1 + SJA_TSEG2);
-	sjaSamplepoint =
-	    100 * (1 + SJA_TSEG1) / (1 + SJA_TSEG1 + SJA_TSEG2);
-	if (sjaBaudrate == 0) {
-		vfree(baudrate_error);
-		vfree(samplepoint_error);
-		return -2;
-	}
-
-#ifdef _DEBUG_OUTPUT_CAN_PARAMS
-	    info("\nStarting SJA CAN params\n");
-	info("-------------------------\n");
-	info("TS1     : %2.2Xh TS2 : %2.2Xh\n", SJA_TSEG1, SJA_TSEG2);
-	info("BTR0    : %2.2Xh BTR1: %2.2Xh\n", pParams->btr0,
-	      pParams->btr1);
-	info("Baudrate: %d.%dkBaud\n", sjaBaudrate / 1000,
-	      sjaBaudrate % 1000);
-	info("Sample P: 0.%d\n", sjaSamplepoint);
-	info("\n");
-
-#endif	/*  */
-	    c0con.bc0con.sam = SJA_SAM;
-	c1con.bc1con.sjw = SJA_SJW;
-
-	    // calculate errors for all baudrates
-	    index = 0;
-	for (clk = 0; clk < 5; clk++) {
-		for (brp = 0; brp < 16; brp++) {
-			for (pr = 0; pr < 8; pr++) {
-				for (ph1 = 0; ph1 < 8; ph1++) {
-					for (ph2 = 0; ph2 < 8; ph2++) {
-						baudrate_error[index] =
-						    100 *
-						    abs(baudrate_m16c
-							(clk, brp, pr, ph1,
-							 ph2) -
-							sjaBaudrate) /
-						    sjaBaudrate;
-						samplepoint_error[index] =
-						    abs(samplepoint_m16c
-							(brp, pr, ph1,
-							 ph2) -
-							sjaSamplepoint);
-
-#if 0
-						    info
-						    ("Baudrate      : %d kBaud\n",
-						     baudrate_m16c(clk,
-								   brp, pr,
-								   ph1,
-								   ph2));
-						info
-						    ("Baudrate Error: %d\n",
-						     baudrate_error
-						     [index]);
-						info
-						    ("Sample P Error: %d\n",
-						     samplepoint_error
-						     [index]);
-						info
-						    ("clk           : %d\n",
-						     clk);
-
-#endif	/*  */
-						    index++;
-					}
-				}
-			}
-		}
-	}
-
-	    // mark all baudrate_error entries which are outer limits
-	    index = 0;
-	for (clk = 0; clk < 5; clk++) {
-		for (brp = 0; brp < 16; brp++) {
-			for (pr = 0; pr < 8; pr++) {
-				for (ph1 = 0; ph1 < 8; ph1++) {
-					for (ph2 = 0; ph2 < 8; ph2++) {
-						if ((baudrate_error[index]
-						      >
-						      BAUDRATE_TOLERANCE_PERCENT)
-						     ||
-						     (samplepoint_error
-						       [index] >
-						       SAMPLEPOINT_TOLERANCE_PERCENT)
-						     ||
-						     (samplepoint_m16c
-						       (brp, pr, ph1,
-							ph2) >
-						       SAMPLEPOINT_UPPER_LIMIT))
-						{
-							baudrate_error
-							    [index] = -1;
-						} else
-						    if (((1 + pr + 1 +
-							  ph1 + 1 + ph2 +
-							  1) < 8)
-							||
-							((1 + pr + 1 +
-							  ph1 + 1 + ph2 +
-							  1) > 25)) {
-							baudrate_error
-							    [index] = -1;
-						}
-
-#if 0
-						    else {
-							info
-							    ("Baudrate      : %d kBaud\n",
-							     baudrate_m16c
-							     (clk, brp, pr,
-							      ph1, ph2));
-							info
-							    ("Baudrate Error: %d\n",
-							     baudrate_error
-							     [index]);
-							info
-							    ("Sample P Error: %d\n",
-							     samplepoint_error
-							     [index]);
-						}
-
-#endif	/*  */
-						    index++;
-					}
-				}
-			}
-		}
-	}
-
-	    // find list of minimum of baudrate_error within unmarked entries
-	    clk_merk = brp_merk = pr_merk = ph1_merk = ph2_merk = 0;
-	baudrate_error_merk = 100;
-	index = 0;
-	for (clk = 0; clk < 5; clk++) {
-		for (brp = 0; brp < 16; brp++) {
-			for (pr = 0; pr < 8; pr++) {
-				for (ph1 = 0; ph1 < 8; ph1++) {
-					for (ph2 = 0; ph2 < 8; ph2++) {
-						if (baudrate_error[index]
-						     != -1) {
-							if (baudrate_error
-							     [index] <
-							     baudrate_error_merk)
-							{
-								baudrate_error_merk
-								    =
-								    baudrate_error
-								    [index];
-								brp_merk =
-								    brp;
-								pr_merk =
-								    pr;
-								ph1_merk =
-								    ph1;
-								ph2_merk =
-								    ph2;
-								clk_merk =
-								    clk;
-
-#if 0
-								    info
-								    ("brp: %2.2Xh pr: %2.2Xh ph1: %2.2Xh ph2: %2.2Xh\n",
-								     brp,
-								     pr,
-								     ph1,
-								     ph2);
-								info
-								    ("Baudrate      : %d kBaud\n",
-								     baudrate_m16c
-								     (clk,
-								      brp,
-								      pr,
-								      ph1,
-								      ph2));
-								info
-								    ("Baudrate Error: %d\n",
-								     baudrate_error
-								     [index]);
-								info
-								    ("Sample P Error: %d\n",
-								     samplepoint_error
-								     [index]);
-
-#endif	/*  */
-							}
-						}
-						index++;
-					}
-				}
-			}
-		}
-	}
-	if (baudrate_error_merk == 100) {
-		info("ERROR: Could not convert CAN init parameter\n");
-		vfree(baudrate_error);
-		vfree(samplepoint_error);
-		return -1;
-	}
-
-	    // setting m16c CAN parameter
-	    c0con.bc0con.brp = brp_merk;
-	c0con.bc0con.pr = pr_merk;
-	c1con.bc1con.ph1 = ph1_merk;
-	c1con.bc1con.ph2 = ph2_merk;
-
-#ifdef _DEBUG_OUTPUT_CAN_PARAMS
-	    info("\nResulting M16C CAN params\n");
-	info("-------------------------\n");
-	info("clk     : %2.2Xh\n", clk_merk);
-	info("ph1     : %2.2Xh ph2: %2.2Xh\n", c1con.bc1con.ph1 + 1,
-	      c1con.bc1con.ph2 + 1);
-	info("pr      : %2.2Xh brp: %2.2Xh\n", c0con.bc0con.pr + 1,
-	      c0con.bc0con.brp + 1);
-	info("sjw     : %2.2Xh sam: %2.2Xh\n", c1con.bc1con.sjw,
-	      c0con.bc0con.sam);
-	info("co1     : %2.2Xh co0: %2.2Xh\n", c1con.c1con, c0con.c0con);
-	info("Baudrate: %d.%dBaud\n",
-	       baudrate_m16c(clk_merk, c0con.bc0con.brp, c0con.bc0con.pr,
-			     c1con.bc1con.ph1, c1con.bc1con.ph2) / 1000,
-	       baudrate_m16c(clk_merk, c0con.bc0con.brp, c0con.bc0con.pr,
-			      c1con.bc1con.ph1, c1con.bc1con.ph2) % 1000);
-	info("Sample P: 0.%d\n",
-	      samplepoint_m16c(c0con.bc0con.brp, c0con.bc0con.pr,
-			       c1con.bc1con.ph1, c1con.bc1con.ph2));
-	info("\n");
-
-#endif	/*  */
-	    out = in;
-	out->type = 6;
-	out->length = sizeof(CPC_M16C_BASIC_PARAMS_T) + 1;
-	out->msg.canparams.cc_type = M16C_BASIC;
-	out->msg.canparams.cc_params.m16c_basic.con0 = c0con.c0con;
-	out->msg.canparams.cc_params.m16c_basic.con1 = c1con.c1con;
-	out->msg.canparams.cc_params.m16c_basic.ctlr0 = 0x4C;
-	out->msg.canparams.cc_params.m16c_basic.ctlr1 = 0x00;
-	out->msg.canparams.cc_params.m16c_basic.clk = clk_merk;
-	out->msg.canparams.cc_params.m16c_basic.acc_std_code0 =
-	    acc_code0;
-	out->msg.canparams.cc_params.m16c_basic.acc_std_code1 = acc_code1;
-
-//      info("code0: 0x%2.2X, code1: 0x%2.2X\n", out->msg.canparams.cc_params.m16c_basic.acc_std_code0, out->msg.canparams.cc_params.m16c_basic.acc_std_code1);
-	    tmpAccCode = (acc_code1 >> 5) + (acc_code0 << 3);
-	out->msg.canparams.cc_params.m16c_basic.acc_std_code0 =
-	    (unsigned char) tmpAccCode;
-	out->msg.canparams.cc_params.m16c_basic.acc_std_code1 =
-	    (unsigned char) (tmpAccCode >> 8);
-
-//      info("code0: 0x%2.2X, code1: 0x%2.2X\n", out->msg.canparams.cc_params.m16c_basic.acc_std_code0, out->msg.canparams.cc_params.m16c_basic.acc_std_code1);
-	    out->msg.canparams.cc_params.m16c_basic.acc_std_mask0 =
-	    ~acc_mask0;
-	out->msg.canparams.cc_params.m16c_basic.acc_std_mask1 =
-	    ~acc_mask1;
-
-//      info("mask0: 0x%2.2X, mask1: 0x%2.2X\n", out->msg.canparams.cc_params.m16c_basic.acc_std_mask0, out->msg.canparams.cc_params.m16c_basic.acc_std_mask1);
-	    tmpAccMask = ((acc_mask1) >> 5) + ((acc_mask0) << 3);
-
-//      info("tmpAccMask: 0x%4.4X\n", tmpAccMask);
-	    out->msg.canparams.cc_params.m16c_basic.acc_std_mask0 =
-	    (unsigned char) ~tmpAccMask;
-	out->msg.canparams.cc_params.m16c_basic.acc_std_mask1 =
-	    (unsigned char) ~(tmpAccMask >> 8);
-
-//      info("mask0: 0x%2.2X, mask1: 0x%2.2X\n", out->msg.canparams.cc_params.m16c_basic.acc_std_mask0, out->msg.canparams.cc_params.m16c_basic.acc_std_mask1);
-	    out->msg.canparams.cc_params.m16c_basic.acc_ext_code0 =
-	    (unsigned char) tmpAccCode;
-	out->msg.canparams.cc_params.m16c_basic.acc_ext_code1 =
-	    (unsigned char) (tmpAccCode >> 8);
-	out->msg.canparams.cc_params.m16c_basic.acc_ext_code2 = acc_code2;
-	out->msg.canparams.cc_params.m16c_basic.acc_ext_code3 = acc_code3;
-	out->msg.canparams.cc_params.m16c_basic.acc_ext_mask0 =
-	    (unsigned char) ~tmpAccMask;
-	out->msg.canparams.cc_params.m16c_basic.acc_ext_mask1 =
-	    (unsigned char) ~(tmpAccMask >> 8);
-	out->msg.canparams.cc_params.m16c_basic.acc_ext_mask2 =
-	    ~acc_mask2;
-	out->msg.canparams.cc_params.m16c_basic.acc_ext_mask3 =
-	    ~acc_mask3;
-	vfree(baudrate_error);
-	vfree(samplepoint_error);
-	return 0;
-}
-
-
diff --git a/drivers/staging/go7007/Makefile b/drivers/staging/go7007/Makefile
index d14ea84a01f..1301caa7495 100644
--- a/drivers/staging/go7007/Makefile
+++ b/drivers/staging/go7007/Makefile
@@ -32,8 +32,3 @@ endif
 
 EXTRA_CFLAGS += -Idrivers/media/dvb/frontends
 EXTRA_CFLAGS += -Idrivers/media/dvb/dvb-core
-
-# Ubuntu 8.04 has CONFIG_SND undefined, so include lum sound/config.h too
-ifeq ($(CONFIG_SND),)
-EXTRA_CFLAGS += -include sound/config.h
-endif
diff --git a/drivers/usb/Kconfig b/drivers/usb/Kconfig
index ebd7237230e..240750881d2 100644
--- a/drivers/usb/Kconfig
+++ b/drivers/usb/Kconfig
@@ -22,7 +22,6 @@ config USB_ARCH_HAS_HCD
 	default y if PCMCIA && !M32R			# sl811_cs
 	default y if ARM				# SL-811
 	default y if SUPERH				# r8a66597-hcd
-	default y if MICROBLAZE
 	default PCI
 
 # many non-PCI SOC chips embed OHCI
diff --git a/drivers/usb/gadget/f_loopback.c b/drivers/usb/gadget/f_loopback.c
index eb6ddfc2085..6cb29d3df57 100644
--- a/drivers/usb/gadget/f_loopback.c
+++ b/drivers/usb/gadget/f_loopback.c
@@ -22,7 +22,6 @@
 /* #define VERBOSE_DEBUG */
 
 #include <linux/kernel.h>
-#include <linux/utsname.h>
 #include <linux/device.h>
 
 #include "g_zero.h"
diff --git a/drivers/usb/gadget/f_obex.c b/drivers/usb/gadget/f_obex.c
index 46d6266f30e..b4a3ba654ea 100644
--- a/drivers/usb/gadget/f_obex.c
+++ b/drivers/usb/gadget/f_obex.c
@@ -24,7 +24,6 @@
 /* #define VERBOSE_DEBUG */
 
 #include <linux/kernel.h>
-#include <linux/utsname.h>
 #include <linux/device.h>
 
 #include "u_serial.h"
diff --git a/drivers/usb/gadget/f_sourcesink.c b/drivers/usb/gadget/f_sourcesink.c
index bffe91d525f..09cba273d2d 100644
--- a/drivers/usb/gadget/f_sourcesink.c
+++ b/drivers/usb/gadget/f_sourcesink.c
@@ -22,7 +22,6 @@
 /* #define VERBOSE_DEBUG */
 
 #include <linux/kernel.h>
-#include <linux/utsname.h>
 #include <linux/device.h>
 
 #include "g_zero.h"
diff --git a/drivers/usb/gadget/u_audio.c b/drivers/usb/gadget/u_audio.c
index b5200d55145..8252595d619 100644
--- a/drivers/usb/gadget/u_audio.c
+++ b/drivers/usb/gadget/u_audio.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/utsname.h>
 #include <linux/device.h>
 #include <linux/delay.h>
 #include <linux/ctype.h>
diff --git a/drivers/usb/gadget/u_ether.c b/drivers/usb/gadget/u_ether.c
index f8751ff863c..2fc02bd9584 100644
--- a/drivers/usb/gadget/u_ether.c
+++ b/drivers/usb/gadget/u_ether.c
@@ -23,7 +23,6 @@
 /* #define VERBOSE_DEBUG */
 
 #include <linux/kernel.h>
-#include <linux/utsname.h>
 #include <linux/device.h>
 #include <linux/ctype.h>
 #include <linux/etherdevice.h>
diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 68fa0e43b78..8c075b2416b 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -912,6 +912,7 @@ static void sierra_release(struct usb_serial *serial)
 	}
 }
 
+#ifdef CONFIG_PM
 static void stop_read_write_urbs(struct usb_serial *serial)
 {
 	int i, j;
@@ -988,6 +989,10 @@ static int sierra_resume(struct usb_serial *serial)
 
 	return ec ? -EIO : 0;
 }
+#else
+#define sierra_suspend NULL
+#define sierra_resume NULL
+#endif
 
 static struct usb_serial_driver sierra_device = {
 	.driver = {
diff --git a/drivers/vlynq/vlynq.c b/drivers/vlynq/vlynq.c
index ba3d71f5c7d..9554ad5f9af 100644
--- a/drivers/vlynq/vlynq.c
+++ b/drivers/vlynq/vlynq.c
@@ -702,7 +702,7 @@ static int vlynq_probe(struct platform_device *pdev)
 	dev->mem_start = mem_res->start;
 	dev->mem_end = mem_res->end;
 
-	len = regs_res->end - regs_res->start;
+	len = resource_size(regs_res);
 	if (!request_mem_region(regs_res->start, len, dev_name(&dev->dev))) {
 		printk(KERN_ERR "%s: Can't request vlynq registers\n",
 		       dev_name(&dev->dev));
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 74e0723e90b..795233702a4 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -8,3 +8,12 @@ config 9P_FS
 	  See <http://v9fs.sf.net> for more information.
 
 	  If unsure, say N.
+
+config 9P_FSCACHE
+	bool "Enable 9P client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
+	help
+	  Choose Y here to enable persistent, read-only local
+	  caching support for 9p clients using FS-Cache
+
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index bc7f0d1551e..1a940ec7af6 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -8,5 +8,6 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	vfs_dir.o \
 	vfs_dentry.o \
 	v9fs.o \
-	fid.o \
+	fid.o
 
+9p-$(CONFIG_9P_FSCACHE) += cache.o
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
new file mode 100644
index 00000000000..51c94e26a34
--- /dev/null
+++ b/fs/9p/cache.c
@@ -0,0 +1,474 @@
+/*
+ * V9FS cache definitions.
+ *
+ *  Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/jiffies.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+
+#include "v9fs.h"
+#include "cache.h"
+
+#define CACHETAG_LEN  11
+
+struct kmem_cache *vcookie_cache;
+
+struct fscache_netfs v9fs_cache_netfs = {
+	.name 		= "9p",
+	.version 	= 0,
+};
+
+static void init_once(void *foo)
+{
+	struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
+	vcookie->fscache = NULL;
+	vcookie->qid = NULL;
+	inode_init_once(&vcookie->inode);
+}
+
+/**
+ * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
+ *			    vcookie to inode mapping
+ *
+ * Returns 0 on success.
+ */
+
+static int v9fs_init_vcookiecache(void)
+{
+	vcookie_cache = kmem_cache_create("vcookie_cache",
+					  sizeof(struct v9fs_cookie),
+					  0, (SLAB_RECLAIM_ACCOUNT|
+					      SLAB_MEM_SPREAD),
+					  init_once);
+	if (!vcookie_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * v9fs_destroy_vcookiecache - destroy the cache of vcookies
+ *
+ */
+
+static void v9fs_destroy_vcookiecache(void)
+{
+	kmem_cache_destroy(vcookie_cache);
+}
+
+int __v9fs_cache_register(void)
+{
+	int ret;
+	ret = v9fs_init_vcookiecache();
+	if (ret < 0)
+		return ret;
+
+	return fscache_register_netfs(&v9fs_cache_netfs);
+}
+
+void __v9fs_cache_unregister(void)
+{
+	v9fs_destroy_vcookiecache();
+	fscache_unregister_netfs(&v9fs_cache_netfs);
+}
+
+/**
+ * v9fs_random_cachetag - Generate a random tag to be associated
+ *			  with a new cache session.
+ *
+ * The value of jiffies is used for a fairly randomly cache tag.
+ */
+
+static
+int v9fs_random_cachetag(struct v9fs_session_info *v9ses)
+{
+	v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL);
+	if (!v9ses->cachetag)
+		return -ENOMEM;
+
+	return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies);
+}
+
+static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
+					   void *buffer, uint16_t bufmax)
+{
+	struct v9fs_session_info *v9ses;
+	uint16_t klen = 0;
+
+	v9ses = (struct v9fs_session_info *)cookie_netfs_data;
+	P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses,
+		   buffer, bufmax);
+
+	if (v9ses->cachetag)
+		klen = strlen(v9ses->cachetag);
+
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, v9ses->cachetag, klen);
+	P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag);
+	return klen;
+}
+
+const struct fscache_cookie_def v9fs_cache_session_index_def = {
+	.name 		= "9P.session",
+	.type 		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key 	= v9fs_cache_session_get_key,
+};
+
+void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
+{
+	/* If no cache session tag was specified, we generate a random one. */
+	if (!v9ses->cachetag)
+		v9fs_random_cachetag(v9ses);
+
+	v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
+						&v9fs_cache_session_index_def,
+						v9ses);
+	P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses,
+		   v9ses->fscache);
+}
+
+void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
+{
+	P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses,
+		   v9ses->fscache);
+	fscache_relinquish_cookie(v9ses->fscache, 0);
+	v9ses->fscache = NULL;
+}
+
+
+static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t bufmax)
+{
+	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+	memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path));
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode,
+		   vcookie->qid->path);
+	return sizeof(vcookie->qid->path);
+}
+
+static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
+				      uint64_t *size)
+{
+	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+	*size = i_size_read(&vcookie->inode);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode,
+		   *size);
+}
+
+static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
+					 void *buffer, uint16_t buflen)
+{
+	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+	memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version));
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode,
+		   vcookie->qid->version);
+	return sizeof(vcookie->qid->version);
+}
+
+static enum
+fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
+					    const void *buffer,
+					    uint16_t buflen)
+{
+	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+
+	if (buflen != sizeof(vcookie->qid->version))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	if (memcmp(buffer, &vcookie->qid->version,
+		   sizeof(vcookie->qid->version)))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
+{
+	struct v9fs_cookie *vcookie = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	for (;;) {
+		nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
+
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+const struct fscache_cookie_def v9fs_cache_inode_index_def = {
+	.name		= "9p.inode",
+	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key	= v9fs_cache_inode_get_key,
+	.get_attr	= v9fs_cache_inode_get_attr,
+	.get_aux	= v9fs_cache_inode_get_aux,
+	.check_aux	= v9fs_cache_inode_check_aux,
+	.now_uncached	= v9fs_cache_inode_now_uncached,
+};
+
+void v9fs_cache_inode_get_cookie(struct inode *inode)
+{
+	struct v9fs_cookie *vcookie;
+	struct v9fs_session_info *v9ses;
+
+	if (!S_ISREG(inode->i_mode))
+		return;
+
+	vcookie = v9fs_inode2cookie(inode);
+	if (vcookie->fscache)
+		return;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+						  &v9fs_cache_inode_index_def,
+						  vcookie);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
+		   vcookie->fscache);
+}
+
+void v9fs_cache_inode_put_cookie(struct inode *inode)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	if (!vcookie->fscache)
+		return;
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
+		   vcookie->fscache);
+
+	fscache_relinquish_cookie(vcookie->fscache, 0);
+	vcookie->fscache = NULL;
+}
+
+void v9fs_cache_inode_flush_cookie(struct inode *inode)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	if (!vcookie->fscache)
+		return;
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
+		   vcookie->fscache);
+
+	fscache_relinquish_cookie(vcookie->fscache, 1);
+	vcookie->fscache = NULL;
+}
+
+void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	struct p9_fid *fid;
+
+	if (!vcookie->fscache)
+		return;
+
+	spin_lock(&vcookie->lock);
+	fid = filp->private_data;
+	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+		v9fs_cache_inode_flush_cookie(inode);
+	else
+		v9fs_cache_inode_get_cookie(inode);
+
+	spin_unlock(&vcookie->lock);
+}
+
+void v9fs_cache_inode_reset_cookie(struct inode *inode)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	struct v9fs_session_info *v9ses;
+	struct fscache_cookie *old;
+
+	if (!vcookie->fscache)
+		return;
+
+	old = vcookie->fscache;
+
+	spin_lock(&vcookie->lock);
+	fscache_relinquish_cookie(vcookie->fscache, 1);
+
+	v9ses = v9fs_inode2v9ses(inode);
+	vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+						  &v9fs_cache_inode_index_def,
+						  vcookie);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
+		   inode, old, vcookie->fscache);
+
+	spin_unlock(&vcookie->lock);
+}
+
+int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	struct inode *inode = page->mapping->host;
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	BUG_ON(!vcookie->fscache);
+
+	if (PageFsCache(page)) {
+		if (fscache_check_page_write(vcookie->fscache, page)) {
+			if (!(gfp & __GFP_WAIT))
+				return 0;
+			fscache_wait_on_page_write(vcookie->fscache, page);
+		}
+
+		fscache_uncache_page(vcookie->fscache, page);
+		ClearPageFsCache(page);
+	}
+
+	return 1;
+}
+
+void __v9fs_fscache_invalidate_page(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	BUG_ON(!vcookie->fscache);
+
+	if (PageFsCache(page)) {
+		fscache_wait_on_page_write(vcookie->fscache, page);
+		BUG_ON(!PageLocked(page));
+		fscache_uncache_page(vcookie->fscache, page);
+		ClearPageFsCache(page);
+	}
+}
+
+static void v9fs_vfs_readpage_complete(struct page *page, void *data,
+				       int error)
+{
+	if (!error)
+		SetPageUptodate(page);
+
+	unlock_page(page);
+}
+
+/**
+ * __v9fs_readpage_from_fscache - read a page from cache
+ *
+ * Returns 0 if the pages are in cache and a BIO is submitted,
+ * 1 if the pages are not in cache and -error otherwise.
+ */
+
+int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	if (!vcookie->fscache)
+		return -ENOBUFS;
+
+	ret = fscache_read_or_alloc_page(vcookie->fscache,
+					 page,
+					 v9fs_vfs_readpage_complete,
+					 NULL,
+					 GFP_KERNEL);
+	switch (ret) {
+	case -ENOBUFS:
+	case -ENODATA:
+		P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret);
+		return 1;
+	case 0:
+		P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+		return ret;
+	default:
+		P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+		return ret;
+	}
+}
+
+/**
+ * __v9fs_readpages_from_fscache - read multiple pages from cache
+ *
+ * Returns 0 if the pages are in cache and a BIO is submitted,
+ * 1 if the pages are not in cache and -error otherwise.
+ */
+
+int __v9fs_readpages_from_fscache(struct inode *inode,
+				  struct address_space *mapping,
+				  struct list_head *pages,
+				  unsigned *nr_pages)
+{
+	int ret;
+	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
+	if (!vcookie->fscache)
+		return -ENOBUFS;
+
+	ret = fscache_read_or_alloc_pages(vcookie->fscache,
+					  mapping, pages, nr_pages,
+					  v9fs_vfs_readpage_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+	switch (ret) {
+	case -ENOBUFS:
+	case -ENODATA:
+		P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret);
+		return 1;
+	case 0:
+		BUG_ON(!list_empty(pages));
+		BUG_ON(*nr_pages != 0);
+		P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+		return ret;
+	default:
+		P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+		return ret;
+	}
+}
+
+/**
+ * __v9fs_readpage_to_fscache - write a page to the cache
+ *
+ */
+
+void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL);
+	P9_DPRINTK(P9_DEBUG_FSC, "ret =  %d", ret);
+	if (ret != 0)
+		v9fs_uncache_page(inode, page);
+}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
new file mode 100644
index 00000000000..a94192bfaee
--- /dev/null
+++ b/fs/9p/cache.h
@@ -0,0 +1,176 @@
+/*
+ * V9FS cache definitions.
+ *
+ *  Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#ifndef _9P_CACHE_H
+#ifdef CONFIG_9P_FSCACHE
+#include <linux/fscache.h>
+#include <linux/spinlock.h>
+
+extern struct kmem_cache *vcookie_cache;
+
+struct v9fs_cookie {
+	spinlock_t lock;
+	struct inode inode;
+	struct fscache_cookie *fscache;
+	struct p9_qid *qid;
+};
+
+static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
+{
+	return container_of(inode, struct v9fs_cookie, inode);
+}
+
+extern struct fscache_netfs v9fs_cache_netfs;
+extern const struct fscache_cookie_def v9fs_cache_session_index_def;
+extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
+
+extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses);
+extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses);
+
+extern void v9fs_cache_inode_get_cookie(struct inode *inode);
+extern void v9fs_cache_inode_put_cookie(struct inode *inode);
+extern void v9fs_cache_inode_flush_cookie(struct inode *inode);
+extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp);
+extern void v9fs_cache_inode_reset_cookie(struct inode *inode);
+
+extern int __v9fs_cache_register(void);
+extern void __v9fs_cache_unregister(void);
+
+extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp);
+extern void __v9fs_fscache_invalidate_page(struct page *page);
+extern int __v9fs_readpage_from_fscache(struct inode *inode,
+					struct page *page);
+extern int __v9fs_readpages_from_fscache(struct inode *inode,
+					 struct address_space *mapping,
+					 struct list_head *pages,
+					 unsigned *nr_pages);
+extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
+
+
+/**
+ * v9fs_cache_register - Register v9fs file system with the cache
+ */
+static inline int v9fs_cache_register(void)
+{
+	return __v9fs_cache_register();
+}
+
+/**
+ * v9fs_cache_unregister - Unregister v9fs from the cache
+ */
+static inline void v9fs_cache_unregister(void)
+{
+	__v9fs_cache_unregister();
+}
+
+static inline int v9fs_fscache_release_page(struct page *page,
+					    gfp_t gfp)
+{
+	return __v9fs_fscache_release_page(page, gfp);
+}
+
+static inline void v9fs_fscache_invalidate_page(struct page *page)
+{
+	__v9fs_fscache_invalidate_page(page);
+}
+
+static inline int v9fs_readpage_from_fscache(struct inode *inode,
+					     struct page *page)
+{
+	return __v9fs_readpage_from_fscache(inode, page);
+}
+
+static inline int v9fs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	return __v9fs_readpages_from_fscache(inode, mapping, pages,
+					     nr_pages);
+}
+
+static inline void v9fs_readpage_to_fscache(struct inode *inode,
+					    struct page *page)
+{
+	if (PageFsCache(page))
+		__v9fs_readpage_to_fscache(inode, page);
+}
+
+static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	fscache_uncache_page(vcookie->fscache, page);
+	BUG_ON(PageFsCache(page));
+}
+
+static inline void v9fs_vcookie_set_qid(struct inode *inode,
+					struct p9_qid *qid)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	spin_lock(&vcookie->lock);
+	vcookie->qid = qid;
+	spin_unlock(&vcookie->lock);
+}
+
+#else /* CONFIG_9P_FSCACHE */
+
+static inline int v9fs_cache_register(void)
+{
+	return 1;
+}
+
+static inline void v9fs_cache_unregister(void) {}
+
+static inline int v9fs_fscache_release_page(struct page *page,
+					    gfp_t gfp) {
+	return 1;
+}
+
+static inline void v9fs_fscache_invalidate_page(struct page *page) {}
+
+static inline int v9fs_readpage_from_fscache(struct inode *inode,
+					     struct page *page)
+{
+	return -ENOBUFS;
+}
+
+static inline int v9fs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	return -ENOBUFS;
+}
+
+static inline void v9fs_readpage_to_fscache(struct inode *inode,
+					    struct page *page)
+{}
+
+static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
+{}
+
+static inline void v9fs_vcookie_set_qid(struct inode *inode,
+					struct p9_qid *qid)
+{}
+
+#endif /* CONFIG_9P_FSCACHE */
+#endif /* _9P_CACHE_H */
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index f7003cfac63..cf62b05e296 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -34,21 +34,25 @@
 #include <net/9p/transport.h>
 #include "v9fs.h"
 #include "v9fs_vfs.h"
+#include "cache.h"
+
+static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
+static LIST_HEAD(v9fs_sessionlist);
 
 /*
-  * Option Parsing (code inspired by NFS code)
-  *  NOTE: each transport will parse its own options
-  */
+ * Option Parsing (code inspired by NFS code)
+ *  NOTE: each transport will parse its own options
+ */
 
 enum {
 	/* Options that take integer arguments */
 	Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
 	/* String options */
-	Opt_uname, Opt_remotename, Opt_trans,
+	Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag,
 	/* Options that take no arguments */
 	Opt_nodevmap,
 	/* Cache options */
-	Opt_cache_loose,
+	Opt_cache_loose, Opt_fscache,
 	/* Access options */
 	Opt_access,
 	/* Error token */
@@ -63,8 +67,10 @@ static const match_table_t tokens = {
 	{Opt_uname, "uname=%s"},
 	{Opt_remotename, "aname=%s"},
 	{Opt_nodevmap, "nodevmap"},
-	{Opt_cache_loose, "cache=loose"},
+	{Opt_cache, "cache=%s"},
 	{Opt_cache_loose, "loose"},
+	{Opt_fscache, "fscache"},
+	{Opt_cachetag, "cachetag=%s"},
 	{Opt_access, "access=%s"},
 	{Opt_err, NULL}
 };
@@ -89,16 +95,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 	v9ses->afid = ~0;
 	v9ses->debug = 0;
 	v9ses->cache = 0;
+#ifdef CONFIG_9P_FSCACHE
+	v9ses->cachetag = NULL;
+#endif
 
 	if (!opts)
 		return 0;
 
 	options = kstrdup(opts, GFP_KERNEL);
-	if (!options) {
-		P9_DPRINTK(P9_DEBUG_ERROR,
-			   "failed to allocate copy of option string\n");
-		return -ENOMEM;
-	}
+	if (!options)
+		goto fail_option_alloc;
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
@@ -143,16 +149,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_cache_loose:
 			v9ses->cache = CACHE_LOOSE;
 			break;
+		case Opt_fscache:
+			v9ses->cache = CACHE_FSCACHE;
+			break;
+		case Opt_cachetag:
+#ifdef CONFIG_9P_FSCACHE
+			v9ses->cachetag = match_strdup(&args[0]);
+#endif
+			break;
+		case Opt_cache:
+			s = match_strdup(&args[0]);
+			if (!s)
+				goto fail_option_alloc;
+
+			if (strcmp(s, "loose") == 0)
+				v9ses->cache = CACHE_LOOSE;
+			else if (strcmp(s, "fscache") == 0)
+				v9ses->cache = CACHE_FSCACHE;
+			else
+				v9ses->cache = CACHE_NONE;
+			kfree(s);
+			break;
 
 		case Opt_access:
 			s = match_strdup(&args[0]);
-			if (!s) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "failed to allocate copy"
-					   " of option argument\n");
-				ret = -ENOMEM;
-				break;
-			}
+			if (!s)
+				goto fail_option_alloc;
+
 			v9ses->flags &= ~V9FS_ACCESS_MASK;
 			if (strcmp(s, "user") == 0)
 				v9ses->flags |= V9FS_ACCESS_USER;
@@ -173,6 +196,11 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 	}
 	kfree(options);
 	return ret;
+
+fail_option_alloc:
+	P9_DPRINTK(P9_DEBUG_ERROR,
+		   "failed to allocate copy of option argument\n");
+	return -ENOMEM;
 }
 
 /**
@@ -200,6 +228,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	spin_lock(&v9fs_sessionlist_lock);
+	list_add(&v9ses->slist, &v9fs_sessionlist);
+	spin_unlock(&v9fs_sessionlist_lock);
+
 	v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
@@ -249,6 +281,11 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	else
 		fid->uid = ~0;
 
+#ifdef CONFIG_9P_FSCACHE
+	/* register the session for caching */
+	v9fs_cache_session_get_cookie(v9ses);
+#endif
+
 	return fid;
 
 error:
@@ -268,8 +305,18 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 		v9ses->clnt = NULL;
 	}
 
+#ifdef CONFIG_9P_FSCACHE
+	if (v9ses->fscache) {
+		v9fs_cache_session_put_cookie(v9ses);
+		kfree(v9ses->cachetag);
+	}
+#endif
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
+
+	spin_lock(&v9fs_sessionlist_lock);
+	list_del(&v9ses->slist);
+	spin_unlock(&v9fs_sessionlist_lock);
 }
 
 /**
@@ -286,25 +333,132 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 
 extern int v9fs_error_init(void);
 
+static struct kobject *v9fs_kobj;
+
+#ifdef CONFIG_9P_FSCACHE
 /**
- * v9fs_init - Initialize module
+ * caches_show - list caches associated with a session
+ *
+ * Returns the size of buffer written.
+ */
+
+static ssize_t caches_show(struct kobject *kobj,
+			   struct kobj_attribute *attr,
+			   char *buf)
+{
+	ssize_t n = 0, count = 0, limit = PAGE_SIZE;
+	struct v9fs_session_info *v9ses;
+
+	spin_lock(&v9fs_sessionlist_lock);
+	list_for_each_entry(v9ses, &v9fs_sessionlist, slist) {
+		if (v9ses->cachetag) {
+			n = snprintf(buf, limit, "%s\n", v9ses->cachetag);
+			if (n < 0) {
+				count = n;
+				break;
+			}
+
+			count += n;
+			limit -= n;
+		}
+	}
+
+	spin_unlock(&v9fs_sessionlist_lock);
+	return count;
+}
+
+static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches);
+#endif /* CONFIG_9P_FSCACHE */
+
+static struct attribute *v9fs_attrs[] = {
+#ifdef CONFIG_9P_FSCACHE
+	&v9fs_attr_cache.attr,
+#endif
+	NULL,
+};
+
+static struct attribute_group v9fs_attr_group = {
+	.attrs = v9fs_attrs,
+};
+
+/**
+ * v9fs_sysfs_init - Initialize the v9fs sysfs interface
+ *
+ */
+
+static int v9fs_sysfs_init(void)
+{
+	v9fs_kobj = kobject_create_and_add("9p", fs_kobj);
+	if (!v9fs_kobj)
+		return -ENOMEM;
+
+	if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) {
+		kobject_put(v9fs_kobj);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface
+ *
+ */
+
+static void v9fs_sysfs_cleanup(void)
+{
+	sysfs_remove_group(v9fs_kobj, &v9fs_attr_group);
+	kobject_put(v9fs_kobj);
+}
+
+/**
+ * init_v9fs - Initialize module
  *
  */
 
 static int __init init_v9fs(void)
 {
+	int err;
 	printk(KERN_INFO "Installing v9fs 9p2000 file system support\n");
 	/* TODO: Setup list of registered trasnport modules */
-	return register_filesystem(&v9fs_fs_type);
+	err = register_filesystem(&v9fs_fs_type);
+	if (err < 0) {
+		printk(KERN_ERR "Failed to register filesystem\n");
+		return err;
+	}
+
+	err = v9fs_cache_register();
+	if (err < 0) {
+		printk(KERN_ERR "Failed to register v9fs for caching\n");
+		goto out_fs_unreg;
+	}
+
+	err = v9fs_sysfs_init();
+	if (err < 0) {
+		printk(KERN_ERR "Failed to register with sysfs\n");
+		goto out_sysfs_cleanup;
+	}
+
+	return 0;
+
+out_sysfs_cleanup:
+	v9fs_sysfs_cleanup();
+
+out_fs_unreg:
+	unregister_filesystem(&v9fs_fs_type);
+
+	return err;
 }
 
 /**
- * v9fs_init - shutdown module
+ * exit_v9fs - shutdown module
  *
  */
 
 static void __exit exit_v9fs(void)
 {
+	v9fs_sysfs_cleanup();
+	v9fs_cache_unregister();
 	unregister_filesystem(&v9fs_fs_type);
 }
 
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 38762bf102a..019f4ccb70c 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -51,6 +51,7 @@ enum p9_session_flags {
 enum p9_cache_modes {
 	CACHE_NONE,
 	CACHE_LOOSE,
+	CACHE_FSCACHE,
 };
 
 /**
@@ -60,6 +61,8 @@ enum p9_cache_modes {
  * @debug: debug level
  * @afid: authentication handle
  * @cache: cache mode of type &p9_cache_modes
+ * @cachetag: the tag of the cache associated with this session
+ * @fscache: session cookie associated with FS-Cache
  * @options: copy of options string given by user
  * @uname: string user name to mount hierarchy as
  * @aname: mount specifier for remote hierarchy
@@ -68,7 +71,7 @@ enum p9_cache_modes {
  * @dfltgid: default numeric groupid to mount hierarchy as
  * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy
  * @clnt: reference to 9P network client instantiated for this session
- * @debugfs_dir: reference to debugfs_dir which can be used for add'l debug
+ * @slist: reference to list of registered 9p sessions
  *
  * This structure holds state for each session instance established during
  * a sys_mount() .
@@ -84,6 +87,10 @@ struct v9fs_session_info {
 	unsigned short debug;
 	unsigned int afid;
 	unsigned int cache;
+#ifdef CONFIG_9P_FSCACHE
+	char *cachetag;
+	struct fscache_cookie *fscache;
+#endif
 
 	char *uname;		/* user name to mount as */
 	char *aname;		/* name of remote hierarchy being mounted */
@@ -92,11 +99,9 @@ struct v9fs_session_info {
 	unsigned int dfltgid;	/* default gid for legacy support */
 	u32 uid;		/* if ACCESS_SINGLE, the uid that has access */
 	struct p9_client *clnt;	/* 9p client */
-	struct dentry *debugfs_dir;
+	struct list_head slist; /* list of sessions registered with v9fs */
 };
 
-extern struct dentry *v9fs_debugfs_root;
-
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 									char *);
 void v9fs_session_close(struct v9fs_session_info *v9ses);
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index f0c7de78e20..3a7560e3586 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -44,7 +44,13 @@ extern const struct file_operations v9fs_dir_operations;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 
+#ifdef CONFIG_9P_FSCACHE
+struct inode *v9fs_alloc_inode(struct super_block *sb);
+void v9fs_destroy_inode(struct inode *inode);
+#endif
+
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+void v9fs_clear_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 92828281a30..90e38449f4b 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -38,6 +38,7 @@
 
 #include "v9fs.h"
 #include "v9fs_vfs.h"
+#include "cache.h"
 
 /**
  * v9fs_vfs_readpage - read an entire page in from 9P
@@ -52,18 +53,31 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
 	int retval;
 	loff_t offset;
 	char *buffer;
+	struct inode *inode;
 
+	inode = page->mapping->host;
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+
+	BUG_ON(!PageLocked(page));
+
+	retval = v9fs_readpage_from_fscache(inode, page);
+	if (retval == 0)
+		return retval;
+
 	buffer = kmap(page);
 	offset = page_offset(page);
 
 	retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset);
-	if (retval < 0)
+	if (retval < 0) {
+		v9fs_uncache_page(inode, page);
 		goto done;
+	}
 
 	memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval);
 	flush_dcache_page(page);
 	SetPageUptodate(page);
+
+	v9fs_readpage_to_fscache(inode, page);
 	retval = 0;
 
 done:
@@ -72,6 +86,78 @@ done:
 	return retval;
 }
 
+/**
+ * v9fs_vfs_readpages - read a set of pages from 9P
+ *
+ * @filp: file being read
+ * @mapping: the address space
+ * @pages: list of pages to read
+ * @nr_pages: count of pages to read
+ *
+ */
+
+static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
+			     struct list_head *pages, unsigned nr_pages)
+{
+	int ret = 0;
+	struct inode *inode;
+
+	inode = mapping->host;
+	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+
+	ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
+	if (ret == 0)
+		return ret;
+
+	ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
+	P9_DPRINTK(P9_DEBUG_VFS, "  = %d\n", ret);
+	return ret;
+}
+
+/**
+ * v9fs_release_page - release the private state associated with a page
+ *
+ * Returns 1 if the page can be released, false otherwise.
+ */
+
+static int v9fs_release_page(struct page *page, gfp_t gfp)
+{
+	if (PagePrivate(page))
+		return 0;
+
+	return v9fs_fscache_release_page(page, gfp);
+}
+
+/**
+ * v9fs_invalidate_page - Invalidate a page completely or partially
+ *
+ * @page: structure to page
+ * @offset: offset in the page
+ */
+
+static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+{
+	if (offset == 0)
+		v9fs_fscache_invalidate_page(page);
+}
+
+/**
+ * v9fs_launder_page - Writeback a dirty page
+ * Since the writes go directly to the server, we simply return a 0
+ * here to indicate success.
+ *
+ * Returns 0 on success.
+ */
+
+static int v9fs_launder_page(struct page *page)
+{
+	return 0;
+}
+
 const struct address_space_operations v9fs_addr_operations = {
       .readpage = v9fs_vfs_readpage,
+      .readpages = v9fs_vfs_readpages,
+      .releasepage = v9fs_release_page,
+      .invalidatepage = v9fs_invalidate_page,
+      .launder_page = v9fs_launder_page,
 };
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 68bf2af6c38..3902bf43a08 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/inet.h>
 #include <linux/list.h>
+#include <linux/pagemap.h>
 #include <asm/uaccess.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -40,6 +41,7 @@
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 #include "fid.h"
+#include "cache.h"
 
 static const struct file_operations v9fs_cached_file_operations;
 
@@ -72,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 			return err;
 		}
 		if (omode & P9_OTRUNC) {
-			inode->i_size = 0;
+			i_size_write(inode, 0);
 			inode->i_blocks = 0;
 		}
 		if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
@@ -85,6 +87,10 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		/* enable cached file options */
 		if(file->f_op == &v9fs_file_operations)
 			file->f_op = &v9fs_cached_file_operations;
+
+#ifdef CONFIG_9P_FSCACHE
+		v9fs_cache_inode_set_cookie(inode, file);
+#endif
 	}
 
 	return 0;
@@ -210,6 +216,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int origin = *offset;
+	unsigned long pg_start, pg_end;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
 		(int)count, (int)*offset);
@@ -225,7 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 		if (count < rsize)
 			rsize = count;
 
-		n = p9_client_write(fid, NULL, data+total, *offset+total,
+		n = p9_client_write(fid, NULL, data+total, origin+total,
 									rsize);
 		if (n <= 0)
 			break;
@@ -234,14 +241,14 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	} while (count > 0);
 
 	if (total > 0) {
-		invalidate_inode_pages2_range(inode->i_mapping, origin,
-								origin+total);
+		pg_start = origin >> PAGE_CACHE_SHIFT;
+		pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
+		if (inode->i_mapping && inode->i_mapping->nrpages)
+			invalidate_inode_pages2_range(inode->i_mapping,
+						      pg_start, pg_end);
 		*offset += total;
-	}
-
-	if (*offset > inode->i_size) {
-		inode->i_size = *offset;
-		inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
+		i_size_write(inode, i_size_read(inode) + total);
+		inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
 	}
 
 	if (n < 0)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 06a223d50a8..5947628aefe 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -40,6 +40,7 @@
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 #include "fid.h"
+#include "cache.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_ext;
@@ -197,6 +198,39 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
 	wstat->extension = NULL;
 }
 
+#ifdef CONFIG_9P_FSCACHE
+/**
+ * v9fs_alloc_inode - helper function to allocate an inode
+ * This callback is executed before setting up the inode so that we
+ * can associate a vcookie with each inode.
+ *
+ */
+
+struct inode *v9fs_alloc_inode(struct super_block *sb)
+{
+	struct v9fs_cookie *vcookie;
+	vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache,
+							 GFP_KERNEL);
+	if (!vcookie)
+		return NULL;
+
+	vcookie->fscache = NULL;
+	vcookie->qid = NULL;
+	spin_lock_init(&vcookie->lock);
+	return &vcookie->inode;
+}
+
+/**
+ * v9fs_destroy_inode - destroy an inode
+ *
+ */
+
+void v9fs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
+}
+#endif
+
 /**
  * v9fs_get_inode - helper function to setup an inode
  * @sb: superblock
@@ -326,6 +360,21 @@ error:
 }
 */
 
+
+/**
+ * v9fs_clear_inode - release an inode
+ * @inode: inode to release
+ *
+ */
+void v9fs_clear_inode(struct inode *inode)
+{
+	filemap_fdatawrite(inode->i_mapping);
+
+#ifdef CONFIG_9P_FSCACHE
+	v9fs_cache_inode_put_cookie(inode);
+#endif
+}
+
 /**
  * v9fs_inode_from_fid - populate an inode by issuing a attribute request
  * @v9ses: session information
@@ -356,8 +405,14 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 
 	v9fs_stat2inode(st, ret, sb);
 	ret->i_ino = v9fs_qid2ino(&st->qid);
+
+#ifdef CONFIG_9P_FSCACHE
+	v9fs_vcookie_set_qid(ret, &st->qid);
+	v9fs_cache_inode_get_cookie(ret);
+#endif
 	p9stat_free(st);
 	kfree(st);
+
 	return ret;
 
 error:
@@ -751,7 +806,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	err = -EPERM;
 	v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	if (v9ses->cache == CACHE_LOOSE)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		return simple_getattr(mnt, dentry, stat);
 
 	fid = v9fs_fid_lookup(dentry);
@@ -872,10 +927,10 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	} else
 		inode->i_rdev = 0;
 
-	inode->i_size = stat->length;
+	i_size_write(inode, stat->length);
 
 	/* not real number of blocks, but 512 byte ones ... */
-	inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
+	inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
 }
 
 /**
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 8961f1a8f66..14a86448572 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -44,21 +44,9 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 
-static void v9fs_clear_inode(struct inode *);
 static const struct super_operations v9fs_super_ops;
 
 /**
- * v9fs_clear_inode - release an inode
- * @inode: inode to release
- *
- */
-
-static void v9fs_clear_inode(struct inode *inode)
-{
-	filemap_fdatawrite(inode->i_mapping);
-}
-
-/**
  * v9fs_set_super - set the superblock
  * @s: super block
  * @data: file system specific data
@@ -220,6 +208,10 @@ v9fs_umount_begin(struct super_block *sb)
 }
 
 static const struct super_operations v9fs_super_ops = {
+#ifdef CONFIG_9P_FSCACHE
+	.alloc_inode = v9fs_alloc_inode,
+	.destroy_inode = v9fs_destroy_inode,
+#endif
 	.statfs = simple_statfs,
 	.clear_inode = v9fs_clear_inode,
 	.show_options = generic_show_options,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 798cb071d13..3f57ce4bee5 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -19,9 +19,6 @@ static int
 adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
 	       int create)
 {
-	if (block < 0)
-		goto abort_negative;
-
 	if (!create) {
 		if (block >= inode->i_blocks)
 			goto abort_toobig;
@@ -34,10 +31,6 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
 	/* don't support allocation of blocks yet */
 	return -EIO;
 
-abort_negative:
-	adfs_error(inode->i_sb, "block %d < 0", block);
-	return -EIO;
-
 abort_toobig:
 	return 0;
 }
diff --git a/fs/attr.c b/fs/attr.c
index 9fe1b1bd30a..96d394bdadd 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,7 +18,7 @@
 /* Taken over from the old code... */
 
 /* POSIX UID/GID verification for setting inode attributes. */
-int inode_change_ok(struct inode *inode, struct iattr *attr)
+int inode_change_ok(const struct inode *inode, struct iattr *attr)
 {
 	int retval = -EPERM;
 	unsigned int ia_valid = attr->ia_valid;
@@ -60,9 +60,51 @@ fine:
 error:
 	return retval;
 }
-
 EXPORT_SYMBOL(inode_change_ok);
 
+/**
+ * inode_newsize_ok - may this inode be truncated to a given size
+ * @inode:	the inode to be truncated
+ * @offset:	the new size to assign to the inode
+ * @Returns:	0 on success, -ve errno on failure
+ *
+ * inode_newsize_ok will check filesystem limits and ulimits to check that the
+ * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
+ * when necessary. Caller must not proceed with inode size change if failure is
+ * returned. @inode must be a file (not directory), with appropriate
+ * permissions to allow truncate (inode_newsize_ok does NOT check these
+ * conditions).
+ *
+ * inode_newsize_ok must be called with i_mutex held.
+ */
+int inode_newsize_ok(const struct inode *inode, loff_t offset)
+{
+	if (inode->i_size < offset) {
+		unsigned long limit;
+
+		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		if (limit != RLIM_INFINITY && offset > limit)
+			goto out_sig;
+		if (offset > inode->i_sb->s_maxbytes)
+			goto out_big;
+	} else {
+		/*
+		 * truncation of in-use swapfiles is disallowed - it would
+		 * cause subsequent swapout to scribble on the now-freed
+		 * blocks.
+		 */
+		if (IS_SWAPFILE(inode))
+			return -ETXTBSY;
+	}
+
+	return 0;
+out_sig:
+	send_sig(SIGXFSZ, current, 0);
+out_big:
+	return -EFBIG;
+}
+EXPORT_SYMBOL(inode_newsize_ok);
+
 int inode_setattr(struct inode * inode, struct iattr * attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index dd376c124e7..33baf27fac7 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -737,12 +737,7 @@ befs_put_super(struct super_block *sb)
 {
 	kfree(BEFS_SB(sb)->mount_opts.iocharset);
 	BEFS_SB(sb)->mount_opts.iocharset = NULL;
-
-	if (BEFS_SB(sb)->nls) {
-		unload_nls(BEFS_SB(sb)->nls);
-		BEFS_SB(sb)->nls = NULL;
-	}
-
+	unload_nls(BEFS_SB(sb)->nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 442d94fe255..b9b3bb51b1e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1711,42 +1711,52 @@ struct elf_note_info {
 	int numnote;
 };
 
-static int fill_note_info(struct elfhdr *elf, int phdrs,
-			  struct elf_note_info *info,
-			  long signr, struct pt_regs *regs)
+static int elf_note_info_init(struct elf_note_info *info)
 {
-#define	NUM_NOTES	6
-	struct list_head *t;
-
-	info->notes = NULL;
-	info->prstatus = NULL;
-	info->psinfo = NULL;
-	info->fpu = NULL;
-#ifdef ELF_CORE_COPY_XFPREGS
-	info->xfpu = NULL;
-#endif
+	memset(info, 0, sizeof(*info));
 	INIT_LIST_HEAD(&info->thread_list);
 
-	info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote),
-			      GFP_KERNEL);
+	/* Allocate space for six ELF notes */
+	info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL);
 	if (!info->notes)
 		return 0;
 	info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
 	if (!info->psinfo)
-		return 0;
+		goto notes_free;
 	info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
 	if (!info->prstatus)
-		return 0;
+		goto psinfo_free;
 	info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
 	if (!info->fpu)
-		return 0;
+		goto prstatus_free;
 #ifdef ELF_CORE_COPY_XFPREGS
 	info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
 	if (!info->xfpu)
-		return 0;
+		goto fpu_free;
+#endif
+	return 1;
+#ifdef ELF_CORE_COPY_XFPREGS
+ fpu_free:
+	kfree(info->fpu);
 #endif
+ prstatus_free:
+	kfree(info->prstatus);
+ psinfo_free:
+	kfree(info->psinfo);
+ notes_free:
+	kfree(info->notes);
+	return 0;
+}
+
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+			  struct elf_note_info *info,
+			  long signr, struct pt_regs *regs)
+{
+	struct list_head *t;
+
+	if (!elf_note_info_init(info))
+		return 0;
 
-	info->thread_status_size = 0;
 	if (signr) {
 		struct core_thread *ct;
 		struct elf_thread_status *ets;
@@ -1806,8 +1816,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 #endif
 
 	return 1;
-
-#undef NUM_NOTES
 }
 
 static size_t get_note_info_size(struct elf_note_info *info)
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 76285471073..38502c67987 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -283,20 +283,23 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 	}
 
 	stack_size = exec_params.stack_size;
-	if (stack_size < interp_params.stack_size)
-		stack_size = interp_params.stack_size;
-
 	if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
 		executable_stack = EXSTACK_ENABLE_X;
 	else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
 		executable_stack = EXSTACK_DISABLE_X;
-	else if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
-		executable_stack = EXSTACK_ENABLE_X;
-	else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
-		executable_stack = EXSTACK_DISABLE_X;
 	else
 		executable_stack = EXSTACK_DEFAULT;
 
+	if (stack_size == 0) {
+		stack_size = interp_params.stack_size;
+		if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
+			executable_stack = EXSTACK_ENABLE_X;
+		else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
+			executable_stack = EXSTACK_DISABLE_X;
+		else
+			executable_stack = EXSTACK_DEFAULT;
+	}
+
 	retval = -ENOEXEC;
 	if (stack_size == 0)
 		goto error;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e92f229e3c6..a2796651e75 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -278,8 +278,6 @@ static int decompress_exec(
 		ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
 		if (ret <= 0)
 			break;
-		if (ret >= (unsigned long) -4096)
-			break;
 		len -= ret;
 
 		strm.next_in = buf;
@@ -335,7 +333,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
 					"(%d != %d)", (unsigned) r, curid, id);
 			goto failed;
 		} else if ( ! p->lib_list[id].loaded &&
-				load_flat_shared_library(id, p) > (unsigned long) -4096) {
+				IS_ERR_VALUE(load_flat_shared_library(id, p))) {
 			printk("BINFMT_FLAT: failed to load library %d", id);
 			goto failed;
 		}
@@ -545,7 +543,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 		textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
 				  MAP_PRIVATE|MAP_EXECUTABLE, 0);
 		up_write(&current->mm->mmap_sem);
-		if (!textpos  || textpos >= (unsigned long) -4096) {
+		if (!textpos || IS_ERR_VALUE(textpos)) {
 			if (!textpos)
 				textpos = (unsigned long) -ENOMEM;
 			printk("Unable to mmap process text, errno %d\n", (int)-textpos);
@@ -560,7 +558,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 			PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
 		up_write(&current->mm->mmap_sem);
 
-		if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
+		if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
 			if (!realdatastart)
 				realdatastart = (unsigned long) -ENOMEM;
 			printk("Unable to allocate RAM for process data, errno %d\n",
@@ -587,7 +585,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 			result = bprm->file->f_op->read(bprm->file, (char *) datapos,
 					data_len + (relocs * sizeof(unsigned long)), &fpos);
 		}
-		if (result >= (unsigned long)-4096) {
+		if (IS_ERR_VALUE(result)) {
 			printk("Unable to read data+bss, errno %d\n", (int)-result);
 			do_munmap(current->mm, textpos, text_len);
 			do_munmap(current->mm, realdatastart, data_len + extra);
@@ -607,7 +605,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 			PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
 		up_write(&current->mm->mmap_sem);
 
-		if (!textpos  || textpos >= (unsigned long) -4096) {
+		if (!textpos || IS_ERR_VALUE(textpos)) {
 			if (!textpos)
 				textpos = (unsigned long) -ENOMEM;
 			printk("Unable to allocate RAM for process text/data, errno %d\n",
@@ -641,7 +639,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 			fpos = 0;
 			result = bprm->file->f_op->read(bprm->file,
 					(char *) textpos, text_len, &fpos);
-			if (result < (unsigned long) -4096)
+			if (!IS_ERR_VALUE(result))
 				result = decompress_exec(bprm, text_len, (char *) datapos,
 						 data_len + (relocs * sizeof(unsigned long)), 0);
 		}
@@ -651,13 +649,13 @@ static int load_flat_file(struct linux_binprm * bprm,
 			fpos = 0;
 			result = bprm->file->f_op->read(bprm->file,
 					(char *) textpos, text_len, &fpos);
-			if (result < (unsigned long) -4096) {
+			if (!IS_ERR_VALUE(result)) {
 				fpos = ntohl(hdr->data_start);
 				result = bprm->file->f_op->read(bprm->file, (char *) datapos,
 					data_len + (relocs * sizeof(unsigned long)), &fpos);
 			}
 		}
-		if (result >= (unsigned long)-4096) {
+		if (IS_ERR_VALUE(result)) {
 			printk("Unable to read code+data+bss, errno %d\n",(int)-result);
 			do_munmap(current->mm, textpos, text_len + data_len + extra +
 				MAX_SHARED_LIBS * sizeof(unsigned long));
@@ -835,7 +833,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 
 	res = prepare_binprm(&bprm);
 
-	if (res <= (unsigned long)-4096)
+	if (!IS_ERR_VALUE(res))
 		res = load_flat_file(&bprm, libs, id, NULL);
 
 	abort_creds(bprm.cred);
@@ -880,7 +878,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	stack_len += FLAT_DATA_ALIGN - 1;  /* reserve for upcoming alignment */
 	
 	res = load_flat_file(bprm, &libinfo, 0, &stack_len);
-	if (res > (unsigned long)-4096)
+	if (IS_ERR_VALUE(res))
 		return res;
 	
 	/* Update data segment pointers for all libraries */
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 5d1ed50bd46..9cf4b926f8e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -216,8 +216,6 @@ EXPORT_SYMBOL(fsync_bdev);
  * freeze_bdev  --  lock a filesystem and force it into a consistent state
  * @bdev:	blockdevice to lock
  *
- * This takes the block device bd_mount_sem to make sure no new mounts
- * happen on bdev until thaw_bdev() is called.
  * If a superblock is found on this device, we take the s_umount semaphore
  * on it to make sure nobody unmounts until the snapshot creation is done.
  * The reference counter (bd_fsfreeze_count) guarantees that only the last
@@ -232,46 +230,55 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 	int error = 0;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (bdev->bd_fsfreeze_count > 0) {
-		bdev->bd_fsfreeze_count++;
+	if (++bdev->bd_fsfreeze_count > 1) {
+		/*
+		 * We don't even need to grab a reference - the first call
+		 * to freeze_bdev grab an active reference and only the last
+		 * thaw_bdev drops it.
+		 */
 		sb = get_super(bdev);
+		drop_super(sb);
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 		return sb;
 	}
-	bdev->bd_fsfreeze_count++;
-
-	down(&bdev->bd_mount_sem);
-	sb = get_super(bdev);
-	if (sb && !(sb->s_flags & MS_RDONLY)) {
-		sb->s_frozen = SB_FREEZE_WRITE;
-		smp_wmb();
-
-		sync_filesystem(sb);
-
-		sb->s_frozen = SB_FREEZE_TRANS;
-		smp_wmb();
-
-		sync_blockdev(sb->s_bdev);
-
-		if (sb->s_op->freeze_fs) {
-			error = sb->s_op->freeze_fs(sb);
-			if (error) {
-				printk(KERN_ERR
-					"VFS:Filesystem freeze failed\n");
-				sb->s_frozen = SB_UNFROZEN;
-				drop_super(sb);
-				up(&bdev->bd_mount_sem);
-				bdev->bd_fsfreeze_count--;
-				mutex_unlock(&bdev->bd_fsfreeze_mutex);
-				return ERR_PTR(error);
-			}
+
+	sb = get_active_super(bdev);
+	if (!sb)
+		goto out;
+	if (sb->s_flags & MS_RDONLY) {
+		deactivate_locked_super(sb);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return sb;
+	}
+
+	sb->s_frozen = SB_FREEZE_WRITE;
+	smp_wmb();
+
+	sync_filesystem(sb);
+
+	sb->s_frozen = SB_FREEZE_TRANS;
+	smp_wmb();
+
+	sync_blockdev(sb->s_bdev);
+
+	if (sb->s_op->freeze_fs) {
+		error = sb->s_op->freeze_fs(sb);
+		if (error) {
+			printk(KERN_ERR
+				"VFS:Filesystem freeze failed\n");
+			sb->s_frozen = SB_UNFROZEN;
+			deactivate_locked_super(sb);
+			bdev->bd_fsfreeze_count--;
+			mutex_unlock(&bdev->bd_fsfreeze_mutex);
+			return ERR_PTR(error);
 		}
 	}
+	up_write(&sb->s_umount);
 
+ out:
 	sync_blockdev(bdev);
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-
-	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
+	return sb;	/* thaw_bdev releases s->s_umount */
 }
 EXPORT_SYMBOL(freeze_bdev);
 
@@ -284,44 +291,44 @@ EXPORT_SYMBOL(freeze_bdev);
  */
 int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 {
-	int error = 0;
+	int error = -EINVAL;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (!bdev->bd_fsfreeze_count) {
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return -EINVAL;
-	}
-
-	bdev->bd_fsfreeze_count--;
-	if (bdev->bd_fsfreeze_count > 0) {
-		if (sb)
-			drop_super(sb);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return 0;
-	}
-
-	if (sb) {
-		BUG_ON(sb->s_bdev != bdev);
-		if (!(sb->s_flags & MS_RDONLY)) {
-			if (sb->s_op->unfreeze_fs) {
-				error = sb->s_op->unfreeze_fs(sb);
-				if (error) {
-					printk(KERN_ERR
-						"VFS:Filesystem thaw failed\n");
-					sb->s_frozen = SB_FREEZE_TRANS;
-					bdev->bd_fsfreeze_count++;
-					mutex_unlock(&bdev->bd_fsfreeze_mutex);
-					return error;
-				}
-			}
-			sb->s_frozen = SB_UNFROZEN;
-			smp_wmb();
-			wake_up(&sb->s_wait_unfrozen);
+	if (!bdev->bd_fsfreeze_count)
+		goto out_unlock;
+
+	error = 0;
+	if (--bdev->bd_fsfreeze_count > 0)
+		goto out_unlock;
+
+	if (!sb)
+		goto out_unlock;
+
+	BUG_ON(sb->s_bdev != bdev);
+	down_write(&sb->s_umount);
+	if (sb->s_flags & MS_RDONLY)
+		goto out_deactivate;
+
+	if (sb->s_op->unfreeze_fs) {
+		error = sb->s_op->unfreeze_fs(sb);
+		if (error) {
+			printk(KERN_ERR
+				"VFS:Filesystem thaw failed\n");
+			sb->s_frozen = SB_FREEZE_TRANS;
+			bdev->bd_fsfreeze_count++;
+			mutex_unlock(&bdev->bd_fsfreeze_mutex);
+			return error;
 		}
-		drop_super(sb);
 	}
 
-	up(&bdev->bd_mount_sem);
+	sb->s_frozen = SB_UNFROZEN;
+	smp_wmb();
+	wake_up(&sb->s_wait_unfrozen);
+
+out_deactivate:
+	if (sb)
+		deactivate_locked_super(sb);
+out_unlock:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return 0;
 }
@@ -430,7 +437,6 @@ static void init_once(void *foo)
 
 	memset(bdev, 0, sizeof(*bdev));
 	mutex_init(&bdev->bd_mutex);
-	sema_init(&bdev->bd_mount_sem, 1);
 	INIT_LIST_HEAD(&bdev->bd_inodes);
 	INIT_LIST_HEAD(&bdev->bd_list);
 #ifdef CONFIG_SYSFS
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 019e8af449a..282ca085c2f 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,9 @@ struct btrfs_worker_thread {
 	/* number of things on the pending list */
 	atomic_t num_pending;
 
+	/* reference counter for this struct */
+	atomic_t refs;
+
 	unsigned long sequence;
 
 	/* protects the pending list. */
@@ -71,7 +74,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
 		unsigned long flags;
 		spin_lock_irqsave(&worker->workers->lock, flags);
 		worker->idle = 1;
-		list_move(&worker->worker_list, &worker->workers->idle_list);
+
+		/* the list may be empty if the worker is just starting */
+		if (!list_empty(&worker->worker_list)) {
+			list_move(&worker->worker_list,
+				 &worker->workers->idle_list);
+		}
 		spin_unlock_irqrestore(&worker->workers->lock, flags);
 	}
 }
@@ -87,23 +95,49 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
 		unsigned long flags;
 		spin_lock_irqsave(&worker->workers->lock, flags);
 		worker->idle = 0;
-		list_move_tail(&worker->worker_list,
-			       &worker->workers->worker_list);
+
+		if (!list_empty(&worker->worker_list)) {
+			list_move_tail(&worker->worker_list,
+				      &worker->workers->worker_list);
+		}
 		spin_unlock_irqrestore(&worker->workers->lock, flags);
 	}
 }
 
-static noinline int run_ordered_completions(struct btrfs_workers *workers,
-					    struct btrfs_work *work)
+static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 {
+	struct btrfs_workers *workers = worker->workers;
 	unsigned long flags;
 
+	rmb();
+	if (!workers->atomic_start_pending)
+		return;
+
+	spin_lock_irqsave(&workers->lock, flags);
+	if (!workers->atomic_start_pending)
+		goto out;
+
+	workers->atomic_start_pending = 0;
+	if (workers->num_workers >= workers->max_workers)
+		goto out;
+
+	spin_unlock_irqrestore(&workers->lock, flags);
+	btrfs_start_workers(workers, 1);
+	return;
+
+out:
+	spin_unlock_irqrestore(&workers->lock, flags);
+}
+
+static noinline int run_ordered_completions(struct btrfs_workers *workers,
+					    struct btrfs_work *work)
+{
 	if (!workers->ordered)
 		return 0;
 
 	set_bit(WORK_DONE_BIT, &work->flags);
 
-	spin_lock_irqsave(&workers->lock, flags);
+	spin_lock(&workers->order_lock);
 
 	while (1) {
 		if (!list_empty(&workers->prio_order_list)) {
@@ -126,45 +160,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 		if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
 			break;
 
-		spin_unlock_irqrestore(&workers->lock, flags);
+		spin_unlock(&workers->order_lock);
 
 		work->ordered_func(work);
 
 		/* now take the lock again and call the freeing code */
-		spin_lock_irqsave(&workers->lock, flags);
+		spin_lock(&workers->order_lock);
 		list_del(&work->order_list);
 		work->ordered_free(work);
 	}
 
-	spin_unlock_irqrestore(&workers->lock, flags);
+	spin_unlock(&workers->order_lock);
 	return 0;
 }
 
+static void put_worker(struct btrfs_worker_thread *worker)
+{
+	if (atomic_dec_and_test(&worker->refs))
+		kfree(worker);
+}
+
+static int try_worker_shutdown(struct btrfs_worker_thread *worker)
+{
+	int freeit = 0;
+
+	spin_lock_irq(&worker->lock);
+	spin_lock(&worker->workers->lock);
+	if (worker->workers->num_workers > 1 &&
+	    worker->idle &&
+	    !worker->working &&
+	    !list_empty(&worker->worker_list) &&
+	    list_empty(&worker->prio_pending) &&
+	    list_empty(&worker->pending) &&
+	    atomic_read(&worker->num_pending) == 0) {
+		freeit = 1;
+		list_del_init(&worker->worker_list);
+		worker->workers->num_workers--;
+	}
+	spin_unlock(&worker->workers->lock);
+	spin_unlock_irq(&worker->lock);
+
+	if (freeit)
+		put_worker(worker);
+	return freeit;
+}
+
+static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
+					struct list_head *prio_head,
+					struct list_head *head)
+{
+	struct btrfs_work *work = NULL;
+	struct list_head *cur = NULL;
+
+	if(!list_empty(prio_head))
+		cur = prio_head->next;
+
+	smp_mb();
+	if (!list_empty(&worker->prio_pending))
+		goto refill;
+
+	if (!list_empty(head))
+		cur = head->next;
+
+	if (cur)
+		goto out;
+
+refill:
+	spin_lock_irq(&worker->lock);
+	list_splice_tail_init(&worker->prio_pending, prio_head);
+	list_splice_tail_init(&worker->pending, head);
+
+	if (!list_empty(prio_head))
+		cur = prio_head->next;
+	else if (!list_empty(head))
+		cur = head->next;
+	spin_unlock_irq(&worker->lock);
+
+	if (!cur)
+		goto out_fail;
+
+out:
+	work = list_entry(cur, struct btrfs_work, list);
+
+out_fail:
+	return work;
+}
+
 /*
  * main loop for servicing work items
  */
 static int worker_loop(void *arg)
 {
 	struct btrfs_worker_thread *worker = arg;
-	struct list_head *cur;
+	struct list_head head;
+	struct list_head prio_head;
 	struct btrfs_work *work;
+
+	INIT_LIST_HEAD(&head);
+	INIT_LIST_HEAD(&prio_head);
+
 	do {
-		spin_lock_irq(&worker->lock);
-again_locked:
+again:
 		while (1) {
-			if (!list_empty(&worker->prio_pending))
-				cur = worker->prio_pending.next;
-			else if (!list_empty(&worker->pending))
-				cur = worker->pending.next;
-			else
+
+
+			work = get_next_work(worker, &prio_head, &head);
+			if (!work)
 				break;
 
-			work = list_entry(cur, struct btrfs_work, list);
 			list_del(&work->list);
 			clear_bit(WORK_QUEUED_BIT, &work->flags);
 
 			work->worker = worker;
-			spin_unlock_irq(&worker->lock);
 
 			work->func(work);
 
@@ -175,9 +282,13 @@ again_locked:
 			 */
 			run_ordered_completions(worker->workers, work);
 
-			spin_lock_irq(&worker->lock);
-			check_idle_worker(worker);
+			check_pending_worker_creates(worker);
+
 		}
+
+		spin_lock_irq(&worker->lock);
+		check_idle_worker(worker);
+
 		if (freezing(current)) {
 			worker->working = 0;
 			spin_unlock_irq(&worker->lock);
@@ -216,8 +327,10 @@ again_locked:
 				spin_lock_irq(&worker->lock);
 				set_current_state(TASK_INTERRUPTIBLE);
 				if (!list_empty(&worker->pending) ||
-				    !list_empty(&worker->prio_pending))
-					goto again_locked;
+				    !list_empty(&worker->prio_pending)) {
+					spin_unlock_irq(&worker->lock);
+					goto again;
+				}
 
 				/*
 				 * this makes sure we get a wakeup when someone
@@ -226,8 +339,13 @@ again_locked:
 				worker->working = 0;
 				spin_unlock_irq(&worker->lock);
 
-				if (!kthread_should_stop())
-					schedule();
+				if (!kthread_should_stop()) {
+					schedule_timeout(HZ * 120);
+					if (!worker->working &&
+					    try_worker_shutdown(worker)) {
+						return 0;
+					}
+				}
 			}
 			__set_current_state(TASK_RUNNING);
 		}
@@ -242,16 +360,30 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 {
 	struct list_head *cur;
 	struct btrfs_worker_thread *worker;
+	int can_stop;
 
+	spin_lock_irq(&workers->lock);
 	list_splice_init(&workers->idle_list, &workers->worker_list);
 	while (!list_empty(&workers->worker_list)) {
 		cur = workers->worker_list.next;
 		worker = list_entry(cur, struct btrfs_worker_thread,
 				    worker_list);
-		kthread_stop(worker->task);
-		list_del(&worker->worker_list);
-		kfree(worker);
+
+		atomic_inc(&worker->refs);
+		workers->num_workers -= 1;
+		if (!list_empty(&worker->worker_list)) {
+			list_del_init(&worker->worker_list);
+			put_worker(worker);
+			can_stop = 1;
+		} else
+			can_stop = 0;
+		spin_unlock_irq(&workers->lock);
+		if (can_stop)
+			kthread_stop(worker->task);
+		spin_lock_irq(&workers->lock);
+		put_worker(worker);
 	}
+	spin_unlock_irq(&workers->lock);
 	return 0;
 }
 
@@ -266,10 +398,13 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
 	INIT_LIST_HEAD(&workers->order_list);
 	INIT_LIST_HEAD(&workers->prio_order_list);
 	spin_lock_init(&workers->lock);
+	spin_lock_init(&workers->order_lock);
 	workers->max_workers = max;
 	workers->idle_thresh = 32;
 	workers->name = name;
 	workers->ordered = 0;
+	workers->atomic_start_pending = 0;
+	workers->atomic_worker_start = 0;
 }
 
 /*
@@ -293,7 +428,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		INIT_LIST_HEAD(&worker->prio_pending);
 		INIT_LIST_HEAD(&worker->worker_list);
 		spin_lock_init(&worker->lock);
+
 		atomic_set(&worker->num_pending, 0);
+		atomic_set(&worker->refs, 1);
 		worker->workers = workers;
 		worker->task = kthread_run(worker_loop, worker,
 					   "btrfs-%s-%d", workers->name,
@@ -303,7 +440,6 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 			kfree(worker);
 			goto fail;
 		}
-
 		spin_lock_irq(&workers->lock);
 		list_add_tail(&worker->worker_list, &workers->idle_list);
 		worker->idle = 1;
@@ -350,7 +486,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 	 */
 	next = workers->worker_list.next;
 	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
-	atomic_inc(&worker->num_pending);
 	worker->sequence++;
 
 	if (worker->sequence % workers->idle_thresh == 0)
@@ -367,28 +502,18 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
 	unsigned long flags;
+	struct list_head *fallback;
 
 again:
 	spin_lock_irqsave(&workers->lock, flags);
 	worker = next_worker(workers);
-	spin_unlock_irqrestore(&workers->lock, flags);
 
 	if (!worker) {
-		spin_lock_irqsave(&workers->lock, flags);
 		if (workers->num_workers >= workers->max_workers) {
-			struct list_head *fallback = NULL;
-			/*
-			 * we have failed to find any workers, just
-			 * return the force one
-			 */
-			if (!list_empty(&workers->worker_list))
-				fallback = workers->worker_list.next;
-			if (!list_empty(&workers->idle_list))
-				fallback = workers->idle_list.next;
-			BUG_ON(!fallback);
-			worker = list_entry(fallback,
-				  struct btrfs_worker_thread, worker_list);
-			spin_unlock_irqrestore(&workers->lock, flags);
+			goto fallback;
+		} else if (workers->atomic_worker_start) {
+			workers->atomic_start_pending = 1;
+			goto fallback;
 		} else {
 			spin_unlock_irqrestore(&workers->lock, flags);
 			/* we're below the limit, start another worker */
@@ -396,6 +521,28 @@ again:
 			goto again;
 		}
 	}
+	goto found;
+
+fallback:
+	fallback = NULL;
+	/*
+	 * we have failed to find any workers, just
+	 * return the first one we can find.
+	 */
+	if (!list_empty(&workers->worker_list))
+		fallback = workers->worker_list.next;
+	if (!list_empty(&workers->idle_list))
+		fallback = workers->idle_list.next;
+	BUG_ON(!fallback);
+	worker = list_entry(fallback,
+		  struct btrfs_worker_thread, worker_list);
+found:
+	/*
+	 * this makes sure the worker doesn't exit before it is placed
+	 * onto a busy/idle list
+	 */
+	atomic_inc(&worker->num_pending);
+	spin_unlock_irqrestore(&workers->lock, flags);
 	return worker;
 }
 
@@ -427,7 +574,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
 		spin_lock(&worker->workers->lock);
 		worker->idle = 0;
 		list_move_tail(&worker->worker_list,
-			       &worker->workers->worker_list);
+			      &worker->workers->worker_list);
 		spin_unlock(&worker->workers->lock);
 	}
 	if (!worker->working) {
@@ -435,9 +582,9 @@ int btrfs_requeue_work(struct btrfs_work *work)
 		worker->working = 1;
 	}
 
-	spin_unlock_irqrestore(&worker->lock, flags);
 	if (wake)
 		wake_up_process(worker->task);
+	spin_unlock_irqrestore(&worker->lock, flags);
 out:
 
 	return 0;
@@ -463,14 +610,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	worker = find_worker(workers);
 	if (workers->ordered) {
-		spin_lock_irqsave(&workers->lock, flags);
+		/*
+		 * you're not allowed to do ordered queues from an
+		 * interrupt handler
+		 */
+		spin_lock(&workers->order_lock);
 		if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
 			list_add_tail(&work->order_list,
 				      &workers->prio_order_list);
 		} else {
 			list_add_tail(&work->order_list, &workers->order_list);
 		}
-		spin_unlock_irqrestore(&workers->lock, flags);
+		spin_unlock(&workers->order_lock);
 	} else {
 		INIT_LIST_HEAD(&work->order_list);
 	}
@@ -481,7 +632,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 		list_add_tail(&work->list, &worker->prio_pending);
 	else
 		list_add_tail(&work->list, &worker->pending);
-	atomic_inc(&worker->num_pending);
 	check_busy_worker(worker);
 
 	/*
@@ -492,10 +642,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 		wake = 1;
 	worker->working = 1;
 
-	spin_unlock_irqrestore(&worker->lock, flags);
-
 	if (wake)
 		wake_up_process(worker->task);
+	spin_unlock_irqrestore(&worker->lock, flags);
+
 out:
 	return 0;
 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1b511c109db..fc089b95ec1 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -73,6 +73,15 @@ struct btrfs_workers {
 	/* force completions in the order they were queued */
 	int ordered;
 
+	/* more workers required, but in an interrupt handler */
+	int atomic_start_pending;
+
+	/*
+	 * are we allowed to sleep while starting workers or are we required
+	 * to start them at a later time?
+	 */
+	int atomic_worker_start;
+
 	/* list with all the work threads.  The workers on the idle thread
 	 * may be actively servicing jobs, but they haven't yet hit the
 	 * idle thresh limit above.
@@ -90,6 +99,9 @@ struct btrfs_workers {
 	/* lock for finding the next worker thread to queue on */
 	spinlock_t lock;
 
+	/* lock for the ordered lists */
+	spinlock_t order_lock;
+
 	/* extra name for this worker, used for current->name */
 	char *name;
 };
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ea1ea0af8c0..82ee56bba29 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -138,6 +138,7 @@ struct btrfs_inode {
 	 * of these.
 	 */
 	unsigned ordered_data_close:1;
+	unsigned dummy_inode:1;
 
 	struct inode vfs_inode;
 };
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9d8ba4d54a3..a11a32058b5 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		 */
 		set_page_extent_mapped(page);
 		lock_extent(tree, last_offset, end, GFP_NOFS);
-		spin_lock(&em_tree->lock);
+		read_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, last_offset,
 					   PAGE_CACHE_SIZE);
-		spin_unlock(&em_tree->lock);
+		read_unlock(&em_tree->lock);
 
 		if (!em || last_offset < em->start ||
 		    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	em_tree = &BTRFS_I(inode)->extent_tree;
 
 	/* we need the actual starting offset of this extent in the file */
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree,
 				   page_offset(bio->bi_io_vec->bv_page),
 				   PAGE_CACHE_SIZE);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	compressed_len = em->block_len;
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3fdcc0512d3..ec96f3a6d53 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int split;
 	int num_doubles = 0;
 
+	l = path->nodes[0];
+	slot = path->slots[0];
+	if (extend && data_size + btrfs_item_size_nr(l, slot) +
+	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
+		return -EOVERFLOW;
+
 	/* first try to make some room by pushing left and right */
 	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 837435ce84c..80599b4e42b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -114,6 +114,10 @@ struct btrfs_ordered_sum;
  */
 #define BTRFS_DEV_ITEMS_OBJECTID 1ULL
 
+#define BTRFS_BTREE_INODE_OBJECTID 1
+
+#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+
 /*
  * we can actually store much bigger names, but lets not confuse the rest
  * of linux
@@ -670,6 +674,7 @@ struct btrfs_space_info {
 	u64 bytes_reserved;	/* total bytes the allocator has reserved for
 				   current allocations */
 	u64 bytes_readonly;	/* total bytes that are read only */
+	u64 bytes_super;	/* total bytes reserved for the super blocks */
 
 	/* delalloc accounting */
 	u64 bytes_delalloc;	/* number of bytes reserved for allocation,
@@ -726,6 +731,15 @@ enum btrfs_caching_type {
 	BTRFS_CACHE_FINISHED	= 2,
 };
 
+struct btrfs_caching_control {
+	struct list_head list;
+	struct mutex mutex;
+	wait_queue_head_t wait;
+	struct btrfs_block_group_cache *block_group;
+	u64 progress;
+	atomic_t count;
+};
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
@@ -733,6 +747,7 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
+	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
 	int extents_thresh;
@@ -742,8 +757,9 @@ struct btrfs_block_group_cache {
 	int dirty;
 
 	/* cache tracking stuff */
-	wait_queue_head_t caching_q;
 	int cached;
+	struct btrfs_caching_control *caching_ctl;
+	u64 last_byte_to_unpin;
 
 	struct btrfs_space_info *space_info;
 
@@ -782,13 +798,16 @@ struct btrfs_fs_info {
 
 	/* the log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
+
+	spinlock_t fs_roots_radix_lock;
 	struct radix_tree_root fs_roots_radix;
 
 	/* block group cache stuff */
 	spinlock_t block_group_cache_lock;
 	struct rb_root block_group_cache_tree;
 
-	struct extent_io_tree pinned_extents;
+	struct extent_io_tree freed_extents[2];
+	struct extent_io_tree *pinned_extents;
 
 	/* logical->physical extent mapping */
 	struct btrfs_mapping_tree mapping_tree;
@@ -822,11 +841,7 @@ struct btrfs_fs_info {
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
-	struct mutex drop_mutex;
 	struct mutex volume_mutex;
-	struct mutex tree_reloc_mutex;
-	struct rw_semaphore extent_commit_sem;
-
 	/*
 	 * this protects the ordered operations list only while we are
 	 * processing all of the entries on it.  This way we make
@@ -835,10 +850,16 @@ struct btrfs_fs_info {
 	 * before jumping into the main commit.
 	 */
 	struct mutex ordered_operations_mutex;
+	struct rw_semaphore extent_commit_sem;
+
+	struct rw_semaphore subvol_sem;
+
+	struct srcu_struct subvol_srcu;
 
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
+	struct list_head caching_block_groups;
 
 	atomic_t nr_async_submits;
 	atomic_t async_submit_draining;
@@ -996,10 +1017,12 @@ struct btrfs_root {
 	u32 stripesize;
 
 	u32 type;
-	u64 highest_inode;
-	u64 last_inode_alloc;
+
+	u64 highest_objectid;
 	int ref_cows;
 	int track_dirty;
+	int in_radix;
+
 	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
 	struct btrfs_key defrag_max;
@@ -1920,8 +1943,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_pinned_extents(struct btrfs_root *root,
-				u64 bytenr, u64 num, int pin);
+int btrfs_pin_extent(struct btrfs_root *root,
+		     u64 bytenr, u64 num, int reserved);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
@@ -1971,9 +1994,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       struct extent_io_tree *unpin);
+			       struct btrfs_root *root);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
@@ -1984,6 +2008,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 bytes_used,
 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -2006,7 +2031,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
 				 u64 bytes);
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
 			      u64 bytes);
-void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2100,12 +2124,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct extent_buffer *parent);
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
-		   struct btrfs_path *path,
-		   u64 root_id, u64 ref_id);
+			struct btrfs_path *path,
+			u64 root_id, u64 ref_id);
 int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *tree_root,
-		       u64 root_id, u8 type, u64 ref_id,
-		       u64 dirid, u64 sequence,
+		       u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+		       const char *name, int name_len);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *tree_root,
+		       u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
 		       const char *name, int name_len);
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_key *key);
@@ -2120,6 +2147,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 int btrfs_search_root(struct btrfs_root *root, u64 search_start,
 		      u64 *found_objectid);
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 int btrfs_set_root_node(struct btrfs_root_item *item,
 			struct extent_buffer *node);
 /* dir-item.c */
@@ -2138,6 +2166,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
 			    struct btrfs_path *path, u64 dir,
 			    u64 objectid, const char *name, int name_len,
 			    int mod);
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+			    struct btrfs_path *path, u64 dirid,
+			    const char *name, int name_len);
 struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 			      struct btrfs_path *path,
 			      const char *name, int name_len);
@@ -2160,6 +2192,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 offset);
 int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, u64 offset);
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
 
 /* inode-map.c */
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
@@ -2232,6 +2265,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 int btrfs_add_link(struct btrfs_trans_handle *trans,
 		   struct inode *parent_inode, struct inode *inode,
 		   const char *name, int name_len, int add_backref, u64 index);
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct inode *dir, u64 objectid,
+			const char *name, int name_len);
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct inode *inode, u64 new_size,
@@ -2242,7 +2279,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *new_root, struct dentry *dentry,
+			     struct btrfs_root *new_root,
 			     u64 new_dirid, u64 alloc_hint);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio, unsigned long bio_flags);
@@ -2258,6 +2295,7 @@ int btrfs_write_inode(struct inode *inode, int wait);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
+void btrfs_drop_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
@@ -2275,6 +2313,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
+int btrfs_invalidate_inodes(struct btrfs_root *root);
+extern struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2290,7 +2330,7 @@ extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 locked_end,
-		       u64 inline_limit, u64 *hint_block);
+		       u64 inline_limit, u64 *hint_block, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode, u64 start, u64 end);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1d70236ba00..f3a6075519c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
 	return btrfs_match_dir_item_name(root, path, name, name_len);
 }
 
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+			    struct btrfs_path *path, u64 dirid,
+			    const char *name, int name_len)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	u32 nritems;
+	int ret;
+
+	key.objectid = dirid;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+
+	while (1) {
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				return ERR_PTR(ret);
+			if (ret > 0)
+				break;
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
+			break;
+
+		di = btrfs_match_dir_item_name(root, path, name, name_len);
+		if (di)
+			return di;
+
+		path->slots[0]++;
+	}
+	return NULL;
+}
+
 struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path, u64 dir,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6c4173146bb..644e796fd64 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -41,6 +41,7 @@
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
+static void free_fs_root(struct btrfs_root *root);
 
 static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
 
@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 	struct extent_map *em;
 	int ret;
 
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
 	if (em) {
 		em->bdev =
 			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-		spin_unlock(&em_tree->lock);
+		read_unlock(&em_tree->lock);
 		goto out;
 	}
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em) {
@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 	em->block_start = 0;
 	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
-	spin_lock(&em_tree->lock);
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	if (ret == -EEXIST) {
 		u64 failed_start = em->start;
@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 		free_extent_map(em);
 		em = NULL;
 	}
-	spin_unlock(&em_tree->lock);
+	write_unlock(&em_tree->lock);
 
 	if (ret)
 		em = ERR_PTR(ret);
@@ -895,8 +896,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->fs_info = fs_info;
 	root->objectid = objectid;
 	root->last_trans = 0;
-	root->highest_inode = 0;
-	root->last_inode_alloc = 0;
+	root->highest_objectid = 0;
 	root->name = NULL;
 	root->in_sysfs = 0;
 	root->inode_tree.rb_node = NULL;
@@ -952,14 +952,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 		     root, fs_info, objectid);
 	ret = btrfs_find_last_root(tree_root, objectid,
 				   &root->root_item, &root->root_key);
+	if (ret > 0)
+		return -ENOENT;
 	BUG_ON(ret);
 
 	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
-	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
+	root->commit_root = btrfs_root_node(root);
 	return 0;
 }
 
@@ -1095,7 +1097,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	struct btrfs_fs_info *fs_info = tree_root->fs_info;
 	struct btrfs_path *path;
 	struct extent_buffer *l;
-	u64 highest_inode;
 	u64 generation;
 	u32 blocksize;
 	int ret = 0;
@@ -1110,7 +1111,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 			kfree(root);
 			return ERR_PTR(ret);
 		}
-		goto insert;
+		goto out;
 	}
 
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
@@ -1120,39 +1121,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
-	if (ret != 0) {
-		if (ret > 0)
-			ret = -ENOENT;
-		goto out;
+	if (ret == 0) {
+		l = path->nodes[0];
+		read_extent_buffer(l, &root->root_item,
+				btrfs_item_ptr_offset(l, path->slots[0]),
+				sizeof(root->root_item));
+		memcpy(&root->root_key, location, sizeof(*location));
 	}
-	l = path->nodes[0];
-	read_extent_buffer(l, &root->root_item,
-	       btrfs_item_ptr_offset(l, path->slots[0]),
-	       sizeof(root->root_item));
-	memcpy(&root->root_key, location, sizeof(*location));
-	ret = 0;
-out:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	if (ret) {
-		kfree(root);
+		if (ret > 0)
+			ret = -ENOENT;
 		return ERR_PTR(ret);
 	}
+
 	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
 	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
-insert:
-	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+out:
+	if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
 		root->ref_cows = 1;
-		ret = btrfs_find_highest_inode(root, &highest_inode);
-		if (ret == 0) {
-			root->highest_inode = highest_inode;
-			root->last_inode_alloc = highest_inode;
-		}
-	}
+
 	return root;
 }
 
@@ -1187,39 +1179,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 		return fs_info->dev_root;
 	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
 		return fs_info->csum_root;
-
+again:
+	spin_lock(&fs_info->fs_roots_radix_lock);
 	root = radix_tree_lookup(&fs_info->fs_roots_radix,
 				 (unsigned long)location->objectid);
+	spin_unlock(&fs_info->fs_roots_radix_lock);
 	if (root)
 		return root;
 
+	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+	if (ret == 0)
+		ret = -ENOENT;
+	if (ret < 0)
+		return ERR_PTR(ret);
+
 	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
 	if (IS_ERR(root))
 		return root;
 
+	WARN_ON(btrfs_root_refs(&root->root_item) == 0);
 	set_anon_super(&root->anon_super, NULL);
 
+	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+	if (ret)
+		goto fail;
+
+	spin_lock(&fs_info->fs_roots_radix_lock);
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
 				(unsigned long)root->root_key.objectid,
 				root);
+	if (ret == 0)
+		root->in_radix = 1;
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+	radix_tree_preload_end();
 	if (ret) {
-		free_extent_buffer(root->node);
-		kfree(root);
-		return ERR_PTR(ret);
+		if (ret == -EEXIST) {
+			free_fs_root(root);
+			goto again;
+		}
+		goto fail;
 	}
-	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
-		ret = btrfs_find_dead_roots(fs_info->tree_root,
-					    root->root_key.objectid);
-		BUG_ON(ret);
+
+	ret = btrfs_find_dead_roots(fs_info->tree_root,
+				    root->root_key.objectid);
+	WARN_ON(ret);
+
+	if (!(fs_info->sb->s_flags & MS_RDONLY))
 		btrfs_orphan_cleanup(root);
-	}
+
 	return root;
+fail:
+	free_fs_root(root);
+	return ERR_PTR(ret);
 }
 
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      struct btrfs_key *location,
 				      const char *name, int namelen)
 {
+	return btrfs_read_fs_root_no_name(fs_info, location);
+#if 0
 	struct btrfs_root *root;
 	int ret;
 
@@ -1236,7 +1255,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 		kfree(root);
 		return ERR_PTR(ret);
 	}
-#if 0
+
 	ret = btrfs_sysfs_add_root(root);
 	if (ret) {
 		free_extent_buffer(root->node);
@@ -1244,9 +1263,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 		kfree(root);
 		return ERR_PTR(ret);
 	}
-#endif
 	root->in_sysfs = 1;
 	return root;
+#endif
 }
 
 static int btrfs_congested_fn(void *congested_data, int bdi_bits)
@@ -1325,9 +1344,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	offset = page_offset(page);
 
 	em_tree = &BTRFS_I(inode)->extent_tree;
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 	if (!em) {
 		__unplug_io_fn(bdi, page);
 		return;
@@ -1360,8 +1379,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 
 	err = bdi_register(bdi, NULL, "btrfs-%d",
 				atomic_inc_return(&btrfs_bdi_num));
-	if (err)
+	if (err) {
+		bdi_destroy(bdi);
 		return err;
+	}
 
 	bdi->ra_pages	= default_backing_dev_info.ra_pages;
 	bdi->unplug_io_fn	= btrfs_unplug_io_fn;
@@ -1451,9 +1472,12 @@ static int cleaner_kthread(void *arg)
 			break;
 
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
-		mutex_lock(&root->fs_info->cleaner_mutex);
-		btrfs_clean_old_snapshots(root);
-		mutex_unlock(&root->fs_info->cleaner_mutex);
+
+		if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
+		    mutex_trylock(&root->fs_info->cleaner_mutex)) {
+			btrfs_clean_old_snapshots(root);
+			mutex_unlock(&root->fs_info->cleaner_mutex);
+		}
 
 		if (freezing(current)) {
 			refrigerator();
@@ -1558,15 +1582,36 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -ENOMEM;
 		goto fail;
 	}
-	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
+
+	ret = init_srcu_struct(&fs_info->subvol_srcu);
+	if (ret) {
+		err = ret;
+		goto fail;
+	}
+
+	ret = setup_bdi(fs_info, &fs_info->bdi);
+	if (ret) {
+		err = ret;
+		goto fail_srcu;
+	}
+
+	fs_info->btree_inode = new_inode(sb);
+	if (!fs_info->btree_inode) {
+		err = -ENOMEM;
+		goto fail_bdi;
+	}
+
+	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
 	INIT_LIST_HEAD(&fs_info->ordered_operations);
+	INIT_LIST_HEAD(&fs_info->caching_block_groups);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 	spin_lock_init(&fs_info->ref_cache_lock);
+	spin_lock_init(&fs_info->fs_roots_radix_lock);
 
 	init_completion(&fs_info->kobj_unregister);
 	fs_info->tree_root = tree_root;
@@ -1585,11 +1630,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
-	if (setup_bdi(fs_info, &fs_info->bdi))
-		goto fail_bdi;
-	fs_info->btree_inode = new_inode(sb);
-	fs_info->btree_inode->i_ino = 1;
-	fs_info->btree_inode->i_nlink = 1;
 	fs_info->metadata_ratio = 8;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
@@ -1602,6 +1642,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	sb->s_blocksize_bits = blksize_bits(4096);
 	sb->s_bdi = &fs_info->bdi;
 
+	fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+	fs_info->btree_inode->i_nlink = 1;
 	/*
 	 * we set the i_size on the btree inode to the max possible int.
 	 * the real end of the address space is determined by all of
@@ -1620,28 +1662,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
 
+	BTRFS_I(fs_info->btree_inode)->root = tree_root;
+	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+	       sizeof(struct btrfs_key));
+	BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+	insert_inode_hash(fs_info->btree_inode);
+
 	spin_lock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree.rb_node = NULL;
 
-	extent_io_tree_init(&fs_info->pinned_extents,
+	extent_io_tree_init(&fs_info->freed_extents[0],
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
+	extent_io_tree_init(&fs_info->freed_extents[1],
+			     fs_info->btree_inode->i_mapping, GFP_NOFS);
+	fs_info->pinned_extents = &fs_info->freed_extents[0];
 	fs_info->do_barriers = 1;
 
-	BTRFS_I(fs_info->btree_inode)->root = tree_root;
-	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
-	       sizeof(struct btrfs_key));
-	insert_inode_hash(fs_info->btree_inode);
 
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
-	mutex_init(&fs_info->drop_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
-	mutex_init(&fs_info->tree_reloc_mutex);
 	init_rwsem(&fs_info->extent_commit_sem);
+	init_rwsem(&fs_info->subvol_sem);
 
 	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
 	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1700,7 +1746,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -EINVAL;
 		goto fail_iput;
 	}
-
+printk("thread pool is %d\n", fs_info->thread_pool_size);
 	/*
 	 * we need to start all the end_io workers up front because the
 	 * queue work function gets called at interrupt time, and so it
@@ -1745,20 +1791,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->endio_workers.idle_thresh = 4;
 	fs_info->endio_meta_workers.idle_thresh = 4;
 
-	fs_info->endio_write_workers.idle_thresh = 64;
-	fs_info->endio_meta_write_workers.idle_thresh = 64;
+	fs_info->endio_write_workers.idle_thresh = 2;
+	fs_info->endio_meta_write_workers.idle_thresh = 2;
+
+	fs_info->endio_workers.atomic_worker_start = 1;
+	fs_info->endio_meta_workers.atomic_worker_start = 1;
+	fs_info->endio_write_workers.atomic_worker_start = 1;
+	fs_info->endio_meta_write_workers.atomic_worker_start = 1;
 
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->delalloc_workers, 1);
 	btrfs_start_workers(&fs_info->fixup_workers, 1);
-	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
-	btrfs_start_workers(&fs_info->endio_meta_workers,
-			    fs_info->thread_pool_size);
-	btrfs_start_workers(&fs_info->endio_meta_write_workers,
-			    fs_info->thread_pool_size);
-	btrfs_start_workers(&fs_info->endio_write_workers,
-			    fs_info->thread_pool_size);
+	btrfs_start_workers(&fs_info->endio_workers, 1);
+	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
+	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
+	btrfs_start_workers(&fs_info->endio_write_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1918,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		}
 	}
 
+	ret = btrfs_find_orphan_roots(tree_root);
+	BUG_ON(ret);
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_recover_relocation(tree_root);
 		BUG_ON(ret);
@@ -1977,6 +2028,8 @@ fail_iput:
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 fail_bdi:
 	bdi_destroy(&fs_info->bdi);
+fail_srcu:
+	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
 	kfree(extent_root);
 	kfree(tree_root);
@@ -2236,20 +2289,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
-	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+	spin_lock(&fs_info->fs_roots_radix_lock);
 	radix_tree_delete(&fs_info->fs_roots_radix,
 			  (unsigned long)root->root_key.objectid);
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+
+	if (btrfs_root_refs(&root->root_item) == 0)
+		synchronize_srcu(&fs_info->subvol_srcu);
+
+	free_fs_root(root);
+	return 0;
+}
+
+static void free_fs_root(struct btrfs_root *root)
+{
+	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
 	if (root->anon_super.s_dev) {
 		down_write(&root->anon_super.s_umount);
 		kill_anon_super(&root->anon_super);
 	}
-	if (root->node)
-		free_extent_buffer(root->node);
-	if (root->commit_root)
-		free_extent_buffer(root->commit_root);
+	free_extent_buffer(root->node);
+	free_extent_buffer(root->commit_root);
 	kfree(root->name);
 	kfree(root);
-	return 0;
 }
 
 static int del_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2258,6 +2320,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
 	struct btrfs_root *gang[8];
 	int i;
 
+	while (!list_empty(&fs_info->dead_roots)) {
+		gang[0] = list_entry(fs_info->dead_roots.next,
+				     struct btrfs_root, root_list);
+		list_del(&gang[0]->root_list);
+
+		if (gang[0]->in_radix) {
+			btrfs_free_fs_root(fs_info, gang[0]);
+		} else {
+			free_extent_buffer(gang[0]->node);
+			free_extent_buffer(gang[0]->commit_root);
+			kfree(gang[0]);
+		}
+	}
+
 	while (1) {
 		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
 					     (void **)gang, 0,
@@ -2287,9 +2363,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
 		root_objectid = gang[ret - 1]->root_key.objectid + 1;
 		for (i = 0; i < ret; i++) {
 			root_objectid = gang[i]->root_key.objectid;
-			ret = btrfs_find_dead_roots(fs_info->tree_root,
-						    root_objectid);
-			BUG_ON(ret);
 			btrfs_orphan_cleanup(gang[i]);
 		}
 		root_objectid++;
@@ -2359,7 +2432,6 @@ int close_ctree(struct btrfs_root *root)
 	free_extent_buffer(root->fs_info->csum_root->commit_root);
 
 	btrfs_free_block_groups(root->fs_info);
-	btrfs_free_pinned_extents(root->fs_info);
 
 	del_fs_roots(fs_info);
 
@@ -2378,6 +2450,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
 	bdi_destroy(&fs_info->bdi);
+	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9596b40caa4..ba5c3fd5ab8 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 	len  = BTRFS_FID_SIZE_NON_CONNECTABLE;
 	type = FILEID_BTRFS_WITHOUT_PARENT;
 
-	fid->objectid = BTRFS_I(inode)->location.objectid;
+	fid->objectid = inode->i_ino;
 	fid->root_objectid = BTRFS_I(inode)->root->objectid;
 	fid->gen = inode->i_generation;
 
@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 }
 
 static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
-				       u64 root_objectid, u32 generation)
+				       u64 root_objectid, u32 generation,
+				       int check_generation)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
 	struct btrfs_root *root;
+	struct dentry *dentry;
 	struct inode *inode;
 	struct btrfs_key key;
+	int index;
+	int err = 0;
+
+	if (objectid < BTRFS_FIRST_FREE_OBJECTID)
+		return ERR_PTR(-ESTALE);
 
 	key.objectid = root_objectid;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 	key.offset = (u64)-1;
 
-	root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
-	if (IS_ERR(root))
-		return ERR_CAST(root);
+	index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto fail;
+	}
+
+	if (btrfs_root_refs(&root->root_item) == 0) {
+		err = -ENOENT;
+		goto fail;
+	}
 
 	key.objectid = objectid;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
 	inode = btrfs_iget(sb, &key, root);
-	if (IS_ERR(inode))
-		return (void *)inode;
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto fail;
+	}
+
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
 
-	if (generation != inode->i_generation) {
+	if (check_generation && generation != inode->i_generation) {
 		iput(inode);
 		return ERR_PTR(-ESTALE);
 	}
 
-	return d_obtain_alias(inode);
+	dentry = d_obtain_alias(inode);
+	if (!IS_ERR(dentry))
+		dentry->d_op = &btrfs_dentry_operations;
+	return dentry;
+fail:
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
+	return ERR_PTR(err);
 }
 
 static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
 	objectid = fid->parent_objectid;
 	generation = fid->parent_gen;
 
-	return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+	return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
 }
 
 static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
 	root_objectid = fid->root_objectid;
 	generation = fid->gen;
 
-	return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+	return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
 }
 
 static struct dentry *btrfs_get_parent(struct dentry *child)
 {
 	struct inode *dir = child->d_inode;
+	static struct dentry *dentry;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct btrfs_key key;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
-	int slot;
-	u64 objectid;
+	struct btrfs_root_ref *ref;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
 	int ret;
 
 	path = btrfs_alloc_path();
 
-	key.objectid = dir->i_ino;
-	btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
-	key.offset = (u64)-1;
+	if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+		key.objectid = root->root_key.objectid;
+		key.type = BTRFS_ROOT_BACKREF_KEY;
+		key.offset = (u64)-1;
+		root = root->fs_info->tree_root;
+	} else {
+		key.objectid = dir->i_ino;
+		key.type = BTRFS_INODE_REF_KEY;
+		key.offset = (u64)-1;
+	}
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0) {
-		/* Error */
-		btrfs_free_path(path);
-		return ERR_PTR(ret);
+	if (ret < 0)
+		goto fail;
+
+	BUG_ON(ret == 0);
+	if (path->slots[0] == 0) {
+		ret = -ENOENT;
+		goto fail;
 	}
+
+	path->slots[0]--;
 	leaf = path->nodes[0];
-	slot = path->slots[0];
-	if (ret) {
-		/* btrfs_search_slot() returns the slot where we'd want to
-		   insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
-		   The _real_ backref, telling us what the parent inode
-		   _actually_ is, will be in the slot _before_ the one
-		   that btrfs_search_slot() returns. */
-		if (!slot) {
-			/* Unless there is _no_ key in the tree before... */
-			btrfs_free_path(path);
-			return ERR_PTR(-EIO);
-		}
-		slot--;
+
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	if (found_key.objectid != key.objectid || found_key.type != key.type) {
+		ret = -ENOENT;
+		goto fail;
 	}
 
-	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_root_ref);
+		key.objectid = btrfs_root_ref_dirid(leaf, ref);
+	} else {
+		key.objectid = found_key.offset;
+	}
 	btrfs_free_path(path);
 
-	if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
-		return ERR_PTR(-EINVAL);
-
-	objectid = key.offset;
-
-	/* If we are already at the root of a subvol, return the real root */
-	if (objectid == dir->i_ino)
-		return dget(dir->i_sb->s_root);
+	if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+		return btrfs_get_dentry(root->fs_info->sb, key.objectid,
+					found_key.offset, 0, 0);
+	}
 
-	/* Build a new key for the inode item */
-	key.objectid = objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-
-	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	if (!IS_ERR(dentry))
+		dentry->d_op = &btrfs_dentry_operations;
+	return dentry;
+fail:
+	btrfs_free_path(path);
+	return ERR_PTR(ret);
 }
 
 const struct export_operations btrfs_export_ops = {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 535f85ba104..993f93ff7ba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -32,12 +32,12 @@
 #include "locking.h"
 #include "free-space-cache.h"
 
-static int update_reserved_extents(struct btrfs_root *root,
-				   u64 bytenr, u64 num, int reserve);
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc,
 			      int mark_free);
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+				   u64 num_bytes, int reserve);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 parent,
@@ -57,10 +57,17 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 				     u64 parent, u64 root_objectid,
 				     u64 flags, struct btrfs_disk_key *key,
 				     int level, struct btrfs_key *ins);
-
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force);
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_path *path,
+			  u64 bytenr, u64 num_bytes,
+			  int is_data, int reserved,
+			  struct extent_buffer **must_clean);
+static int find_next_key(struct btrfs_path *path, int level,
+			 struct btrfs_key *key);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -153,34 +160,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 	return ret;
 }
 
-/*
- * We always set EXTENT_LOCKED for the super mirror extents so we don't
- * overwrite them, so those bits need to be unset.  Also, if we are unmounting
- * with pinned extents still sitting there because we had a block group caching,
- * we need to clear those now, since we are done.
- */
-void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
+static int add_excluded_extent(struct btrfs_root *root,
+			       u64 start, u64 num_bytes)
 {
-	u64 start, end, last = 0;
-	int ret;
+	u64 end = start + num_bytes - 1;
+	set_extent_bits(&root->fs_info->freed_extents[0],
+			start, end, EXTENT_UPTODATE, GFP_NOFS);
+	set_extent_bits(&root->fs_info->freed_extents[1],
+			start, end, EXTENT_UPTODATE, GFP_NOFS);
+	return 0;
+}
 
-	while (1) {
-		ret = find_first_extent_bit(&info->pinned_extents, last,
-					    &start, &end,
-					    EXTENT_LOCKED|EXTENT_DIRTY);
-		if (ret)
-			break;
+static void free_excluded_extents(struct btrfs_root *root,
+				  struct btrfs_block_group_cache *cache)
+{
+	u64 start, end;
 
-		clear_extent_bits(&info->pinned_extents, start, end,
-				  EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
-		last = end+1;
-	}
+	start = cache->key.objectid;
+	end = start + cache->key.offset - 1;
+
+	clear_extent_bits(&root->fs_info->freed_extents[0],
+			  start, end, EXTENT_UPTODATE, GFP_NOFS);
+	clear_extent_bits(&root->fs_info->freed_extents[1],
+			  start, end, EXTENT_UPTODATE, GFP_NOFS);
 }
 
-static int remove_sb_from_cache(struct btrfs_root *root,
-				struct btrfs_block_group_cache *cache)
+static int exclude_super_stripes(struct btrfs_root *root,
+				 struct btrfs_block_group_cache *cache)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 bytenr;
 	u64 *logical;
 	int stripe_len;
@@ -192,17 +199,42 @@ static int remove_sb_from_cache(struct btrfs_root *root,
 				       cache->key.objectid, bytenr,
 				       0, &logical, &nr, &stripe_len);
 		BUG_ON(ret);
+
 		while (nr--) {
-			try_lock_extent(&fs_info->pinned_extents,
-					logical[nr],
-					logical[nr] + stripe_len - 1, GFP_NOFS);
+			cache->bytes_super += stripe_len;
+			ret = add_excluded_extent(root, logical[nr],
+						  stripe_len);
+			BUG_ON(ret);
 		}
+
 		kfree(logical);
 	}
-
 	return 0;
 }
 
+static struct btrfs_caching_control *
+get_caching_control(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_caching_control *ctl;
+
+	spin_lock(&cache->lock);
+	if (cache->cached != BTRFS_CACHE_STARTED) {
+		spin_unlock(&cache->lock);
+		return NULL;
+	}
+
+	ctl = cache->caching_ctl;
+	atomic_inc(&ctl->count);
+	spin_unlock(&cache->lock);
+	return ctl;
+}
+
+static void put_caching_control(struct btrfs_caching_control *ctl)
+{
+	if (atomic_dec_and_test(&ctl->count))
+		kfree(ctl);
+}
+
 /*
  * this is only called by cache_block_group, since we could have freed extents
  * we need to check the pinned_extents for any extents that can't be used yet
@@ -215,9 +247,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 	int ret;
 
 	while (start < end) {
-		ret = find_first_extent_bit(&info->pinned_extents, start,
+		ret = find_first_extent_bit(info->pinned_extents, start,
 					    &extent_start, &extent_end,
-					    EXTENT_DIRTY|EXTENT_LOCKED);
+					    EXTENT_DIRTY | EXTENT_UPTODATE);
 		if (ret)
 			break;
 
@@ -249,22 +281,27 @@ static int caching_kthread(void *data)
 {
 	struct btrfs_block_group_cache *block_group = data;
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	u64 last = 0;
+	struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
+	struct btrfs_root *extent_root = fs_info->extent_root;
 	struct btrfs_path *path;
-	int ret = 0;
-	struct btrfs_key key;
 	struct extent_buffer *leaf;
-	int slot;
+	struct btrfs_key key;
 	u64 total_found = 0;
-
-	BUG_ON(!fs_info);
+	u64 last = 0;
+	u32 nritems;
+	int ret = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	atomic_inc(&block_group->space_info->caching_threads);
+	exclude_super_stripes(extent_root, block_group);
+	spin_lock(&block_group->space_info->lock);
+	block_group->space_info->bytes_super += block_group->bytes_super;
+	spin_unlock(&block_group->space_info->lock);
+
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+
 	/*
 	 * We don't want to deadlock with somebody trying to allocate a new
 	 * extent for the extent root while also trying to search the extent
@@ -277,74 +314,64 @@ static int caching_kthread(void *data)
 
 	key.objectid = last;
 	key.offset = 0;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+	key.type = BTRFS_EXTENT_ITEM_KEY;
 again:
+	mutex_lock(&caching_ctl->mutex);
 	/* need to make sure the commit_root doesn't disappear */
 	down_read(&fs_info->extent_commit_sem);
 
-	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
 
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+
 	while (1) {
 		smp_mb();
-		if (block_group->fs_info->closing > 1) {
+		if (fs_info->closing > 1) {
 			last = (u64)-1;
 			break;
 		}
 
-		leaf = path->nodes[0];
-		slot = path->slots[0];
-		if (slot >= btrfs_header_nritems(leaf)) {
-			ret = btrfs_next_leaf(fs_info->extent_root, path);
-			if (ret < 0)
-				goto err;
-			else if (ret)
+		if (path->slots[0] < nritems) {
+			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		} else {
+			ret = find_next_key(path, 0, &key);
+			if (ret)
 				break;
 
-			if (need_resched() ||
-			    btrfs_transaction_in_commit(fs_info)) {
-				leaf = path->nodes[0];
-
-				/* this shouldn't happen, but if the
-				 * leaf is empty just move on.
-				 */
-				if (btrfs_header_nritems(leaf) == 0)
-					break;
-				/*
-				 * we need to copy the key out so that
-				 * we are sure the next search advances
-				 * us forward in the btree.
-				 */
-				btrfs_item_key_to_cpu(leaf, &key, 0);
-				btrfs_release_path(fs_info->extent_root, path);
-				up_read(&fs_info->extent_commit_sem);
+			caching_ctl->progress = last;
+			btrfs_release_path(extent_root, path);
+			up_read(&fs_info->extent_commit_sem);
+			mutex_unlock(&caching_ctl->mutex);
+			if (btrfs_transaction_in_commit(fs_info))
 				schedule_timeout(1);
-				goto again;
-			}
+			else
+				cond_resched();
+			goto again;
+		}
 
+		if (key.objectid < block_group->key.objectid) {
+			path->slots[0]++;
 			continue;
 		}
-		btrfs_item_key_to_cpu(leaf, &key, slot);
-		if (key.objectid < block_group->key.objectid)
-			goto next;
 
 		if (key.objectid >= block_group->key.objectid +
 		    block_group->key.offset)
 			break;
 
-		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
+		if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 			total_found += add_new_free_space(block_group,
 							  fs_info, last,
 							  key.objectid);
 			last = key.objectid + key.offset;
-		}
 
-		if (total_found > (1024 * 1024 * 2)) {
-			total_found = 0;
-			wake_up(&block_group->caching_q);
+			if (total_found > (1024 * 1024 * 2)) {
+				total_found = 0;
+				wake_up(&caching_ctl->wait);
+			}
 		}
-next:
 		path->slots[0]++;
 	}
 	ret = 0;
@@ -352,33 +379,65 @@ next:
 	total_found += add_new_free_space(block_group, fs_info, last,
 					  block_group->key.objectid +
 					  block_group->key.offset);
+	caching_ctl->progress = (u64)-1;
 
 	spin_lock(&block_group->lock);
+	block_group->caching_ctl = NULL;
 	block_group->cached = BTRFS_CACHE_FINISHED;
 	spin_unlock(&block_group->lock);
 
 err:
 	btrfs_free_path(path);
 	up_read(&fs_info->extent_commit_sem);
-	atomic_dec(&block_group->space_info->caching_threads);
-	wake_up(&block_group->caching_q);
 
+	free_excluded_extents(extent_root, block_group);
+
+	mutex_unlock(&caching_ctl->mutex);
+	wake_up(&caching_ctl->wait);
+
+	put_caching_control(caching_ctl);
+	atomic_dec(&block_group->space_info->caching_threads);
 	return 0;
 }
 
 static int cache_block_group(struct btrfs_block_group_cache *cache)
 {
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct btrfs_caching_control *caching_ctl;
 	struct task_struct *tsk;
 	int ret = 0;
 
+	smp_mb();
+	if (cache->cached != BTRFS_CACHE_NO)
+		return 0;
+
+	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
+	BUG_ON(!caching_ctl);
+
+	INIT_LIST_HEAD(&caching_ctl->list);
+	mutex_init(&caching_ctl->mutex);
+	init_waitqueue_head(&caching_ctl->wait);
+	caching_ctl->block_group = cache;
+	caching_ctl->progress = cache->key.objectid;
+	/* one for caching kthread, one for caching block group list */
+	atomic_set(&caching_ctl->count, 2);
+
 	spin_lock(&cache->lock);
 	if (cache->cached != BTRFS_CACHE_NO) {
 		spin_unlock(&cache->lock);
-		return ret;
+		kfree(caching_ctl);
+		return 0;
 	}
+	cache->caching_ctl = caching_ctl;
 	cache->cached = BTRFS_CACHE_STARTED;
 	spin_unlock(&cache->lock);
 
+	down_write(&fs_info->extent_commit_sem);
+	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
+	up_write(&fs_info->extent_commit_sem);
+
+	atomic_inc(&cache->space_info->caching_threads);
+
 	tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
 			  cache->key.objectid);
 	if (IS_ERR(tsk)) {
@@ -1657,7 +1716,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 						 parent, ref_root, flags,
 						 ref->objectid, ref->offset,
 						 &ins, node->ref_mod);
-		update_reserved_extents(root, ins.objectid, ins.offset, 0);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
 					     node->num_bytes, parent,
@@ -1783,7 +1841,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 						extent_op->flags_to_set,
 						&extent_op->key,
 						ref->level, &ins);
-		update_reserved_extents(root, ins.objectid, ins.offset, 0);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
 					     node->num_bytes, parent, ref_root,
@@ -1818,16 +1875,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 		BUG_ON(extent_op);
 		head = btrfs_delayed_node_to_head(node);
 		if (insert_reserved) {
+			int mark_free = 0;
+			struct extent_buffer *must_clean = NULL;
+
+			ret = pin_down_bytes(trans, root, NULL,
+					     node->bytenr, node->num_bytes,
+					     head->is_data, 1, &must_clean);
+			if (ret > 0)
+				mark_free = 1;
+
+			if (must_clean) {
+				clean_tree_block(NULL, root, must_clean);
+				btrfs_tree_unlock(must_clean);
+				free_extent_buffer(must_clean);
+			}
 			if (head->is_data) {
 				ret = btrfs_del_csums(trans, root,
 						      node->bytenr,
 						      node->num_bytes);
 				BUG_ON(ret);
 			}
-			btrfs_update_pinned_extents(root, node->bytenr,
-						    node->num_bytes, 1);
-			update_reserved_extents(root, node->bytenr,
-						node->num_bytes, 0);
+			if (mark_free) {
+				ret = btrfs_free_reserved_extent(root,
+							node->bytenr,
+							node->num_bytes);
+				BUG_ON(ret);
+			}
 		}
 		mutex_unlock(&head->mutex);
 		return 0;
@@ -2706,6 +2779,8 @@ int btrfs_check_metadata_free_space(struct btrfs_root *root)
 	/* get the space info for where the metadata will live */
 	alloc_target = btrfs_get_alloc_profile(root, 0);
 	meta_sinfo = __find_space_info(info, alloc_target);
+	if (!meta_sinfo)
+		goto alloc;
 
 again:
 	spin_lock(&meta_sinfo->lock);
@@ -2717,12 +2792,13 @@ again:
 	do_div(thresh, 100);
 
 	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
+	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+	    meta_sinfo->bytes_super > thresh) {
 		struct btrfs_trans_handle *trans;
 		if (!meta_sinfo->full) {
 			meta_sinfo->force_alloc = 1;
 			spin_unlock(&meta_sinfo->lock);
-
+alloc:
 			trans = btrfs_start_transaction(root, 1);
 			if (!trans)
 				return -ENOMEM;
@@ -2730,6 +2806,10 @@ again:
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 					     2 * 1024 * 1024, alloc_target, 0);
 			btrfs_end_transaction(trans, root);
+			if (!meta_sinfo) {
+				meta_sinfo = __find_space_info(info,
+							       alloc_target);
+			}
 			goto again;
 		}
 		spin_unlock(&meta_sinfo->lock);
@@ -2765,13 +2845,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	data_sinfo = BTRFS_I(inode)->space_info;
+	if (!data_sinfo)
+		goto alloc;
+
 again:
 	/* make sure we have enough space to handle the data first */
 	spin_lock(&data_sinfo->lock);
 	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
 	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
 	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
-	    data_sinfo->bytes_may_use < bytes) {
+	    data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
 		struct btrfs_trans_handle *trans;
 
 		/*
@@ -2783,7 +2866,7 @@ again:
 
 			data_sinfo->force_alloc = 1;
 			spin_unlock(&data_sinfo->lock);
-
+alloc:
 			alloc_target = btrfs_get_alloc_profile(root, 1);
 			trans = btrfs_start_transaction(root, 1);
 			if (!trans)
@@ -2795,6 +2878,11 @@ again:
 			btrfs_end_transaction(trans, root);
 			if (ret)
 				return ret;
+
+			if (!data_sinfo) {
+				btrfs_set_inode_space_info(root, inode);
+				data_sinfo = BTRFS_I(inode)->space_info;
+			}
 			goto again;
 		}
 		spin_unlock(&data_sinfo->lock);
@@ -3009,10 +3097,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		num_bytes = min(total, cache->key.offset - byte_in_group);
 		if (alloc) {
 			old_val += num_bytes;
+			btrfs_set_block_group_used(&cache->item, old_val);
+			cache->reserved -= num_bytes;
 			cache->space_info->bytes_used += num_bytes;
+			cache->space_info->bytes_reserved -= num_bytes;
 			if (cache->ro)
 				cache->space_info->bytes_readonly -= num_bytes;
-			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
 		} else {
@@ -3057,127 +3147,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 	return bytenr;
 }
 
-int btrfs_update_pinned_extents(struct btrfs_root *root,
-				u64 bytenr, u64 num, int pin)
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+		     u64 bytenr, u64 num_bytes, int reserved)
 {
-	u64 len;
-	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_group_cache *cache;
 
-	if (pin)
-		set_extent_dirty(&fs_info->pinned_extents,
-				bytenr, bytenr + num - 1, GFP_NOFS);
-
-	while (num > 0) {
-		cache = btrfs_lookup_block_group(fs_info, bytenr);
-		BUG_ON(!cache);
-		len = min(num, cache->key.offset -
-			  (bytenr - cache->key.objectid));
-		if (pin) {
-			spin_lock(&cache->space_info->lock);
-			spin_lock(&cache->lock);
-			cache->pinned += len;
-			cache->space_info->bytes_pinned += len;
-			spin_unlock(&cache->lock);
-			spin_unlock(&cache->space_info->lock);
-			fs_info->total_pinned += len;
-		} else {
-			int unpin = 0;
+	cache = btrfs_lookup_block_group(fs_info, bytenr);
+	BUG_ON(!cache);
 
-			/*
-			 * in order to not race with the block group caching, we
-			 * only want to unpin the extent if we are cached.  If
-			 * we aren't cached, we want to start async caching this
-			 * block group so we can free the extent the next time
-			 * around.
-			 */
-			spin_lock(&cache->space_info->lock);
-			spin_lock(&cache->lock);
-			unpin = (cache->cached == BTRFS_CACHE_FINISHED);
-			if (likely(unpin)) {
-				cache->pinned -= len;
-				cache->space_info->bytes_pinned -= len;
-				fs_info->total_pinned -= len;
-			}
-			spin_unlock(&cache->lock);
-			spin_unlock(&cache->space_info->lock);
+	spin_lock(&cache->space_info->lock);
+	spin_lock(&cache->lock);
+	cache->pinned += num_bytes;
+	cache->space_info->bytes_pinned += num_bytes;
+	if (reserved) {
+		cache->reserved -= num_bytes;
+		cache->space_info->bytes_reserved -= num_bytes;
+	}
+	spin_unlock(&cache->lock);
+	spin_unlock(&cache->space_info->lock);
 
-			if (likely(unpin))
-				clear_extent_dirty(&fs_info->pinned_extents,
-						   bytenr, bytenr + len -1,
-						   GFP_NOFS);
-			else
-				cache_block_group(cache);
+	btrfs_put_block_group(cache);
 
-			if (unpin)
-				btrfs_add_free_space(cache, bytenr, len);
-		}
-		btrfs_put_block_group(cache);
-		bytenr += len;
-		num -= len;
+	set_extent_dirty(fs_info->pinned_extents,
+			 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+	return 0;
+}
+
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+				   u64 num_bytes, int reserve)
+{
+	spin_lock(&cache->space_info->lock);
+	spin_lock(&cache->lock);
+	if (reserve) {
+		cache->reserved += num_bytes;
+		cache->space_info->bytes_reserved += num_bytes;
+	} else {
+		cache->reserved -= num_bytes;
+		cache->space_info->bytes_reserved -= num_bytes;
 	}
+	spin_unlock(&cache->lock);
+	spin_unlock(&cache->space_info->lock);
 	return 0;
 }
 
-static int update_reserved_extents(struct btrfs_root *root,
-				   u64 bytenr, u64 num, int reserve)
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root)
 {
-	u64 len;
-	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_caching_control *next;
+	struct btrfs_caching_control *caching_ctl;
+	struct btrfs_block_group_cache *cache;
 
-	while (num > 0) {
-		cache = btrfs_lookup_block_group(fs_info, bytenr);
-		BUG_ON(!cache);
-		len = min(num, cache->key.offset -
-			  (bytenr - cache->key.objectid));
+	down_write(&fs_info->extent_commit_sem);
 
-		spin_lock(&cache->space_info->lock);
-		spin_lock(&cache->lock);
-		if (reserve) {
-			cache->reserved += len;
-			cache->space_info->bytes_reserved += len;
+	list_for_each_entry_safe(caching_ctl, next,
+				 &fs_info->caching_block_groups, list) {
+		cache = caching_ctl->block_group;
+		if (block_group_cache_done(cache)) {
+			cache->last_byte_to_unpin = (u64)-1;
+			list_del_init(&caching_ctl->list);
+			put_caching_control(caching_ctl);
 		} else {
-			cache->reserved -= len;
-			cache->space_info->bytes_reserved -= len;
+			cache->last_byte_to_unpin = caching_ctl->progress;
 		}
-		spin_unlock(&cache->lock);
-		spin_unlock(&cache->space_info->lock);
-		btrfs_put_block_group(cache);
-		bytenr += len;
-		num -= len;
 	}
+
+	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+		fs_info->pinned_extents = &fs_info->freed_extents[1];
+	else
+		fs_info->pinned_extents = &fs_info->freed_extents[0];
+
+	up_write(&fs_info->extent_commit_sem);
 	return 0;
 }
 
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 {
-	u64 last = 0;
-	u64 start;
-	u64 end;
-	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
-	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_group_cache *cache = NULL;
+	u64 len;
 
-	while (1) {
-		ret = find_first_extent_bit(pinned_extents, last,
-					    &start, &end, EXTENT_DIRTY);
-		if (ret)
-			break;
+	while (start <= end) {
+		if (!cache ||
+		    start >= cache->key.objectid + cache->key.offset) {
+			if (cache)
+				btrfs_put_block_group(cache);
+			cache = btrfs_lookup_block_group(fs_info, start);
+			BUG_ON(!cache);
+		}
+
+		len = cache->key.objectid + cache->key.offset - start;
+		len = min(len, end + 1 - start);
 
-		set_extent_dirty(copy, start, end, GFP_NOFS);
-		last = end + 1;
+		if (start < cache->last_byte_to_unpin) {
+			len = min(len, cache->last_byte_to_unpin - start);
+			btrfs_add_free_space(cache, start, len);
+		}
+
+		spin_lock(&cache->space_info->lock);
+		spin_lock(&cache->lock);
+		cache->pinned -= len;
+		cache->space_info->bytes_pinned -= len;
+		spin_unlock(&cache->lock);
+		spin_unlock(&cache->space_info->lock);
+
+		start += len;
 	}
+
+	if (cache)
+		btrfs_put_block_group(cache);
 	return 0;
 }
 
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       struct extent_io_tree *unpin)
+			       struct btrfs_root *root)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct extent_io_tree *unpin;
 	u64 start;
 	u64 end;
 	int ret;
 
+	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+		unpin = &fs_info->freed_extents[1];
+	else
+		unpin = &fs_info->freed_extents[0];
+
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
@@ -3186,10 +3285,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_discard_extent(root, start, end + 1 - start);
 
-		/* unlocks the pinned mutex */
-		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
-
+		unpin_extent_range(root, start, end);
 		cond_resched();
 	}
 
@@ -3199,7 +3296,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  struct btrfs_path *path,
-			  u64 bytenr, u64 num_bytes, int is_data,
+			  u64 bytenr, u64 num_bytes,
+			  int is_data, int reserved,
 			  struct extent_buffer **must_clean)
 {
 	int err = 0;
@@ -3231,15 +3329,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 	}
 	free_extent_buffer(buf);
 pinit:
-	btrfs_set_path_blocking(path);
+	if (path)
+		btrfs_set_path_blocking(path);
 	/* unlocks the pinned mutex */
-	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+	btrfs_pin_extent(root, bytenr, num_bytes, reserved);
 
 	BUG_ON(err < 0);
 	return 0;
 }
 
-
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 parent,
@@ -3413,7 +3511,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		}
 
 		ret = pin_down_bytes(trans, root, path, bytenr,
-				     num_bytes, is_data, &must_clean);
+				     num_bytes, is_data, 0, &must_clean);
 		if (ret > 0)
 			mark_free = 1;
 		BUG_ON(ret < 0);
@@ -3544,8 +3642,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
 		/* unlocks the pinned mutex */
-		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
-		update_reserved_extents(root, bytenr, num_bytes, 0);
+		btrfs_pin_extent(root, bytenr, num_bytes, 1);
 		ret = 0;
 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
@@ -3585,19 +3682,33 @@ static noinline int
 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
 				u64 num_bytes)
 {
+	struct btrfs_caching_control *caching_ctl;
 	DEFINE_WAIT(wait);
 
-	prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
-
-	if (block_group_cache_done(cache)) {
-		finish_wait(&cache->caching_q, &wait);
+	caching_ctl = get_caching_control(cache);
+	if (!caching_ctl)
 		return 0;
-	}
-	schedule();
-	finish_wait(&cache->caching_q, &wait);
 
-	wait_event(cache->caching_q, block_group_cache_done(cache) ||
+	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
 		   (cache->free_space >= num_bytes));
+
+	put_caching_control(caching_ctl);
+	return 0;
+}
+
+static noinline int
+wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_caching_control *caching_ctl;
+	DEFINE_WAIT(wait);
+
+	caching_ctl = get_caching_control(cache);
+	if (!caching_ctl)
+		return 0;
+
+	wait_event(caching_ctl->wait, block_group_cache_done(cache));
+
+	put_caching_control(caching_ctl);
 	return 0;
 }
 
@@ -3635,6 +3746,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	int last_ptr_loop = 0;
 	int loop = 0;
 	bool found_uncached_bg = false;
+	bool failed_cluster_refill = false;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3732,7 +3844,16 @@ have_block_group:
 		if (unlikely(block_group->ro))
 			goto loop;
 
-		if (last_ptr) {
+		/*
+		 * Ok we want to try and use the cluster allocator, so lets look
+		 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
+		 * have tried the cluster allocator plenty of times at this
+		 * point and not have found anything, so we are likely way too
+		 * fragmented for the clustering stuff to find anything, so lets
+		 * just skip it and let the allocator find whatever block it can
+		 * find
+		 */
+		if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
 			/*
 			 * the refill lock keeps out other
 			 * people trying to start a new cluster
@@ -3807,9 +3928,11 @@ refill_cluster:
 					spin_unlock(&last_ptr->refill_lock);
 					goto checks;
 				}
-			} else if (!cached && loop > LOOP_CACHING_NOWAIT) {
+			} else if (!cached && loop > LOOP_CACHING_NOWAIT
+				   && !failed_cluster_refill) {
 				spin_unlock(&last_ptr->refill_lock);
 
+				failed_cluster_refill = true;
 				wait_block_group_cache_progress(block_group,
 				       num_bytes + empty_cluster + empty_size);
 				goto have_block_group;
@@ -3821,13 +3944,9 @@ refill_cluster:
 			 * cluster.  Free the cluster we've been trying
 			 * to use, and go to the next block group
 			 */
-			if (loop < LOOP_NO_EMPTY_SIZE) {
-				btrfs_return_cluster_to_free_space(NULL,
-								   last_ptr);
-				spin_unlock(&last_ptr->refill_lock);
-				goto loop;
-			}
+			btrfs_return_cluster_to_free_space(NULL, last_ptr);
 			spin_unlock(&last_ptr->refill_lock);
+			goto loop;
 		}
 
 		offset = btrfs_find_space_for_alloc(block_group, search_start,
@@ -3881,9 +4000,12 @@ checks:
 					     search_start - offset);
 		BUG_ON(offset > search_start);
 
+		update_reserved_extents(block_group, num_bytes, 1);
+
 		/* we are all good, lets return */
 		break;
 loop:
+		failed_cluster_refill = false;
 		btrfs_put_block_group(block_group);
 	}
 	up_read(&space_info->groups_sem);
@@ -3973,12 +4095,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	up_read(&info->groups_sem);
 }
 
-static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
-				  u64 num_bytes, u64 min_alloc_size,
-				  u64 empty_size, u64 hint_byte,
-				  u64 search_end, struct btrfs_key *ins,
-				  u64 data)
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 u64 num_bytes, u64 min_alloc_size,
+			 u64 empty_size, u64 hint_byte,
+			 u64 search_end, struct btrfs_key *ins,
+			 u64 data)
 {
 	int ret;
 	u64 search_start = 0;
@@ -4044,25 +4166,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 	ret = btrfs_discard_extent(root, start, len);
 
 	btrfs_add_free_space(cache, start, len);
+	update_reserved_extents(cache, len, 0);
 	btrfs_put_block_group(cache);
-	update_reserved_extents(root, start, len, 0);
-
-	return ret;
-}
-
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
-				  u64 num_bytes, u64 min_alloc_size,
-				  u64 empty_size, u64 hint_byte,
-				  u64 search_end, struct btrfs_key *ins,
-				  u64 data)
-{
-	int ret;
-	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
-				     empty_size, hint_byte, search_end, ins,
-				     data);
-	if (!ret)
-		update_reserved_extents(root, ins->objectid, ins->offset, 1);
 
 	return ret;
 }
@@ -4223,15 +4328,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	struct btrfs_block_group_cache *block_group;
+	struct btrfs_caching_control *caching_ctl;
+	u64 start = ins->objectid;
+	u64 num_bytes = ins->offset;
 
 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
 	cache_block_group(block_group);
-	wait_event(block_group->caching_q,
-		   block_group_cache_done(block_group));
+	caching_ctl = get_caching_control(block_group);
 
-	ret = btrfs_remove_free_space(block_group, ins->objectid,
-				      ins->offset);
-	BUG_ON(ret);
+	if (!caching_ctl) {
+		BUG_ON(!block_group_cache_done(block_group));
+		ret = btrfs_remove_free_space(block_group, start, num_bytes);
+		BUG_ON(ret);
+	} else {
+		mutex_lock(&caching_ctl->mutex);
+
+		if (start >= caching_ctl->progress) {
+			ret = add_excluded_extent(root, start, num_bytes);
+			BUG_ON(ret);
+		} else if (start + num_bytes <= caching_ctl->progress) {
+			ret = btrfs_remove_free_space(block_group,
+						      start, num_bytes);
+			BUG_ON(ret);
+		} else {
+			num_bytes = caching_ctl->progress - start;
+			ret = btrfs_remove_free_space(block_group,
+						      start, num_bytes);
+			BUG_ON(ret);
+
+			start = caching_ctl->progress;
+			num_bytes = ins->objectid + ins->offset -
+				    caching_ctl->progress;
+			ret = add_excluded_extent(root, start, num_bytes);
+			BUG_ON(ret);
+		}
+
+		mutex_unlock(&caching_ctl->mutex);
+		put_caching_control(caching_ctl);
+	}
+
+	update_reserved_extents(block_group, ins->offset, 1);
 	btrfs_put_block_group(block_group);
 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
 					 0, owner, offset, ins, 1);
@@ -4255,9 +4391,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
 	int ret;
 	u64 flags = 0;
 
-	ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
-				     empty_size, hint_byte, search_end,
-				     ins, 0);
+	ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
+				   empty_size, hint_byte, search_end,
+				   ins, 0);
 	if (ret)
 		return ret;
 
@@ -4268,7 +4404,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
 	} else
 		BUG_ON(parent > 0);
 
-	update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
 		struct btrfs_delayed_extent_op *extent_op;
 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
@@ -4347,452 +4482,99 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	return buf;
 }
 
-#if 0
-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, struct extent_buffer *leaf)
-{
-	u64 disk_bytenr;
-	u64 num_bytes;
-	struct btrfs_key key;
-	struct btrfs_file_extent_item *fi;
-	u32 nritems;
-	int i;
-	int ret;
-
-	BUG_ON(!btrfs_is_leaf(leaf));
-	nritems = btrfs_header_nritems(leaf);
-
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		btrfs_item_key_to_cpu(leaf, &key, i);
-
-		/* only extents have references, skip everything else */
-		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-			continue;
-
-		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
-
-		/* inline extents live in the btree, they don't have refs */
-		if (btrfs_file_extent_type(leaf, fi) ==
-		    BTRFS_FILE_EXTENT_INLINE)
-			continue;
-
-		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
-
-		/* holes don't have refs */
-		if (disk_bytenr == 0)
-			continue;
-
-		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
-		ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
-					leaf->start, 0, key.objectid, 0);
-		BUG_ON(ret);
-	}
-	return 0;
-}
-
-static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root,
-					struct btrfs_leaf_ref *ref)
-{
-	int i;
-	int ret;
-	struct btrfs_extent_info *info;
-	struct refsort *sorted;
-
-	if (ref->nritems == 0)
-		return 0;
-
-	sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
-	for (i = 0; i < ref->nritems; i++) {
-		sorted[i].bytenr = ref->extents[i].bytenr;
-		sorted[i].slot = i;
-	}
-	sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
-
-	/*
-	 * the items in the ref were sorted when the ref was inserted
-	 * into the ref cache, so this is already in order
-	 */
-	for (i = 0; i < ref->nritems; i++) {
-		info = ref->extents + sorted[i].slot;
-		ret = btrfs_free_extent(trans, root, info->bytenr,
-					  info->num_bytes, ref->bytenr,
-					  ref->owner, ref->generation,
-					  info->objectid, 0);
-
-		atomic_inc(&root->fs_info->throttle_gen);
-		wake_up(&root->fs_info->transaction_throttle);
-		cond_resched();
-
-		BUG_ON(ret);
-		info++;
-	}
-
-	kfree(sorted);
-	return 0;
-}
-
-
-static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
-				     struct btrfs_root *root, u64 start,
-				     u64 len, u32 *refs)
-{
-	int ret;
-
-	ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
-	BUG_ON(ret);
-
-#if 0 /* some debugging code in case we see problems here */
-	/* if the refs count is one, it won't get increased again.  But
-	 * if the ref count is > 1, someone may be decreasing it at
-	 * the same time we are.
-	 */
-	if (*refs != 1) {
-		struct extent_buffer *eb = NULL;
-		eb = btrfs_find_create_tree_block(root, start, len);
-		if (eb)
-			btrfs_tree_lock(eb);
-
-		mutex_lock(&root->fs_info->alloc_mutex);
-		ret = lookup_extent_ref(NULL, root, start, len, refs);
-		BUG_ON(ret);
-		mutex_unlock(&root->fs_info->alloc_mutex);
-
-		if (eb) {
-			btrfs_tree_unlock(eb);
-			free_extent_buffer(eb);
-		}
-		if (*refs == 1) {
-			printk(KERN_ERR "btrfs block %llu went down to one "
-			       "during drop_snap\n", (unsigned long long)start);
-		}
-
-	}
-#endif
-
-	cond_resched();
-	return ret;
-}
+struct walk_control {
+	u64 refs[BTRFS_MAX_LEVEL];
+	u64 flags[BTRFS_MAX_LEVEL];
+	struct btrfs_key update_progress;
+	int stage;
+	int level;
+	int shared_level;
+	int update_ref;
+	int keep_locks;
+	int reada_slot;
+	int reada_count;
+};
 
+#define DROP_REFERENCE	1
+#define UPDATE_BACKREF	2
 
-/*
- * this is used while deleting old snapshots, and it drops the refs
- * on a whole subtree starting from a level 1 node.
- *
- * The idea is to sort all the leaf pointers, and then drop the
- * ref on all the leaves in order.  Most of the time the leaves
- * will have ref cache entries, so no leaf IOs will be required to
- * find the extents they have references on.
- *
- * For each leaf, any references it has are also dropped in order
- *
- * This ends up dropping the references in something close to optimal
- * order for reading and modifying the extent allocation tree.
- */
-static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root,
-					struct btrfs_path *path)
+static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct walk_control *wc,
+				     struct btrfs_path *path)
 {
 	u64 bytenr;
-	u64 root_owner;
-	u64 root_gen;
-	struct extent_buffer *eb = path->nodes[1];
-	struct extent_buffer *leaf;
-	struct btrfs_leaf_ref *ref;
-	struct refsort *sorted = NULL;
-	int nritems = btrfs_header_nritems(eb);
+	u64 generation;
+	u64 refs;
+	u64 last = 0;
+	u32 nritems;
+	u32 blocksize;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
 	int ret;
-	int i;
-	int refi = 0;
-	int slot = path->slots[1];
-	u32 blocksize = btrfs_level_size(root, 0);
-	u32 refs;
-
-	if (nritems == 0)
-		goto out;
-
-	root_owner = btrfs_header_owner(eb);
-	root_gen = btrfs_header_generation(eb);
-	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+	int slot;
+	int nread = 0;
 
-	/*
-	 * step one, sort all the leaf pointers so we don't scribble
-	 * randomly into the extent allocation tree
-	 */
-	for (i = slot; i < nritems; i++) {
-		sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
-		sorted[refi].slot = i;
-		refi++;
+	if (path->slots[wc->level] < wc->reada_slot) {
+		wc->reada_count = wc->reada_count * 2 / 3;
+		wc->reada_count = max(wc->reada_count, 2);
+	} else {
+		wc->reada_count = wc->reada_count * 3 / 2;
+		wc->reada_count = min_t(int, wc->reada_count,
+					BTRFS_NODEPTRS_PER_BLOCK(root));
 	}
 
-	/*
-	 * nritems won't be zero, but if we're picking up drop_snapshot
-	 * after a crash, slot might be > 0, so double check things
-	 * just in case.
-	 */
-	if (refi == 0)
-		goto out;
+	eb = path->nodes[wc->level];
+	nritems = btrfs_header_nritems(eb);
+	blocksize = btrfs_level_size(root, wc->level - 1);
 
-	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
+		if (nread >= wc->reada_count)
+			break;
 
-	/*
-	 * the first loop frees everything the leaves point to
-	 */
-	for (i = 0; i < refi; i++) {
-		u64 ptr_gen;
+		cond_resched();
+		bytenr = btrfs_node_blockptr(eb, slot);
+		generation = btrfs_node_ptr_generation(eb, slot);
 
-		bytenr = sorted[i].bytenr;
+		if (slot == path->slots[wc->level])
+			goto reada;
 
-		/*
-		 * check the reference count on this leaf.  If it is > 1
-		 * we just decrement it below and don't update any
-		 * of the refs the leaf points to.
-		 */
-		ret = drop_snap_lookup_refcount(trans, root, bytenr,
-						blocksize, &refs);
-		BUG_ON(ret);
-		if (refs != 1)
+		if (wc->stage == UPDATE_BACKREF &&
+		    generation <= root->root_key.offset)
 			continue;
 
-		ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
-
-		/*
-		 * the leaf only had one reference, which means the
-		 * only thing pointing to this leaf is the snapshot
-		 * we're deleting.  It isn't possible for the reference
-		 * count to increase again later
-		 *
-		 * The reference cache is checked for the leaf,
-		 * and if found we'll be able to drop any refs held by
-		 * the leaf without needing to read it in.
-		 */
-		ref = btrfs_lookup_leaf_ref(root, bytenr);
-		if (ref && ref->generation != ptr_gen) {
-			btrfs_free_leaf_ref(root, ref);
-			ref = NULL;
-		}
-		if (ref) {
-			ret = cache_drop_leaf_ref(trans, root, ref);
-			BUG_ON(ret);
-			btrfs_remove_leaf_ref(root, ref);
-			btrfs_free_leaf_ref(root, ref);
-		} else {
-			/*
-			 * the leaf wasn't in the reference cache, so
-			 * we have to read it.
-			 */
-			leaf = read_tree_block(root, bytenr, blocksize,
-					       ptr_gen);
-			ret = btrfs_drop_leaf_ref(trans, root, leaf);
+		if (wc->stage == DROP_REFERENCE) {
+			ret = btrfs_lookup_extent_info(trans, root,
+						bytenr, blocksize,
+						&refs, NULL);
 			BUG_ON(ret);
-			free_extent_buffer(leaf);
-		}
-		atomic_inc(&root->fs_info->throttle_gen);
-		wake_up(&root->fs_info->transaction_throttle);
-		cond_resched();
-	}
-
-	/*
-	 * run through the loop again to free the refs on the leaves.
-	 * This is faster than doing it in the loop above because
-	 * the leaves are likely to be clustered together.  We end up
-	 * working in nice chunks on the extent allocation tree.
-	 */
-	for (i = 0; i < refi; i++) {
-		bytenr = sorted[i].bytenr;
-		ret = btrfs_free_extent(trans, root, bytenr,
-					blocksize, eb->start,
-					root_owner, root_gen, 0, 1);
-		BUG_ON(ret);
-
-		atomic_inc(&root->fs_info->throttle_gen);
-		wake_up(&root->fs_info->transaction_throttle);
-		cond_resched();
-	}
-out:
-	kfree(sorted);
-
-	/*
-	 * update the path to show we've processed the entire level 1
-	 * node.  This will get saved into the root's drop_snapshot_progress
-	 * field so these drops are not repeated again if this transaction
-	 * commits.
-	 */
-	path->slots[1] = nritems;
-	return 0;
-}
-
-/*
- * helper function for drop_snapshot, this walks down the tree dropping ref
- * counts as it goes.
- */
-static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root,
-				   struct btrfs_path *path, int *level)
-{
-	u64 root_owner;
-	u64 root_gen;
-	u64 bytenr;
-	u64 ptr_gen;
-	struct extent_buffer *next;
-	struct extent_buffer *cur;
-	struct extent_buffer *parent;
-	u32 blocksize;
-	int ret;
-	u32 refs;
-
-	WARN_ON(*level < 0);
-	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-	ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
-				path->nodes[*level]->len, &refs);
-	BUG_ON(ret);
-	if (refs > 1)
-		goto out;
-
-	/*
-	 * walk down to the last node level and free all the leaves
-	 */
-	while (*level >= 0) {
-		WARN_ON(*level < 0);
-		WARN_ON(*level >= BTRFS_MAX_LEVEL);
-		cur = path->nodes[*level];
-
-		if (btrfs_header_level(cur) != *level)
-			WARN_ON(1);
+			BUG_ON(refs == 0);
+			if (refs == 1)
+				goto reada;
 
-		if (path->slots[*level] >=
-		    btrfs_header_nritems(cur))
-			break;
-
-		/* the new code goes down to level 1 and does all the
-		 * leaves pointed to that node in bulk.  So, this check
-		 * for level 0 will always be false.
-		 *
-		 * But, the disk format allows the drop_snapshot_progress
-		 * field in the root to leave things in a state where
-		 * a leaf will need cleaning up here.  If someone crashes
-		 * with the old code and then boots with the new code,
-		 * we might find a leaf here.
-		 */
-		if (*level == 0) {
-			ret = btrfs_drop_leaf_ref(trans, root, cur);
-			BUG_ON(ret);
-			break;
+			if (!wc->update_ref ||
+			    generation <= root->root_key.offset)
+				continue;
+			btrfs_node_key_to_cpu(eb, &key, slot);
+			ret = btrfs_comp_cpu_keys(&key,
+						  &wc->update_progress);
+			if (ret < 0)
+				continue;
 		}
-
-		/*
-		 * once we get to level one, process the whole node
-		 * at once, including everything below it.
-		 */
-		if (*level == 1) {
-			ret = drop_level_one_refs(trans, root, path);
-			BUG_ON(ret);
+reada:
+		ret = readahead_tree_block(root, bytenr, blocksize,
+					   generation);
+		if (ret)
 			break;
-		}
-
-		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
-		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
-		blocksize = btrfs_level_size(root, *level - 1);
-
-		ret = drop_snap_lookup_refcount(trans, root, bytenr,
-						blocksize, &refs);
-		BUG_ON(ret);
-
-		/*
-		 * if there is more than one reference, we don't need
-		 * to read that node to drop any references it has.  We
-		 * just drop the ref we hold on that node and move on to the
-		 * next slot in this level.
-		 */
-		if (refs != 1) {
-			parent = path->nodes[*level];
-			root_owner = btrfs_header_owner(parent);
-			root_gen = btrfs_header_generation(parent);
-			path->slots[*level]++;
-
-			ret = btrfs_free_extent(trans, root, bytenr,
-						blocksize, parent->start,
-						root_owner, root_gen,
-						*level - 1, 1);
-			BUG_ON(ret);
-
-			atomic_inc(&root->fs_info->throttle_gen);
-			wake_up(&root->fs_info->transaction_throttle);
-			cond_resched();
-
-			continue;
-		}
-
-		/*
-		 * we need to keep freeing things in the next level down.
-		 * read the block and loop around to process it
-		 */
-		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
-		WARN_ON(*level <= 0);
-		if (path->nodes[*level-1])
-			free_extent_buffer(path->nodes[*level-1]);
-		path->nodes[*level-1] = next;
-		*level = btrfs_header_level(next);
-		path->slots[*level] = 0;
-		cond_resched();
+		last = bytenr + blocksize;
+		nread++;
 	}
-out:
-	WARN_ON(*level < 0);
-	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-
-	if (path->nodes[*level] == root->node) {
-		parent = path->nodes[*level];
-		bytenr = path->nodes[*level]->start;
-	} else {
-		parent = path->nodes[*level + 1];
-		bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
-	}
-
-	blocksize = btrfs_level_size(root, *level);
-	root_owner = btrfs_header_owner(parent);
-	root_gen = btrfs_header_generation(parent);
-
-	/*
-	 * cleanup and free the reference on the last node
-	 * we processed
-	 */
-	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
-				  parent->start, root_owner, root_gen,
-				  *level, 1);
-	free_extent_buffer(path->nodes[*level]);
-	path->nodes[*level] = NULL;
-
-	*level += 1;
-	BUG_ON(ret);
-
-	cond_resched();
-	return 0;
+	wc->reada_slot = slot;
 }
-#endif
-
-struct walk_control {
-	u64 refs[BTRFS_MAX_LEVEL];
-	u64 flags[BTRFS_MAX_LEVEL];
-	struct btrfs_key update_progress;
-	int stage;
-	int level;
-	int shared_level;
-	int update_ref;
-	int keep_locks;
-};
-
-#define DROP_REFERENCE	1
-#define UPDATE_BACKREF	2
 
 /*
  * hepler to process tree block while walking down the tree.
  *
- * when wc->stage == DROP_REFERENCE, this function checks
- * reference count of the block. if the block is shared and
- * we need update back refs for the subtree rooted at the
- * block, this function changes wc->stage to UPDATE_BACKREF
- *
  * when wc->stage == UPDATE_BACKREF, this function updates
  * back refs for pointers in the block.
  *
@@ -4805,7 +4587,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 {
 	int level = wc->level;
 	struct extent_buffer *eb = path->nodes[level];
-	struct btrfs_key key;
 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 	int ret;
 
@@ -4828,21 +4609,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 		BUG_ON(wc->refs[level] == 0);
 	}
 
-	if (wc->stage == DROP_REFERENCE &&
-	    wc->update_ref && wc->refs[level] > 1) {
-		BUG_ON(eb == root->node);
-		BUG_ON(path->slots[level] > 0);
-		if (level == 0)
-			btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
-		else
-			btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
-		if (btrfs_header_owner(eb) == root->root_key.objectid &&
-		    btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
-			wc->stage = UPDATE_BACKREF;
-			wc->shared_level = level;
-		}
-	}
-
 	if (wc->stage == DROP_REFERENCE) {
 		if (wc->refs[level] > 1)
 			return 1;
@@ -4879,6 +4645,123 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 }
 
 /*
+ * hepler to process tree block pointer.
+ *
+ * when wc->stage == DROP_REFERENCE, this function checks
+ * reference count of the block pointed to. if the block
+ * is shared and we need update back refs for the subtree
+ * rooted at the block, this function changes wc->stage to
+ * UPDATE_BACKREF. if the block is shared and there is no
+ * need to update back, this function drops the reference
+ * to the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int do_walk_down(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct walk_control *wc)
+{
+	u64 bytenr;
+	u64 generation;
+	u64 parent;
+	u32 blocksize;
+	struct btrfs_key key;
+	struct extent_buffer *next;
+	int level = wc->level;
+	int reada = 0;
+	int ret = 0;
+
+	generation = btrfs_node_ptr_generation(path->nodes[level],
+					       path->slots[level]);
+	/*
+	 * if the lower level block was created before the snapshot
+	 * was created, we know there is no need to update back refs
+	 * for the subtree
+	 */
+	if (wc->stage == UPDATE_BACKREF &&
+	    generation <= root->root_key.offset)
+		return 1;
+
+	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+	blocksize = btrfs_level_size(root, level - 1);
+
+	next = btrfs_find_tree_block(root, bytenr, blocksize);
+	if (!next) {
+		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+		reada = 1;
+	}
+	btrfs_tree_lock(next);
+	btrfs_set_lock_blocking(next);
+
+	if (wc->stage == DROP_REFERENCE) {
+		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+					       &wc->refs[level - 1],
+					       &wc->flags[level - 1]);
+		BUG_ON(ret);
+		BUG_ON(wc->refs[level - 1] == 0);
+
+		if (wc->refs[level - 1] > 1) {
+			if (!wc->update_ref ||
+			    generation <= root->root_key.offset)
+				goto skip;
+
+			btrfs_node_key_to_cpu(path->nodes[level], &key,
+					      path->slots[level]);
+			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
+			if (ret < 0)
+				goto skip;
+
+			wc->stage = UPDATE_BACKREF;
+			wc->shared_level = level - 1;
+		}
+	}
+
+	if (!btrfs_buffer_uptodate(next, generation)) {
+		btrfs_tree_unlock(next);
+		free_extent_buffer(next);
+		next = NULL;
+	}
+
+	if (!next) {
+		if (reada && level == 1)
+			reada_walk_down(trans, root, wc, path);
+		next = read_tree_block(root, bytenr, blocksize, generation);
+		btrfs_tree_lock(next);
+		btrfs_set_lock_blocking(next);
+	}
+
+	level--;
+	BUG_ON(level != btrfs_header_level(next));
+	path->nodes[level] = next;
+	path->slots[level] = 0;
+	path->locks[level] = 1;
+	wc->level = level;
+	if (wc->level == 1)
+		wc->reada_slot = 0;
+	return 0;
+skip:
+	wc->refs[level - 1] = 0;
+	wc->flags[level - 1] = 0;
+
+	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+		parent = path->nodes[level]->start;
+	} else {
+		BUG_ON(root->root_key.objectid !=
+		       btrfs_header_owner(path->nodes[level]));
+		parent = 0;
+	}
+
+	ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
+				root->root_key.objectid, level - 1, 0);
+	BUG_ON(ret);
+
+	btrfs_tree_unlock(next);
+	free_extent_buffer(next);
+	return 1;
+}
+
+/*
  * hepler to process tree block while walking up the tree.
  *
  * when wc->stage == DROP_REFERENCE, this function drops
@@ -4905,7 +4788,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 		if (level < wc->shared_level)
 			goto out;
 
-		BUG_ON(wc->refs[level] <= 1);
 		ret = find_next_key(path, level + 1, &wc->update_progress);
 		if (ret > 0)
 			wc->update_ref = 0;
@@ -4936,8 +4818,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 				path->locks[level] = 0;
 				return 1;
 			}
-		} else {
-			BUG_ON(level != 0);
 		}
 	}
 
@@ -4990,17 +4870,13 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 				   struct btrfs_path *path,
 				   struct walk_control *wc)
 {
-	struct extent_buffer *next;
-	struct extent_buffer *cur;
-	u64 bytenr;
-	u64 ptr_gen;
-	u32 blocksize;
 	int level = wc->level;
 	int ret;
 
 	while (level >= 0) {
-		cur = path->nodes[level];
-		BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
+		if (path->slots[level] >=
+		    btrfs_header_nritems(path->nodes[level]))
+			break;
 
 		ret = walk_down_proc(trans, root, path, wc);
 		if (ret > 0)
@@ -5009,20 +4885,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (level == 0)
 			break;
 
-		bytenr = btrfs_node_blockptr(cur, path->slots[level]);
-		blocksize = btrfs_level_size(root, level - 1);
-		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
-
-		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
-		btrfs_tree_lock(next);
-		btrfs_set_lock_blocking(next);
-
-		level--;
-		BUG_ON(level != btrfs_header_level(next));
-		path->nodes[level] = next;
-		path->slots[level] = 0;
-		path->locks[level] = 1;
-		wc->level = level;
+		ret = do_walk_down(trans, root, path, wc);
+		if (ret > 0) {
+			path->slots[level]++;
+			continue;
+		}
+		level = wc->level;
 	}
 	return 0;
 }
@@ -5112,9 +4980,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
 			err = ret;
 			goto out;
 		}
-		btrfs_node_key_to_cpu(path->nodes[level], &key,
-				      path->slots[level]);
-		WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
+		WARN_ON(ret > 0);
 
 		/*
 		 * unlock our path, this is safe because only this
@@ -5149,6 +5015,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
 	wc->stage = DROP_REFERENCE;
 	wc->update_ref = update_ref;
 	wc->keep_locks = 0;
+	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
 	while (1) {
 		ret = walk_down_tree(trans, root, path, wc);
@@ -5201,9 +5068,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
 	ret = btrfs_del_root(trans, tree_root, &root->root_key);
 	BUG_ON(ret);
 
-	free_extent_buffer(root->node);
-	free_extent_buffer(root->commit_root);
-	kfree(root);
+	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+		ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
+					   NULL, NULL);
+		BUG_ON(ret < 0);
+		if (ret > 0) {
+			ret = btrfs_del_orphan_item(trans, tree_root,
+						    root->root_key.objectid);
+			BUG_ON(ret);
+		}
+	}
+
+	if (root->in_radix) {
+		btrfs_free_fs_root(tree_root->fs_info, root);
+	} else {
+		free_extent_buffer(root->node);
+		free_extent_buffer(root->commit_root);
+		kfree(root);
+	}
 out:
 	btrfs_end_transaction(trans, tree_root);
 	kfree(wc);
@@ -5255,6 +5137,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	wc->stage = DROP_REFERENCE;
 	wc->update_ref = 0;
 	wc->keep_locks = 1;
+	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
 	while (1) {
 		wret = walk_down_tree(trans, root, path, wc);
@@ -5397,9 +5280,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
 	lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
 	while (1) {
 		int ret;
-		spin_lock(&em_tree->lock);
+		write_lock(&em_tree->lock);
 		ret = add_extent_mapping(em_tree, em);
-		spin_unlock(&em_tree->lock);
+		write_unlock(&em_tree->lock);
 		if (ret != -EEXIST) {
 			free_extent_map(em);
 			break;
@@ -6842,287 +6725,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
 	return 0;
 }
 
-#if 0
-static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
-				 u64 objectid, u64 size)
-{
-	struct btrfs_path *path;
-	struct btrfs_inode_item *item;
-	struct extent_buffer *leaf;
-	int ret;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	path->leave_spinning = 1;
-	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
-	if (ret)
-		goto out;
-
-	leaf = path->nodes[0];
-	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
-	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
-	btrfs_set_inode_generation(leaf, item, 1);
-	btrfs_set_inode_size(leaf, item, size);
-	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
-	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
-	btrfs_mark_buffer_dirty(leaf);
-	btrfs_release_path(root, path);
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
-static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
-					struct btrfs_block_group_cache *group)
+/*
+ * checks to see if its even possible to relocate this block group.
+ *
+ * @return - -1 if it's not a good idea to relocate this block group, 0 if its
+ * ok to go ahead and try.
+ */
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 {
-	struct inode *inode = NULL;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root;
-	struct btrfs_key root_key;
-	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
-	int err = 0;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_device *device;
+	int full = 0;
+	int ret = 0;
 
-	root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
-	root_key.type = BTRFS_ROOT_ITEM_KEY;
-	root_key.offset = (u64)-1;
-	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
-	if (IS_ERR(root))
-		return ERR_CAST(root);
+	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
 
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
+	/* odd, couldn't find the block group, leave it alone */
+	if (!block_group)
+		return -1;
 
-	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
-	if (err)
+	/* no bytes used, we're good */
+	if (!btrfs_block_group_used(&block_group->item))
 		goto out;
 
-	err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
-	BUG_ON(err);
-
-	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-				       group->key.offset, 0, group->key.offset,
-				       0, 0, 0);
-	BUG_ON(err);
-
-	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
-	if (inode->i_state & I_NEW) {
-		BTRFS_I(inode)->root = root;
-		BTRFS_I(inode)->location.objectid = objectid;
-		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
-		BTRFS_I(inode)->location.offset = 0;
-		btrfs_read_locked_inode(inode);
-		unlock_new_inode(inode);
-		BUG_ON(is_bad_inode(inode));
-	} else {
-		BUG_ON(1);
-	}
-	BTRFS_I(inode)->index_cnt = group->key.objectid;
-
-	err = btrfs_orphan_add(trans, inode);
-out:
-	btrfs_end_transaction(trans, root);
-	if (err) {
-		if (inode)
-			iput(inode);
-		inode = ERR_PTR(err);
-	}
-	return inode;
-}
-
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
-{
-
-	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
-	struct btrfs_ordered_extent *ordered;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct list_head list;
-	size_t offset;
-	int ret;
-	u64 disk_bytenr;
-
-	INIT_LIST_HEAD(&list);
-
-	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
-	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
-
-	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
-	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
-				       disk_bytenr + len - 1, &list);
-
-	while (!list_empty(&list)) {
-		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
-		list_del_init(&sums->list);
-
-		sector_sum = sums->sums;
-		sums->bytenr = ordered->start;
+	space_info = block_group->space_info;
+	spin_lock(&space_info->lock);
 
-		offset = 0;
-		while (offset < sums->len) {
-			sector_sum->bytenr += ordered->start - disk_bytenr;
-			sector_sum++;
-			offset += root->sectorsize;
-		}
+	full = space_info->full;
 
-		btrfs_add_ordered_sum(inode, ordered, sums);
+	/*
+	 * if this is the last block group we have in this space, we can't
+	 * relocate it unless we're able to allocate a new chunk below.
+	 *
+	 * Otherwise, we need to make sure we have room in the space to handle
+	 * all of the extents from this block group.  If we can, we're good
+	 */
+	if ((space_info->total_bytes != block_group->key.offset) &&
+	   (space_info->bytes_used + space_info->bytes_reserved +
+	    space_info->bytes_pinned + space_info->bytes_readonly +
+	    btrfs_block_group_used(&block_group->item) <
+	    space_info->total_bytes)) {
+		spin_unlock(&space_info->lock);
+		goto out;
 	}
-	btrfs_put_ordered_extent(ordered);
-	return 0;
-}
-
-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
-{
-	struct btrfs_trans_handle *trans;
-	struct btrfs_path *path;
-	struct btrfs_fs_info *info = root->fs_info;
-	struct extent_buffer *leaf;
-	struct inode *reloc_inode;
-	struct btrfs_block_group_cache *block_group;
-	struct btrfs_key key;
-	u64 skipped;
-	u64 cur_byte;
-	u64 total_found;
-	u32 nritems;
-	int ret;
-	int progress;
-	int pass = 0;
-
-	root = root->fs_info->extent_root;
-
-	block_group = btrfs_lookup_block_group(info, group_start);
-	BUG_ON(!block_group);
-
-	printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
-	       (unsigned long long)block_group->key.objectid,
-	       (unsigned long long)block_group->flags);
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-
-	reloc_inode = create_reloc_inode(info, block_group);
-	BUG_ON(IS_ERR(reloc_inode));
-
-	__alloc_chunk_for_shrink(root, block_group, 1);
-	set_block_group_readonly(block_group);
-
-	btrfs_start_delalloc_inodes(info->tree_root);
-	btrfs_wait_ordered_extents(info->tree_root, 0);
-again:
-	skipped = 0;
-	total_found = 0;
-	progress = 0;
-	key.objectid = block_group->key.objectid;
-	key.offset = 0;
-	key.type = 0;
-	cur_byte = key.objectid;
-
-	trans = btrfs_start_transaction(info->tree_root, 1);
-	btrfs_commit_transaction(trans, info->tree_root);
+	spin_unlock(&space_info->lock);
 
-	mutex_lock(&root->fs_info->cleaner_mutex);
-	btrfs_clean_old_snapshots(info->tree_root);
-	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
-	mutex_unlock(&root->fs_info->cleaner_mutex);
+	/*
+	 * ok we don't have enough space, but maybe we have free space on our
+	 * devices to allocate new chunks for relocation, so loop through our
+	 * alloc devices and guess if we have enough space.  However, if we
+	 * were marked as full, then we know there aren't enough chunks, and we
+	 * can just return.
+	 */
+	ret = -1;
+	if (full)
+		goto out;
 
-	trans = btrfs_start_transaction(info->tree_root, 1);
-	btrfs_commit_transaction(trans, info->tree_root);
+	mutex_lock(&root->fs_info->chunk_mutex);
+	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+		u64 min_free = btrfs_block_group_used(&block_group->item);
+		u64 dev_offset, max_avail;
 
-	while (1) {
-		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		if (ret < 0)
-			goto out;
-next:
-		leaf = path->nodes[0];
-		nritems = btrfs_header_nritems(leaf);
-		if (path->slots[0] >= nritems) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret < 0)
-				goto out;
-			if (ret == 1) {
-				ret = 0;
+		/*
+		 * check to make sure we can actually find a chunk with enough
+		 * space to fit our block group in.
+		 */
+		if (device->total_bytes > device->bytes_used + min_free) {
+			ret = find_free_dev_extent(NULL, device, min_free,
+						   &dev_offset, &max_avail);
+			if (!ret)
 				break;
-			}
-			leaf = path->nodes[0];
-			nritems = btrfs_header_nritems(leaf);
+			ret = -1;
 		}
-
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-
-		if (key.objectid >= block_group->key.objectid +
-		    block_group->key.offset)
-			break;
-
-		if (progress && need_resched()) {
-			btrfs_release_path(root, path);
-			cond_resched();
-			progress = 0;
-			continue;
-		}
-		progress = 1;
-
-		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
-		    key.objectid + key.offset <= cur_byte) {
-			path->slots[0]++;
-			goto next;
-		}
-
-		total_found++;
-		cur_byte = key.objectid + key.offset;
-		btrfs_release_path(root, path);
-
-		__alloc_chunk_for_shrink(root, block_group, 0);
-		ret = relocate_one_extent(root, path, &key, block_group,
-					  reloc_inode, pass);
-		BUG_ON(ret < 0);
-		if (ret > 0)
-			skipped++;
-
-		key.objectid = cur_byte;
-		key.type = 0;
-		key.offset = 0;
-	}
-
-	btrfs_release_path(root, path);
-
-	if (pass == 0) {
-		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
-		invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
-	}
-
-	if (total_found > 0) {
-		printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
-		       (unsigned long long)total_found, pass);
-		pass++;
-		if (total_found == skipped && pass > 2) {
-			iput(reloc_inode);
-			reloc_inode = create_reloc_inode(info, block_group);
-			pass = 0;
-		}
-		goto again;
 	}
-
-	/* delete reloc_inode */
-	iput(reloc_inode);
-
-	/* unpin extents in this range */
-	trans = btrfs_start_transaction(info->tree_root, 1);
-	btrfs_commit_transaction(trans, info->tree_root);
-
-	spin_lock(&block_group->lock);
-	WARN_ON(block_group->pinned > 0);
-	WARN_ON(block_group->reserved > 0);
-	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
-	spin_unlock(&block_group->lock);
-	btrfs_put_block_group(block_group);
-	ret = 0;
+	mutex_unlock(&root->fs_info->chunk_mutex);
 out:
-	btrfs_free_path(path);
+	btrfs_put_block_group(block_group);
 	return ret;
 }
-#endif
 
 static int find_first_block_group(struct btrfs_root *root,
 		struct btrfs_path *path, struct btrfs_key *key)
@@ -7165,8 +6847,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_space_info *space_info;
+	struct btrfs_caching_control *caching_ctl;
 	struct rb_node *n;
 
+	down_write(&info->extent_commit_sem);
+	while (!list_empty(&info->caching_block_groups)) {
+		caching_ctl = list_entry(info->caching_block_groups.next,
+					 struct btrfs_caching_control, list);
+		list_del(&caching_ctl->list);
+		put_caching_control(caching_ctl);
+	}
+	up_write(&info->extent_commit_sem);
+
 	spin_lock(&info->block_group_cache_lock);
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
 		block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -7180,8 +6872,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		up_write(&block_group->space_info->groups_sem);
 
 		if (block_group->cached == BTRFS_CACHE_STARTED)
-			wait_event(block_group->caching_q,
-				   block_group_cache_done(block_group));
+			wait_block_group_cache_done(block_group);
 
 		btrfs_remove_free_space_cache(block_group);
 
@@ -7251,7 +6942,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		spin_lock_init(&cache->lock);
 		spin_lock_init(&cache->tree_lock);
 		cache->fs_info = info;
-		init_waitqueue_head(&cache->caching_q);
 		INIT_LIST_HEAD(&cache->list);
 		INIT_LIST_HEAD(&cache->cluster_list);
 
@@ -7273,8 +6963,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		cache->flags = btrfs_block_group_flags(&cache->item);
 		cache->sectorsize = root->sectorsize;
 
-		remove_sb_from_cache(root, cache);
-
 		/*
 		 * check for two cases, either we are full, and therefore
 		 * don't need to bother with the caching work since we won't
@@ -7283,13 +6971,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		 * time, particularly in the full case.
 		 */
 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+			exclude_super_stripes(root, cache);
+			cache->last_byte_to_unpin = (u64)-1;
 			cache->cached = BTRFS_CACHE_FINISHED;
+			free_excluded_extents(root, cache);
 		} else if (btrfs_block_group_used(&cache->item) == 0) {
+			exclude_super_stripes(root, cache);
+			cache->last_byte_to_unpin = (u64)-1;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			add_new_free_space(cache, root->fs_info,
 					   found_key.objectid,
 					   found_key.objectid +
 					   found_key.offset);
+			free_excluded_extents(root, cache);
 		}
 
 		ret = update_space_info(info, cache->flags, found_key.offset,
@@ -7297,6 +6991,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 					&space_info);
 		BUG_ON(ret);
 		cache->space_info = space_info;
+		spin_lock(&cache->space_info->lock);
+		cache->space_info->bytes_super += cache->bytes_super;
+		spin_unlock(&cache->space_info->lock);
+
 		down_write(&space_info->groups_sem);
 		list_add_tail(&cache->list, &space_info->block_groups);
 		up_write(&space_info->groups_sem);
@@ -7346,7 +7044,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
 	spin_lock_init(&cache->tree_lock);
-	init_waitqueue_head(&cache->caching_q);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
 
@@ -7355,15 +7052,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->flags = type;
 	btrfs_set_block_group_flags(&cache->item, type);
 
+	cache->last_byte_to_unpin = (u64)-1;
 	cache->cached = BTRFS_CACHE_FINISHED;
-	remove_sb_from_cache(root, cache);
+	exclude_super_stripes(root, cache);
 
 	add_new_free_space(cache, root->fs_info, chunk_offset,
 			   chunk_offset + size);
 
+	free_excluded_extents(root, cache);
+
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
 	BUG_ON(ret);
+
+	spin_lock(&cache->space_info->lock);
+	cache->space_info->bytes_super += cache->bytes_super;
+	spin_unlock(&cache->space_info->lock);
+
 	down_write(&cache->space_info->groups_sem);
 	list_add_tail(&cache->list, &cache->space_info->block_groups);
 	up_write(&cache->space_info->groups_sem);
@@ -7429,8 +7134,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	up_write(&block_group->space_info->groups_sem);
 
 	if (block_group->cached == BTRFS_CACHE_STARTED)
-		wait_event(block_group->caching_q,
-			   block_group_cache_done(block_group));
+		wait_block_group_cache_done(block_group);
 
 	btrfs_remove_free_space_cache(block_group);
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68260180f58..0cb88f8146e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -367,10 +367,10 @@ static int insert_state(struct extent_io_tree *tree,
 	}
 	if (bits & EXTENT_DIRTY)
 		tree->dirty_bytes += end - start + 1;
-	set_state_cb(tree, state, bits);
-	state->state |= bits;
 	state->start = start;
 	state->end = end;
+	set_state_cb(tree, state, bits);
+	state->state |= bits;
 	node = tree_insert(&tree->state, end, &state->rb_node);
 	if (node) {
 		struct extent_state *found;
@@ -471,10 +471,14 @@ static int clear_state_bit(struct extent_io_tree *tree,
  * bits were already set, or zero if none of the bits were already set.
  */
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, int wake, int delete, gfp_t mask)
+		     int bits, int wake, int delete,
+		     struct extent_state **cached_state,
+		     gfp_t mask)
 {
 	struct extent_state *state;
+	struct extent_state *cached;
 	struct extent_state *prealloc = NULL;
+	struct rb_node *next_node;
 	struct rb_node *node;
 	u64 last_end;
 	int err;
@@ -488,6 +492,17 @@ again:
 	}
 
 	spin_lock(&tree->lock);
+	if (cached_state) {
+		cached = *cached_state;
+		*cached_state = NULL;
+		cached_state = NULL;
+		if (cached && cached->tree && cached->start == start) {
+			atomic_dec(&cached->refs);
+			state = cached;
+			goto hit_next;
+		}
+		free_extent_state(cached);
+	}
 	/*
 	 * this search will find the extents that end after
 	 * our range starts
@@ -496,6 +511,7 @@ again:
 	if (!node)
 		goto out;
 	state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
 	if (state->start > end)
 		goto out;
 	WARN_ON(state->end < start);
@@ -531,8 +547,6 @@ again:
 			if (last_end == (u64)-1)
 				goto out;
 			start = last_end + 1;
-		} else {
-			start = state->start;
 		}
 		goto search_again;
 	}
@@ -550,16 +564,28 @@ again:
 
 		if (wake)
 			wake_up(&state->wq);
+
 		set |= clear_state_bit(tree, prealloc, bits,
 				       wake, delete);
 		prealloc = NULL;
 		goto out;
 	}
 
+	if (state->end < end && prealloc && !need_resched())
+		next_node = rb_next(&state->rb_node);
+	else
+		next_node = NULL;
+
 	set |= clear_state_bit(tree, state, bits, wake, delete);
 	if (last_end == (u64)-1)
 		goto out;
 	start = last_end + 1;
+	if (start <= end && next_node) {
+		state = rb_entry(next_node, struct extent_state,
+				 rb_node);
+		if (state->start == start)
+			goto hit_next;
+	}
 	goto search_again;
 
 out:
@@ -653,28 +679,40 @@ static void set_state_bits(struct extent_io_tree *tree,
 	state->state |= bits;
 }
 
+static void cache_state(struct extent_state *state,
+			struct extent_state **cached_ptr)
+{
+	if (cached_ptr && !(*cached_ptr)) {
+		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
+			*cached_ptr = state;
+			atomic_inc(&state->refs);
+		}
+	}
+}
+
 /*
- * set some bits on a range in the tree.  This may require allocations
- * or sleeping, so the gfp mask is used to indicate what is allowed.
+ * set some bits on a range in the tree.  This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
  *
- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
- * range already has the desired bits set.  The start of the existing
- * range is returned in failed_start in this case.
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set.  The start of the
+ * existing range is returned in failed_start in this case.
  *
- * [start, end] is inclusive
- * This takes the tree lock.
+ * [start, end] is inclusive This takes the tree lock.
  */
+
 static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-			  int bits, int exclusive, u64 *failed_start,
+			  int bits, int exclusive_bits, u64 *failed_start,
+			  struct extent_state **cached_state,
 			  gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
 	struct rb_node *node;
 	int err = 0;
-	int set;
 	u64 last_start;
 	u64 last_end;
+
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
 		prealloc = alloc_extent_state(mask);
@@ -683,6 +721,13 @@ again:
 	}
 
 	spin_lock(&tree->lock);
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->start == start && state->tree) {
+			node = &state->rb_node;
+			goto hit_next;
+		}
+	}
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -694,8 +739,8 @@ again:
 		BUG_ON(err == -EEXIST);
 		goto out;
 	}
-
 	state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
 	last_start = state->start;
 	last_end = state->end;
 
@@ -706,17 +751,29 @@ again:
 	 * Just lock what we found and keep going
 	 */
 	if (state->start == start && state->end <= end) {
-		set = state->state & bits;
-		if (set && exclusive) {
+		struct rb_node *next_node;
+		if (state->state & exclusive_bits) {
 			*failed_start = state->start;
 			err = -EEXIST;
 			goto out;
 		}
+
 		set_state_bits(tree, state, bits);
+		cache_state(state, cached_state);
 		merge_state(tree, state);
 		if (last_end == (u64)-1)
 			goto out;
+
 		start = last_end + 1;
+		if (start < end && prealloc && !need_resched()) {
+			next_node = rb_next(node);
+			if (next_node) {
+				state = rb_entry(next_node, struct extent_state,
+						 rb_node);
+				if (state->start == start)
+					goto hit_next;
+			}
+		}
 		goto search_again;
 	}
 
@@ -737,8 +794,7 @@ again:
 	 * desired bit on it.
 	 */
 	if (state->start < start) {
-		set = state->state & bits;
-		if (exclusive && set) {
+		if (state->state & exclusive_bits) {
 			*failed_start = start;
 			err = -EEXIST;
 			goto out;
@@ -750,12 +806,11 @@ again:
 			goto out;
 		if (state->end <= end) {
 			set_state_bits(tree, state, bits);
+			cache_state(state, cached_state);
 			merge_state(tree, state);
 			if (last_end == (u64)-1)
 				goto out;
 			start = last_end + 1;
-		} else {
-			start = state->start;
 		}
 		goto search_again;
 	}
@@ -774,6 +829,7 @@ again:
 			this_end = last_start - 1;
 		err = insert_state(tree, prealloc, start, this_end,
 				   bits);
+		cache_state(prealloc, cached_state);
 		prealloc = NULL;
 		BUG_ON(err == -EEXIST);
 		if (err)
@@ -788,8 +844,7 @@ again:
 	 * on the first half
 	 */
 	if (state->start <= end && state->end > end) {
-		set = state->state & bits;
-		if (exclusive && set) {
+		if (state->state & exclusive_bits) {
 			*failed_start = start;
 			err = -EEXIST;
 			goto out;
@@ -798,6 +853,7 @@ again:
 		BUG_ON(err == -EEXIST);
 
 		set_state_bits(tree, prealloc, bits);
+		cache_state(prealloc, cached_state);
 		merge_state(tree, prealloc);
 		prealloc = NULL;
 		goto out;
@@ -826,86 +882,64 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
-			      mask);
-}
-
-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
-		       gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
+			      NULL, mask);
 }
 
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, bits, 0, NULL,
-			      mask);
+			      NULL, mask);
 }
 
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      int bits, gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
-			      EXTENT_DELALLOC | EXTENT_DIRTY,
-			      0, NULL, mask);
+			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+			      0, NULL, NULL, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end,
-				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
-}
-
-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
-			 gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
+				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+				NULL, mask);
 }
 
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
-			      mask);
+			      NULL, mask);
 }
 
 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
+	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
+				NULL, mask);
 }
 
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
-			      mask);
+			      NULL, mask);
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
 				 u64 end, gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
-}
-
-static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
-			 gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
-			      0, NULL, mask);
-}
-
-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
-				  u64 end, gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
+	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+				NULL, mask);
 }
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -917,13 +951,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
  * either insert or lock state struct between start and end use mask to tell
  * us if waiting is desired.
  */
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		     int bits, struct extent_state **cached_state, gfp_t mask)
 {
 	int err;
 	u64 failed_start;
 	while (1) {
-		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
-				     &failed_start, mask);
+		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+				     EXTENT_LOCKED, &failed_start,
+				     cached_state, mask);
 		if (err == -EEXIST && (mask & __GFP_WAIT)) {
 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
 			start = failed_start;
@@ -935,27 +971,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 	return err;
 }
 
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+	return lock_extent_bits(tree, start, end, 0, NULL, mask);
+}
+
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		    gfp_t mask)
 {
 	int err;
 	u64 failed_start;
 
-	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
-			     &failed_start, mask);
+	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+			     &failed_start, NULL, mask);
 	if (err == -EEXIST) {
 		if (failed_start > start)
 			clear_extent_bit(tree, start, failed_start - 1,
-					 EXTENT_LOCKED, 1, 0, mask);
+					 EXTENT_LOCKED, 1, 0, NULL, mask);
 		return 0;
 	}
 	return 1;
 }
 
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+			 struct extent_state **cached, gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+				mask);
+}
+
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		  gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
+				mask);
 }
 
 /*
@@ -974,7 +1023,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
 		page_cache_release(page);
 		index++;
 	}
-	set_extent_dirty(tree, start, end, GFP_NOFS);
 	return 0;
 }
 
@@ -994,7 +1042,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 		page_cache_release(page);
 		index++;
 	}
-	set_extent_writeback(tree, start, end, GFP_NOFS);
 	return 0;
 }
 
@@ -1232,6 +1279,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
 	u64 delalloc_start;
 	u64 delalloc_end;
 	u64 found;
+	struct extent_state *cached_state = NULL;
 	int ret;
 	int loops = 0;
 
@@ -1269,6 +1317,7 @@ again:
 		/* some of the pages are gone, lets avoid looping by
 		 * shortening the size of the delalloc range we're searching
 		 */
+		free_extent_state(cached_state);
 		if (!loops) {
 			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
 			max_bytes = PAGE_CACHE_SIZE - offset;
@@ -1282,18 +1331,21 @@ again:
 	BUG_ON(ret);
 
 	/* step three, lock the state bits for the whole range */
-	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+	lock_extent_bits(tree, delalloc_start, delalloc_end,
+			 0, &cached_state, GFP_NOFS);
 
 	/* then test to make sure it is all still delalloc */
 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
-			     EXTENT_DELALLOC, 1);
+			     EXTENT_DELALLOC, 1, cached_state);
 	if (!ret) {
-		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+		unlock_extent_cached(tree, delalloc_start, delalloc_end,
+				     &cached_state, GFP_NOFS);
 		__unlock_for_delalloc(inode, locked_page,
 			      delalloc_start, delalloc_end);
 		cond_resched();
 		goto again;
 	}
+	free_extent_state(cached_state);
 	*start = delalloc_start;
 	*end = delalloc_end;
 out_failed:
@@ -1307,7 +1359,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 				int clear_unlock,
 				int clear_delalloc, int clear_dirty,
 				int set_writeback,
-				int end_writeback)
+				int end_writeback,
+				int set_private2)
 {
 	int ret;
 	struct page *pages[16];
@@ -1325,8 +1378,9 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 	if (clear_delalloc)
 		clear_bits |= EXTENT_DELALLOC;
 
-	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
-	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
+	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
+	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback ||
+	      set_private2))
 		return 0;
 
 	while (nr_pages > 0) {
@@ -1334,6 +1388,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 				     min_t(unsigned long,
 				     nr_pages, ARRAY_SIZE(pages)), pages);
 		for (i = 0; i < ret; i++) {
+
+			if (set_private2)
+				SetPagePrivate2(pages[i]);
+
 			if (pages[i] == locked_page) {
 				page_cache_release(pages[i]);
 				continue;
@@ -1476,14 +1534,17 @@ out:
  * range is found set.
  */
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   int bits, int filled)
+		   int bits, int filled, struct extent_state *cached)
 {
 	struct extent_state *state = NULL;
 	struct rb_node *node;
 	int bitset = 0;
 
 	spin_lock(&tree->lock);
-	node = tree_search(tree, start);
+	if (cached && cached->tree && cached->start == start)
+		node = &cached->rb_node;
+	else
+		node = tree_search(tree, start);
 	while (node && start <= end) {
 		state = rb_entry(node, struct extent_state, rb_node);
 
@@ -1503,6 +1564,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 			bitset = 0;
 			break;
 		}
+
+		if (state->end == (u64)-1)
+			break;
+
 		start = state->end + 1;
 		if (start > end)
 			break;
@@ -1526,7 +1591,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,
 {
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
 		SetPageUptodate(page);
 	return 0;
 }
@@ -1540,7 +1605,7 @@ static int check_page_locked(struct extent_io_tree *tree,
 {
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
+	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
 		unlock_page(page);
 	return 0;
 }
@@ -1552,10 +1617,7 @@ static int check_page_locked(struct extent_io_tree *tree,
 static int check_page_writeback(struct extent_io_tree *tree,
 			     struct page *page)
 {
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
-		end_page_writeback(page);
+	end_page_writeback(page);
 	return 0;
 }
 
@@ -1613,13 +1675,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 		}
 
 		if (!uptodate) {
-			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			clear_extent_uptodate(tree, start, end, GFP_NOFS);
 			ClearPageUptodate(page);
 			SetPageError(page);
 		}
 
-		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
-
 		if (whole_page)
 			end_page_writeback(page);
 		else
@@ -1983,7 +2043,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			continue;
 		}
 		/* the get_extent function already copied into the page */
-		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+		if (test_range_bit(tree, cur, cur_end,
+				   EXTENT_UPTODATE, 1, NULL)) {
 			check_page_uptodate(tree, page);
 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
 			cur = cur + iosize;
@@ -2078,6 +2139,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 iosize;
 	u64 unlock_start;
 	sector_t sector;
+	struct extent_state *cached_state = NULL;
 	struct extent_map *em;
 	struct block_device *bdev;
 	int ret;
@@ -2124,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	delalloc_end = 0;
 	page_started = 0;
 	if (!epd->extent_locked) {
+		u64 delalloc_to_write = 0;
 		/*
 		 * make sure the wbc mapping index is at least updated
 		 * to this page.
@@ -2143,8 +2206,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			tree->ops->fill_delalloc(inode, page, delalloc_start,
 						 delalloc_end, &page_started,
 						 &nr_written);
+			/*
+			 * delalloc_end is already one less than the total
+			 * length, so we don't subtract one from
+			 * PAGE_CACHE_SIZE
+			 */
+			delalloc_to_write += (delalloc_end - delalloc_start +
+					      PAGE_CACHE_SIZE) >>
+					      PAGE_CACHE_SHIFT;
 			delalloc_start = delalloc_end + 1;
 		}
+		if (wbc->nr_to_write < delalloc_to_write) {
+			int thresh = 8192;
+
+			if (delalloc_to_write < thresh * 2)
+				thresh = delalloc_to_write;
+			wbc->nr_to_write = min_t(u64, delalloc_to_write,
+						 thresh);
+		}
 
 		/* did the fill delalloc function already unlock and start
 		 * the IO?
@@ -2160,15 +2239,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			goto done_unlocked;
 		}
 	}
-	lock_extent(tree, start, page_end, GFP_NOFS);
-
-	unlock_start = start;
-
 	if (tree->ops && tree->ops->writepage_start_hook) {
 		ret = tree->ops->writepage_start_hook(page, start,
 						      page_end);
 		if (ret == -EAGAIN) {
-			unlock_extent(tree, start, page_end, GFP_NOFS);
 			redirty_page_for_writepage(wbc, page);
 			update_nr_written(page, wbc, nr_written);
 			unlock_page(page);
@@ -2184,12 +2258,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	update_nr_written(page, wbc, nr_written + 1);
 
 	end = page_end;
-	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
-		printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
-
 	if (last_byte <= start) {
-		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
-		unlock_extent(tree, start, page_end, GFP_NOFS);
 		if (tree->ops && tree->ops->writepage_end_io_hook)
 			tree->ops->writepage_end_io_hook(page, start,
 							 page_end, NULL, 1);
@@ -2197,13 +2266,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		goto done;
 	}
 
-	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
 	blocksize = inode->i_sb->s_blocksize;
 
 	while (cur <= end) {
 		if (cur >= last_byte) {
-			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
-			unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
 			if (tree->ops && tree->ops->writepage_end_io_hook)
 				tree->ops->writepage_end_io_hook(page, cur,
 							 page_end, NULL, 1);
@@ -2235,12 +2301,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		 */
 		if (compressed || block_start == EXTENT_MAP_HOLE ||
 		    block_start == EXTENT_MAP_INLINE) {
-			clear_extent_dirty(tree, cur,
-					   cur + iosize - 1, GFP_NOFS);
-
-			unlock_extent(tree, unlock_start, cur + iosize - 1,
-				      GFP_NOFS);
-
 			/*
 			 * end_io notification does not happen here for
 			 * compressed extents
@@ -2265,13 +2325,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		}
 		/* leave this out until we have a page_mkwrite call */
 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
-				   EXTENT_DIRTY, 0)) {
+				   EXTENT_DIRTY, 0, NULL)) {
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
 		}
 
-		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
 		if (tree->ops && tree->ops->writepage_io_hook) {
 			ret = tree->ops->writepage_io_hook(page, cur,
 						cur + iosize - 1);
@@ -2309,12 +2368,12 @@ done:
 		set_page_writeback(page);
 		end_page_writeback(page);
 	}
-	if (unlock_start <= page_end)
-		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
 	unlock_page(page);
 
 done_unlocked:
 
+	/* drop our reference on any cached states */
+	free_extent_state(cached_state);
 	return 0;
 }
 
@@ -2339,9 +2398,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 			     writepage_t writepage, void *data,
 			     void (*flush_fn)(void *))
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int ret = 0;
 	int done = 0;
+	int nr_to_write_done = 0;
 	struct pagevec pvec;
 	int nr_pages;
 	pgoff_t index;
@@ -2361,7 +2420,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 		scanned = 1;
 	}
 retry:
-	while (!done && (index <= end) &&
+	while (!done && !nr_to_write_done && (index <= end) &&
 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 			      PAGECACHE_TAG_DIRTY, min(end - index,
 				  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
@@ -2412,12 +2471,15 @@ retry:
 				unlock_page(page);
 				ret = 0;
 			}
-			if (ret || wbc->nr_to_write <= 0)
-				done = 1;
-			if (wbc->nonblocking && bdi_write_congested(bdi)) {
-				wbc->encountered_congestion = 1;
+			if (ret)
 				done = 1;
-			}
+
+			/*
+			 * the filesystem may choose to bump up nr_to_write.
+			 * We have to make sure to honor the new nr_to_write
+			 * at any time
+			 */
+			nr_to_write_done = wbc->nr_to_write <= 0;
 		}
 		pagevec_release(&pvec);
 		cond_resched();
@@ -2604,10 +2666,10 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 		return 0;
 
 	lock_extent(tree, start, end, GFP_NOFS);
-	wait_on_extent_writeback(tree, start, end);
+	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
-			 1, 1, GFP_NOFS);
+			 1, 1, NULL, GFP_NOFS);
 	return 0;
 }
 
@@ -2687,7 +2749,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
 		    !isnew && !PageUptodate(page) &&
 		    (block_off_end > to || block_off_start < from) &&
 		    !test_range_bit(tree, block_start, cur_end,
-				    EXTENT_UPTODATE, 1)) {
+				    EXTENT_UPTODATE, 1, NULL)) {
 			u64 sector;
 			u64 extent_offset = block_start - em->start;
 			size_t iosize;
@@ -2701,7 +2763,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
 			 */
 			set_extent_bit(tree, block_start,
 				       block_start + iosize - 1,
-				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+				       EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset, em->bdev,
 					 NULL, 1,
@@ -2742,13 +2804,18 @@ int try_release_extent_state(struct extent_map_tree *map,
 	int ret = 1;
 
 	if (test_range_bit(tree, start, end,
-			   EXTENT_IOBITS | EXTENT_ORDERED, 0))
+			   EXTENT_IOBITS, 0, NULL))
 		ret = 0;
 	else {
 		if ((mask & GFP_NOFS) == GFP_NOFS)
 			mask = GFP_NOFS;
-		clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
-				 1, 1, mask);
+		/*
+		 * at this point we can safely clear everything except the
+		 * locked bit and the nodatasum bit
+		 */
+		clear_extent_bit(tree, start, end,
+				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
+				 0, 0, NULL, mask);
 	}
 	return ret;
 }
@@ -2771,29 +2838,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 		u64 len;
 		while (start <= end) {
 			len = end - start + 1;
-			spin_lock(&map->lock);
+			write_lock(&map->lock);
 			em = lookup_extent_mapping(map, start, len);
 			if (!em || IS_ERR(em)) {
-				spin_unlock(&map->lock);
+				write_unlock(&map->lock);
 				break;
 			}
 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
 			    em->start != start) {
-				spin_unlock(&map->lock);
+				write_unlock(&map->lock);
 				free_extent_map(em);
 				break;
 			}
 			if (!test_range_bit(tree, em->start,
 					    extent_map_end(em) - 1,
-					    EXTENT_LOCKED | EXTENT_WRITEBACK |
-					    EXTENT_ORDERED,
-					    0)) {
+					    EXTENT_LOCKED | EXTENT_WRITEBACK,
+					    0, NULL)) {
 				remove_extent_mapping(map, em);
 				/* once for the rb tree */
 				free_extent_map(em);
 			}
 			start = extent_map_end(em);
-			spin_unlock(&map->lock);
+			write_unlock(&map->lock);
 
 			/* once for us */
 			free_extent_map(em);
@@ -3203,7 +3269,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 	int uptodate;
 	unsigned long index;
 
-	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
+	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
 	if (ret)
 		return 1;
 	while (start <= end) {
@@ -3233,7 +3299,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			   EXTENT_UPTODATE, 1);
+			   EXTENT_UPTODATE, 1, NULL);
 	if (ret)
 		return ret;
 
@@ -3269,7 +3335,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 		return 0;
 
 	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			   EXTENT_UPTODATE, 1)) {
+			   EXTENT_UPTODATE, 1, NULL)) {
 		return 0;
 	}
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5bc20abf3f3..14ed16fd862 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -13,10 +13,8 @@
 #define EXTENT_DEFRAG (1 << 6)
 #define EXTENT_DEFRAG_DONE (1 << 7)
 #define EXTENT_BUFFER_FILLED (1 << 8)
-#define EXTENT_ORDERED (1 << 9)
-#define EXTENT_ORDERED_METADATA (1 << 10)
-#define EXTENT_BOUNDARY (1 << 11)
-#define EXTENT_NODATASUM (1 << 12)
+#define EXTENT_BOUNDARY (1 << 9)
+#define EXTENT_NODATASUM (1 << 10)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /* flags for bio submission */
@@ -142,6 +140,8 @@ int try_release_extent_state(struct extent_map_tree *map,
 			     struct extent_io_tree *tree, struct page *page,
 			     gfp_t mask);
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		     int bits, struct extent_state **cached, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		    gfp_t mask);
@@ -155,11 +155,12 @@ u64 count_range_bits(struct extent_io_tree *tree,
 		     u64 max_bytes, unsigned long bits);
 
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   int bits, int filled);
+		   int bits, int filled, struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      int bits, gfp_t mask);
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, int wake, int delete, gfp_t mask);
+		     int bits, int wake, int delete, struct extent_state **cached,
+		     gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
@@ -282,5 +283,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 				int clear_unlock,
 				int clear_delalloc, int clear_dirty,
 				int set_writeback,
-				int end_writeback);
+				int end_writeback,
+				int set_private2);
 #endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 30c9365861e..2c726b7b9fa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -36,7 +36,7 @@ void extent_map_exit(void)
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 {
 	tree->map.rb_node = NULL;
-	spin_lock_init(&tree->lock);
+	rwlock_init(&tree->lock);
 }
 
 /**
@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	return 0;
 }
 
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+{
+	int ret = 0;
+	struct extent_map *merge = NULL;
+	struct rb_node *rb;
+	struct extent_map *em;
+
+	write_lock(&tree->lock);
+	em = lookup_extent_mapping(tree, start, len);
+
+	WARN_ON(em->start != start || !em);
+
+	if (!em)
+		goto out;
+
+	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	if (em->start != 0) {
+		rb = rb_prev(&em->rb_node);
+		if (rb)
+			merge = rb_entry(rb, struct extent_map, rb_node);
+		if (rb && mergable_maps(merge, em)) {
+			em->start = merge->start;
+			em->len += merge->len;
+			em->block_len += merge->block_len;
+			em->block_start = merge->block_start;
+			merge->in_tree = 0;
+			rb_erase(&merge->rb_node, &tree->map);
+			free_extent_map(merge);
+		}
+	}
+
+	rb = rb_next(&em->rb_node);
+	if (rb)
+		merge = rb_entry(rb, struct extent_map, rb_node);
+	if (rb && mergable_maps(em, merge)) {
+		em->len += merge->len;
+		em->block_len += merge->len;
+		rb_erase(&merge->rb_node, &tree->map);
+		merge->in_tree = 0;
+		free_extent_map(merge);
+	}
+
+	free_extent_map(em);
+out:
+	write_unlock(&tree->lock);
+	return ret;
+
+}
+
 /**
  * add_extent_mapping - add new extent map to the extent tree
  * @tree:	tree to insert new map in
@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		ret = -EEXIST;
 		goto out;
 	}
-	assert_spin_locked(&tree->lock);
 	rb = tree_insert(&tree->map, em->start, &em->rb_node);
 	if (rb) {
 		ret = -EEXIST;
@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 	struct rb_node *next = NULL;
 	u64 end = range_end(start, len);
 
-	assert_spin_locked(&tree->lock);
 	rb_node = __tree_search(&tree->map, start, &prev, &next);
 	if (!rb_node && prev) {
 		em = rb_entry(prev, struct extent_map, rb_node);
@@ -319,6 +367,54 @@ out:
 }
 
 /**
+ * search_extent_mapping - find a nearby extent map
+ * @tree:	tree to lookup in
+ * @start:	byte offset to start the search
+ * @len:	length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.
+ *
+ * If one can't be found, any nearby extent may be returned
+ */
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 len)
+{
+	struct extent_map *em;
+	struct rb_node *rb_node;
+	struct rb_node *prev = NULL;
+	struct rb_node *next = NULL;
+
+	rb_node = __tree_search(&tree->map, start, &prev, &next);
+	if (!rb_node && prev) {
+		em = rb_entry(prev, struct extent_map, rb_node);
+		goto found;
+	}
+	if (!rb_node && next) {
+		em = rb_entry(next, struct extent_map, rb_node);
+		goto found;
+	}
+	if (!rb_node) {
+		em = NULL;
+		goto out;
+	}
+	if (IS_ERR(rb_node)) {
+		em = ERR_PTR(PTR_ERR(rb_node));
+		goto out;
+	}
+	em = rb_entry(rb_node, struct extent_map, rb_node);
+	goto found;
+
+	em = NULL;
+	goto out;
+
+found:
+	atomic_inc(&em->refs);
+out:
+	return em;
+}
+
+/**
  * remove_extent_mapping - removes an extent_map from the extent tree
  * @tree:	extent tree to remove from
  * @em:		extent map beeing removed
@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 	int ret = 0;
 
 	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
-	assert_spin_locked(&tree->lock);
 	rb_erase(&em->rb_node, &tree->map);
 	em->in_tree = 0;
 	return ret;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index fb6eeef06bb..ab6d74b6e64 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -31,7 +31,7 @@ struct extent_map {
 
 struct extent_map_tree {
 	struct rb_root map;
-	spinlock_t lock;
+	rwlock_t lock;
 };
 
 static inline u64 extent_map_end(struct extent_map *em)
@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask);
 void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
 void extent_map_exit(void);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 len);
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4b833972273..571ad3c13b4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	int err = 0;
 	int i;
 	struct inode *inode = fdentry(file)->d_inode;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	u64 hint_byte;
 	u64 num_bytes;
 	u64 start_pos;
 	u64 end_of_last_block;
@@ -125,22 +123,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	end_of_last_block = start_pos + num_bytes - 1;
-
-	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-	trans = btrfs_join_transaction(root, 1);
-	if (!trans) {
-		err = -ENOMEM;
-		goto out_unlock;
-	}
-	btrfs_set_trans_block_group(trans, inode);
-	hint_byte = 0;
-
-	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-
-	/* check for reserved extents on each page, we don't want
-	 * to reset the delalloc bit on things that already have
-	 * extents reserved.
-	 */
 	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = pages[i];
@@ -155,9 +137,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		 * at this time.
 		 */
 	}
-	err = btrfs_end_transaction(trans, root);
-out_unlock:
-	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 	return err;
 }
 
@@ -189,18 +168,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		if (!split2)
 			split2 = alloc_extent_map(GFP_NOFS);
 
-		spin_lock(&em_tree->lock);
+		write_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, len);
 		if (!em) {
-			spin_unlock(&em_tree->lock);
+			write_unlock(&em_tree->lock);
 			break;
 		}
 		flags = em->flags;
 		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
-			spin_unlock(&em_tree->lock);
 			if (em->start <= start &&
 			    (!testend || em->start + em->len >= start + len)) {
 				free_extent_map(em);
+				write_unlock(&em_tree->lock);
 				break;
 			}
 			if (start < em->start) {
@@ -210,6 +189,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 				start = em->start + em->len;
 			}
 			free_extent_map(em);
+			write_unlock(&em_tree->lock);
 			continue;
 		}
 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -260,7 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			free_extent_map(split);
 			split = NULL;
 		}
-		spin_unlock(&em_tree->lock);
+		write_unlock(&em_tree->lock);
 
 		/* once for us */
 		free_extent_map(em);
@@ -289,7 +269,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 locked_end,
-		       u64 inline_limit, u64 *hint_byte)
+		       u64 inline_limit, u64 *hint_byte, int drop_cache)
 {
 	u64 extent_end = 0;
 	u64 search_start = start;
@@ -314,7 +294,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int ret;
 
 	inline_limit = 0;
-	btrfs_drop_extent_cache(inode, start, end - 1, 0);
+	if (drop_cache)
+		btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
 	path = btrfs_alloc_path();
 	if (!path)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5edcee3a617..5c2caad7621 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 
 static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
 {
-	u64 max_bytes, possible_bytes;
+	u64 max_bytes;
+	u64 bitmap_bytes;
+	u64 extent_bytes;
 
 	/*
 	 * The goal is to keep the total amount of memory used per 1gb of space
@@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
 	max_bytes = MAX_CACHE_BYTES_PER_GIG *
 		(div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
 
-	possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
-		(sizeof(struct btrfs_free_space) *
-		 block_group->extents_thresh);
+	/*
+	 * we want to account for 1 more bitmap than what we have so we can make
+	 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
+	 * we add more bitmaps.
+	 */
+	bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
 
-	if (possible_bytes > max_bytes) {
-		int extent_bytes = max_bytes -
-			(block_group->total_bitmaps * PAGE_CACHE_SIZE);
+	if (bitmap_bytes >= max_bytes) {
+		block_group->extents_thresh = 0;
+		return;
+	}
 
-		if (extent_bytes <= 0) {
-			block_group->extents_thresh = 0;
-			return;
-		}
+	/*
+	 * we want the extent entry threshold to always be at most 1/2 the maxw
+	 * bytes we can have, or whatever is less than that.
+	 */
+	extent_bytes = max_bytes - bitmap_bytes;
+	extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
 
-		block_group->extents_thresh = extent_bytes /
-			(sizeof(struct btrfs_free_space));
-	}
+	block_group->extents_thresh =
+		div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
 }
 
 static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
@@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
 	BUG_ON(block_group->total_bitmaps >= max_bitmaps);
 
 	info->offset = offset_to_bitmap(block_group, offset);
+	info->bytes = 0;
 	link_free_space(block_group, info);
 	block_group->total_bitmaps++;
 
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 6b627c61180..72ce3c173d6 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 		ptr = (unsigned long)(ref + 1);
 		ret = 0;
 	} else if (ret < 0) {
+		if (ret == -EOVERFLOW)
+			ret = -EMLINK;
 		goto out;
 	} else {
 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      sizeof(struct btrfs_inode_item));
-	if (ret == 0 && objectid > root->highest_inode)
-		root->highest_inode = objectid;
 	return ret;
 }
 
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 9abbced1123..c56eb590917 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
 		slot = path->slots[0] - 1;
 		l = path->nodes[0];
 		btrfs_item_key_to_cpu(l, &found_key, slot);
-		*objectid = found_key.objectid;
+		*objectid = max_t(u64, found_key.objectid,
+				  BTRFS_FIRST_FREE_OBJECTID - 1);
 	} else {
-		*objectid = BTRFS_FIRST_FREE_OBJECTID;
+		*objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
 	}
 	ret = 0;
 error:
@@ -53,91 +54,27 @@ error:
 	return ret;
 }
 
-/*
- * walks the btree of allocated inodes and find a hole.
- */
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 dirid, u64 *objectid)
 {
-	struct btrfs_path *path;
-	struct btrfs_key key;
 	int ret;
-	int slot = 0;
-	u64 last_ino = 0;
-	int start_found;
-	struct extent_buffer *l;
-	struct btrfs_key search_key;
-	u64 search_start = dirid;
-
 	mutex_lock(&root->objectid_mutex);
-	if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
-	    root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
-		*objectid = ++root->last_inode_alloc;
-		mutex_unlock(&root->objectid_mutex);
-		return 0;
-	}
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
-	search_key.objectid = search_start;
-	search_key.type = 0;
-	search_key.offset = 0;
-
-	start_found = 0;
-	ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
-	if (ret < 0)
-		goto error;
 
-	while (1) {
-		l = path->nodes[0];
-		slot = path->slots[0];
-		if (slot >= btrfs_header_nritems(l)) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret == 0)
-				continue;
-			if (ret < 0)
-				goto error;
-			if (!start_found) {
-				*objectid = search_start;
-				start_found = 1;
-				goto found;
-			}
-			*objectid = last_ino > search_start ?
-				last_ino : search_start;
-			goto found;
-		}
-		btrfs_item_key_to_cpu(l, &key, slot);
-		if (key.objectid >= search_start) {
-			if (start_found) {
-				if (last_ino < search_start)
-					last_ino = search_start;
-				if (key.objectid > last_ino) {
-					*objectid = last_ino;
-					goto found;
-				}
-			} else if (key.objectid > search_start) {
-				*objectid = search_start;
-				goto found;
-			}
-		}
-		if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
-			break;
+	if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
+		ret = btrfs_find_highest_inode(root, &root->highest_objectid);
+		if (ret)
+			goto out;
+	}
 
-		start_found = 1;
-		last_ino = key.objectid + 1;
-		path->slots[0]++;
+	if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+		ret = -ENOSPC;
+		goto out;
 	}
-	BUG_ON(1);
-found:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
-	BUG_ON(*objectid < search_start);
-	mutex_unlock(&root->objectid_mutex);
-	return 0;
-error:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
+
+	*objectid = ++root->highest_objectid;
+	ret = 0;
+out:
 	mutex_unlock(&root->objectid_mutex);
 	return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9096fd0ca3c..e9b76bcd1c1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	}
 
 	ret = btrfs_drop_extents(trans, root, inode, start,
-				 aligned_end, aligned_end, start, &hint_byte);
+				 aligned_end, aligned_end, start,
+				 &hint_byte, 1);
 	BUG_ON(ret);
 
 	if (isize > actual_end)
@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 				   inline_len, compressed_size,
 				   compressed_pages);
 	BUG_ON(ret);
-	btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 	return 0;
 }
 
@@ -425,7 +426,7 @@ again:
 			extent_clear_unlock_delalloc(inode,
 						     &BTRFS_I(inode)->io_tree,
 						     start, end, NULL, 1, 0,
-						     0, 1, 1, 1);
+						     0, 1, 1, 1, 0);
 			ret = 0;
 			goto free_pages_out;
 		}
@@ -611,9 +612,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 
 		while (1) {
-			spin_lock(&em_tree->lock);
+			write_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
-			spin_unlock(&em_tree->lock);
+			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
 				break;
@@ -640,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 					     async_extent->start,
 					     async_extent->start +
 					     async_extent->ram_size - 1,
-					     NULL, 1, 1, 0, 1, 1, 0);
+					     NULL, 1, 1, 0, 1, 1, 0, 0);
 
 		ret = btrfs_submit_compressed_write(inode,
 				    async_extent->start,
@@ -713,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode,
 			extent_clear_unlock_delalloc(inode,
 						     &BTRFS_I(inode)->io_tree,
 						     start, end, NULL, 1, 1,
-						     1, 1, 1, 1);
+						     1, 1, 1, 1, 0);
 			*nr_written = *nr_written +
 			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
 			*page_started = 1;
@@ -725,6 +726,15 @@ static noinline int cow_file_range(struct inode *inode,
 	BUG_ON(disk_num_bytes >
 	       btrfs_super_total_bytes(&root->fs_info->super_copy));
 
+
+	read_lock(&BTRFS_I(inode)->extent_tree.lock);
+	em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
+				   start, num_bytes);
+	if (em) {
+		alloc_hint = em->block_start;
+		free_extent_map(em);
+	}
+	read_unlock(&BTRFS_I(inode)->extent_tree.lock);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
 	while (disk_num_bytes > 0) {
@@ -737,7 +747,6 @@ static noinline int cow_file_range(struct inode *inode,
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = start;
 		em->orig_start = em->start;
-
 		ram_size = ins.offset;
 		em->len = ins.offset;
 
@@ -747,9 +756,9 @@ static noinline int cow_file_range(struct inode *inode,
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
 		while (1) {
-			spin_lock(&em_tree->lock);
+			write_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
-			spin_unlock(&em_tree->lock);
+			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
 				break;
@@ -776,11 +785,14 @@ static noinline int cow_file_range(struct inode *inode,
 		/* we're not doing compressed IO, don't unlock the first
 		 * page (which the caller expects to stay locked), don't
 		 * clear any dirty bits and don't set any writeback bits
+		 *
+		 * Do set the Private2 bit so we know this page was properly
+		 * setup for writepage
 		 */
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 					     start, start + ram_size - 1,
 					     locked_page, unlock, 1,
-					     1, 0, 0, 0);
+					     1, 0, 0, 0, 1);
 		disk_num_bytes -= cur_alloc_size;
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
@@ -853,7 +865,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	int limit = 10 * 1024 * 1042;
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
-			 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
+			 EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);
 	while (start < end) {
 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
 		async_cow->inode = inode;
@@ -1080,9 +1092,9 @@ out_check:
 			em->bdev = root->fs_info->fs_devices->latest_bdev;
 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
 			while (1) {
-				spin_lock(&em_tree->lock);
+				write_lock(&em_tree->lock);
 				ret = add_extent_mapping(em_tree, em);
-				spin_unlock(&em_tree->lock);
+				write_unlock(&em_tree->lock);
 				if (ret != -EEXIST) {
 					free_extent_map(em);
 					break;
@@ -1101,7 +1113,7 @@ out_check:
 
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 					cur_offset, cur_offset + num_bytes - 1,
-					locked_page, 1, 1, 1, 0, 0, 0);
+					locked_page, 1, 1, 1, 0, 0, 0, 1);
 		cur_offset = extent_end;
 		if (cur_offset > end)
 			break;
@@ -1374,10 +1386,8 @@ again:
 	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
 
 	/* already ordered? We're done */
-	if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			     EXTENT_ORDERED, 0)) {
+	if (PagePrivate2(page))
 		goto out;
-	}
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
@@ -1413,11 +1423,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 	struct inode *inode = page->mapping->host;
 	struct btrfs_writepage_fixup *fixup;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
 
-	ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
-			     EXTENT_ORDERED, 0);
-	if (ret)
+	/* this page is properly in the ordered list */
+	if (TestClearPagePrivate2(page))
 		return 0;
 
 	if (PageChecked(page))
@@ -1455,9 +1463,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(!path);
 
 	path->leave_spinning = 1;
+
+	/*
+	 * we may be replacing one extent in the tree with another.
+	 * The new extent is pinned in the extent map, and we don't want
+	 * to drop it from the cache until it is completely in the btree.
+	 *
+	 * So, tell btrfs_drop_extents to leave this extent in the cache.
+	 * the caller is expected to unpin it and allow it to be merged
+	 * with the others.
+	 */
 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
 				 file_pos + num_bytes, locked_end,
-				 file_pos, &hint);
+				 file_pos, &hint, 0);
 	BUG_ON(ret);
 
 	ins.objectid = inode->i_ino;
@@ -1485,7 +1503,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 
 	inode_add_bytes(inode, num_bytes);
-	btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
 
 	ins.objectid = disk_bytenr;
 	ins.offset = disk_num_bytes;
@@ -1596,6 +1613,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 						ordered_extent->len,
 						compressed, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
+		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+				   ordered_extent->file_offset,
+				   ordered_extent->len);
 		BUG_ON(ret);
 	}
 	unlock_extent(io_tree, ordered_extent->file_offset,
@@ -1623,6 +1643,7 @@ nocow:
 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 				struct extent_state *state, int uptodate)
 {
+	ClearPagePrivate2(page);
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
@@ -1669,13 +1690,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 		failrec->last_mirror = 0;
 		failrec->bio_flags = 0;
 
-		spin_lock(&em_tree->lock);
+		read_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, failrec->len);
 		if (em->start > start || em->start + em->len < start) {
 			free_extent_map(em);
 			em = NULL;
 		}
-		spin_unlock(&em_tree->lock);
+		read_unlock(&em_tree->lock);
 
 		if (!em || IS_ERR(em)) {
 			kfree(failrec);
@@ -1794,7 +1815,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		return 0;
 
 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
-	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
+	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
 		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
 				  GFP_NOFS);
 		return 0;
@@ -2352,6 +2373,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	return ret;
 }
 
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct inode *dir, u64 objectid,
+			const char *name, int name_len)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	u64 index;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+				   name, name_len, -1);
+	BUG_ON(!di || IS_ERR(di));
+
+	leaf = path->nodes[0];
+	btrfs_dir_item_key_to_cpu(leaf, di, &key);
+	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+	BUG_ON(ret);
+	btrfs_release_path(root, path);
+
+	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+				 objectid, root->root_key.objectid,
+				 dir->i_ino, &index, name, name_len);
+	if (ret < 0) {
+		BUG_ON(ret != -ENOENT);
+		di = btrfs_search_dir_index_item(root, path, dir->i_ino,
+						 name, name_len);
+		BUG_ON(!di || IS_ERR(di));
+
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		btrfs_release_path(root, path);
+		index = key.offset;
+	}
+
+	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+					 index, name, name_len, -1);
+	BUG_ON(!di || IS_ERR(di));
+
+	leaf = path->nodes[0];
+	btrfs_dir_item_key_to_cpu(leaf, di, &key);
+	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+	BUG_ON(ret);
+	btrfs_release_path(root, path);
+
+	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	ret = btrfs_update_inode(trans, root, dir);
+	BUG_ON(ret);
+	dir->i_sb->s_dirt = 1;
+
+	btrfs_free_path(path);
+	return 0;
+}
+
 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
@@ -2361,29 +2445,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	unsigned long nr = 0;
 
-	/*
-	 * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
-	 * the root of a subvolume or snapshot
-	 */
 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
-	    inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+	    inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		return -ENOTEMPTY;
-	}
 
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
+	if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+		err = btrfs_unlink_subvol(trans, root, dir,
+					  BTRFS_I(inode)->location.objectid,
+					  dentry->d_name.name,
+					  dentry->d_name.len);
+		goto out;
+	}
+
 	err = btrfs_orphan_add(trans, inode);
 	if (err)
-		goto fail_trans;
+		goto out;
 
 	/* now the directory is empty */
 	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
 				 dentry->d_name.name, dentry->d_name.len);
 	if (!err)
 		btrfs_i_size_write(inode, 0);
-
-fail_trans:
+out:
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction_throttle(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
@@ -2935,7 +3021,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 						 cur_offset,
 						 cur_offset + hole_size,
 						 block_end,
-						 cur_offset, &hint_byte);
+						 cur_offset, &hint_byte, 1);
 			if (err)
 				break;
 			err = btrfs_insert_file_extent(trans, root,
@@ -3003,6 +3089,11 @@ void btrfs_delete_inode(struct inode *inode)
 	}
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
+	if (inode->i_nlink > 0) {
+		BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+		goto no_delete;
+	}
+
 	btrfs_i_size_write(inode, 0);
 	trans = btrfs_join_transaction(root, 1);
 
@@ -3070,29 +3161,67 @@ out_err:
  * is kind of like crossing a mount point.
  */
 static int fixup_tree_root_location(struct btrfs_root *root,
-			     struct btrfs_key *location,
-			     struct btrfs_root **sub_root,
-			     struct dentry *dentry)
+				    struct inode *dir,
+				    struct dentry *dentry,
+				    struct btrfs_key *location,
+				    struct btrfs_root **sub_root)
 {
-	struct btrfs_root_item *ri;
+	struct btrfs_path *path;
+	struct btrfs_root *new_root;
+	struct btrfs_root_ref *ref;
+	struct extent_buffer *leaf;
+	int ret;
+	int err = 0;
 
-	if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
-		return 0;
-	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
-		return 0;
+	path = btrfs_alloc_path();
+	if (!path) {
+		err = -ENOMEM;
+		goto out;
+	}
 
-	*sub_root = btrfs_read_fs_root(root->fs_info, location,
-					dentry->d_name.name,
-					dentry->d_name.len);
-	if (IS_ERR(*sub_root))
-		return PTR_ERR(*sub_root);
+	err = -ENOENT;
+	ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
+				  BTRFS_I(dir)->root->root_key.objectid,
+				  location->objectid);
+	if (ret) {
+		if (ret < 0)
+			err = ret;
+		goto out;
+	}
 
-	ri = &(*sub_root)->root_item;
-	location->objectid = btrfs_root_dirid(ri);
-	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
-	location->offset = 0;
+	leaf = path->nodes[0];
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+	if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
+	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
+		goto out;
 
-	return 0;
+	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
+				   (unsigned long)(ref + 1),
+				   dentry->d_name.len);
+	if (ret)
+		goto out;
+
+	btrfs_release_path(root->fs_info->tree_root, path);
+
+	new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
+	if (IS_ERR(new_root)) {
+		err = PTR_ERR(new_root);
+		goto out;
+	}
+
+	if (btrfs_root_refs(&new_root->root_item) == 0) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	*sub_root = new_root;
+	location->objectid = btrfs_root_dirid(&new_root->root_item);
+	location->type = BTRFS_INODE_ITEM_KEY;
+	location->offset = 0;
+	err = 0;
+out:
+	btrfs_free_path(path);
+	return err;
 }
 
 static void inode_tree_add(struct inode *inode)
@@ -3101,11 +3230,13 @@ static void inode_tree_add(struct inode *inode)
 	struct btrfs_inode *entry;
 	struct rb_node **p;
 	struct rb_node *parent;
-
 again:
 	p = &root->inode_tree.rb_node;
 	parent = NULL;
 
+	if (hlist_unhashed(&inode->i_hash))
+		return;
+
 	spin_lock(&root->inode_lock);
 	while (*p) {
 		parent = *p;
@@ -3132,13 +3263,87 @@ again:
 static void inode_tree_del(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int empty = 0;
 
 	spin_lock(&root->inode_lock);
 	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
 		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
 		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+		empty = RB_EMPTY_ROOT(&root->inode_tree);
 	}
 	spin_unlock(&root->inode_lock);
+
+	if (empty && btrfs_root_refs(&root->root_item) == 0) {
+		synchronize_srcu(&root->fs_info->subvol_srcu);
+		spin_lock(&root->inode_lock);
+		empty = RB_EMPTY_ROOT(&root->inode_tree);
+		spin_unlock(&root->inode_lock);
+		if (empty)
+			btrfs_add_dead_root(root);
+	}
+}
+
+int btrfs_invalidate_inodes(struct btrfs_root *root)
+{
+	struct rb_node *node;
+	struct rb_node *prev;
+	struct btrfs_inode *entry;
+	struct inode *inode;
+	u64 objectid = 0;
+
+	WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+	spin_lock(&root->inode_lock);
+again:
+	node = root->inode_tree.rb_node;
+	prev = NULL;
+	while (node) {
+		prev = node;
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+		if (objectid < entry->vfs_inode.i_ino)
+			node = node->rb_left;
+		else if (objectid > entry->vfs_inode.i_ino)
+			node = node->rb_right;
+		else
+			break;
+	}
+	if (!node) {
+		while (prev) {
+			entry = rb_entry(prev, struct btrfs_inode, rb_node);
+			if (objectid <= entry->vfs_inode.i_ino) {
+				node = prev;
+				break;
+			}
+			prev = rb_next(prev);
+		}
+	}
+	while (node) {
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+		objectid = entry->vfs_inode.i_ino + 1;
+		inode = igrab(&entry->vfs_inode);
+		if (inode) {
+			spin_unlock(&root->inode_lock);
+			if (atomic_read(&inode->i_count) > 1)
+				d_prune_aliases(inode);
+			/*
+			 * btrfs_drop_inode will remove it from
+			 * the inode cache when its usage count
+			 * hits zero.
+			 */
+			iput(inode);
+			cond_resched();
+			spin_lock(&root->inode_lock);
+			goto again;
+		}
+
+		if (cond_resched_lock(&root->inode_lock))
+			goto again;
+
+		node = rb_next(node);
+	}
+	spin_unlock(&root->inode_lock);
+	return 0;
 }
 
 static noinline void init_btrfs_i(struct inode *inode)
@@ -3225,15 +3430,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 	return inode;
 }
 
+static struct inode *new_simple_dir(struct super_block *s,
+				    struct btrfs_key *key,
+				    struct btrfs_root *root)
+{
+	struct inode *inode = new_inode(s);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	init_btrfs_i(inode);
+
+	BTRFS_I(inode)->root = root;
+	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+	BTRFS_I(inode)->dummy_inode = 1;
+
+	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+	inode->i_op = &simple_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+	return inode;
+}
+
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode;
-	struct btrfs_inode *bi = BTRFS_I(dir);
-	struct btrfs_root *root = bi->root;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_root *sub_root = root;
 	struct btrfs_key location;
+	int index;
 	int ret;
 
+	dentry->d_op = &btrfs_dentry_operations;
+
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
@@ -3242,29 +3473,50 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 	if (ret < 0)
 		return ERR_PTR(ret);
 
-	inode = NULL;
-	if (location.objectid) {
-		ret = fixup_tree_root_location(root, &location, &sub_root,
-						dentry);
-		if (ret < 0)
-			return ERR_PTR(ret);
-		if (ret > 0)
-			return ERR_PTR(-ENOENT);
+	if (location.objectid == 0)
+		return NULL;
+
+	if (location.type == BTRFS_INODE_ITEM_KEY) {
+		inode = btrfs_iget(dir->i_sb, &location, root);
+		return inode;
+	}
+
+	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
+
+	index = srcu_read_lock(&root->fs_info->subvol_srcu);
+	ret = fixup_tree_root_location(root, dir, dentry,
+				       &location, &sub_root);
+	if (ret < 0) {
+		if (ret != -ENOENT)
+			inode = ERR_PTR(ret);
+		else
+			inode = new_simple_dir(dir->i_sb, &location, sub_root);
+	} else {
 		inode = btrfs_iget(dir->i_sb, &location, sub_root);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
 	}
+	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+
 	return inode;
 }
 
+static int btrfs_dentry_delete(struct dentry *dentry)
+{
+	struct btrfs_root *root;
+
+	if (!dentry->d_inode)
+		return 0;
+
+	root = BTRFS_I(dentry->d_inode)->root;
+	if (btrfs_root_refs(&root->root_item) == 0)
+		return 1;
+	return 0;
+}
+
 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 				   struct nameidata *nd)
 {
 	struct inode *inode;
 
-	if (dentry->d_name.len > BTRFS_NAME_LEN)
-		return ERR_PTR(-ENAMETOOLONG);
-
 	inode = btrfs_lookup_dentry(dir, dentry);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
@@ -3603,9 +3855,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (ret != 0)
 		goto fail;
 
-	if (objectid > root->highest_inode)
-		root->highest_inode = objectid;
-
 	inode->i_uid = current_fsuid();
 
 	if (dir && (dir->i_mode & S_ISGID)) {
@@ -3673,26 +3922,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 		   struct inode *parent_inode, struct inode *inode,
 		   const char *name, int name_len, int add_backref, u64 index)
 {
-	int ret;
+	int ret = 0;
 	struct btrfs_key key;
 	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
 
-	key.objectid = inode->i_ino;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-	key.offset = 0;
+	if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
+	} else {
+		key.objectid = inode->i_ino;
+		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+		key.offset = 0;
+	}
+
+	if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+					 key.objectid, root->root_key.objectid,
+					 parent_inode->i_ino,
+					 index, name, name_len);
+	} else if (add_backref) {
+		ret = btrfs_insert_inode_ref(trans, root,
+					     name, name_len, inode->i_ino,
+					     parent_inode->i_ino, index);
+	}
 
-	ret = btrfs_insert_dir_item(trans, root, name, name_len,
-				    parent_inode->i_ino,
-				    &key, btrfs_inode_type(inode),
-				    index);
 	if (ret == 0) {
-		if (add_backref) {
-			ret = btrfs_insert_inode_ref(trans, root,
-						     name, name_len,
-						     inode->i_ino,
-						     parent_inode->i_ino,
-						     index);
-		}
+		ret = btrfs_insert_dir_item(trans, root, name, name_len,
+					    parent_inode->i_ino, &key,
+					    btrfs_inode_type(inode), index);
+		BUG_ON(ret);
+
 		btrfs_i_size_write(parent_inode, parent_inode->i_size +
 				   name_len * 2);
 		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
@@ -3875,18 +4133,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 
 	err = btrfs_add_nondir(trans, dentry, inode, 1, index);
 
-	if (err)
-		drop_inode = 1;
-
-	btrfs_update_inode_block_group(trans, dir);
-	err = btrfs_update_inode(trans, root, inode);
-
-	if (err)
+	if (err) {
 		drop_inode = 1;
+	} else {
+		btrfs_update_inode_block_group(trans, dir);
+		err = btrfs_update_inode(trans, root, inode);
+		BUG_ON(err);
+		btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
+	}
 
 	nr = trans->blocks_used;
-
-	btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
 	btrfs_end_transaction_throttle(trans, root);
 fail:
 	if (drop_inode) {
@@ -4064,11 +4320,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	int compressed;
 
 again:
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
 	if (em)
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	if (em) {
 		if (em->start > start || em->start + em->len <= start)
@@ -4215,6 +4471,11 @@ again:
 				map = kmap(page);
 				read_extent_buffer(leaf, map + pg_offset, ptr,
 						   copy_size);
+				if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+					memset(map + pg_offset + copy_size, 0,
+					       PAGE_CACHE_SIZE - pg_offset -
+					       copy_size);
+				}
 				kunmap(page);
 			}
 			flush_dcache_page(page);
@@ -4259,7 +4520,7 @@ insert:
 	}
 
 	err = 0;
-	spin_lock(&em_tree->lock);
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	/* it is possible that someone inserted the extent into the tree
 	 * while we had the lock dropped.  It is also possible that
@@ -4299,7 +4560,7 @@ insert:
 			err = 0;
 		}
 	}
-	spin_unlock(&em_tree->lock);
+	write_unlock(&em_tree->lock);
 out:
 	if (path)
 		btrfs_free_path(path);
@@ -4398,13 +4659,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 	u64 page_start = page_offset(page);
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
+
+	/*
+	 * we have the page locked, so new writeback can't start,
+	 * and the dirty bit won't be cleared while we are here.
+	 *
+	 * Wait for IO on this page so that we can safely clear
+	 * the PagePrivate2 bit and do ordered accounting
+	 */
 	wait_on_page_writeback(page);
+
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	if (offset) {
 		btrfs_releasepage(page, GFP_NOFS);
 		return;
 	}
-
 	lock_extent(tree, page_start, page_end, GFP_NOFS);
 	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
 					   page_offset(page));
@@ -4415,16 +4684,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		 */
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_LOCKED, 1, 0, GFP_NOFS);
-		btrfs_finish_ordered_io(page->mapping->host,
-					page_start, page_end);
+				 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+		/*
+		 * whoever cleared the private bit is responsible
+		 * for the finish_ordered_io
+		 */
+		if (TestClearPagePrivate2(page)) {
+			btrfs_finish_ordered_io(page->mapping->host,
+						page_start, page_end);
+		}
 		btrfs_put_ordered_extent(ordered);
 		lock_extent(tree, page_start, page_end, GFP_NOFS);
 	}
 	clear_extent_bit(tree, page_start, page_end,
-		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_ORDERED,
-		 1, 1, GFP_NOFS);
+		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+		 1, 1, NULL, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
@@ -4521,11 +4795,14 @@ again:
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
+	SetPageUptodate(page);
 
 	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
+	if (!ret)
+		return VM_FAULT_LOCKED;
 	unlock_page(page);
 out:
 	return ret;
@@ -4594,11 +4871,11 @@ out:
  * create a new subvolume directory/inode (helper for the ioctl).
  */
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *new_root, struct dentry *dentry,
+			     struct btrfs_root *new_root,
 			     u64 new_dirid, u64 alloc_hint)
 {
 	struct inode *inode;
-	int error;
+	int err;
 	u64 index = 0;
 
 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
@@ -4611,11 +4888,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 	inode->i_nlink = 1;
 	btrfs_i_size_write(inode, 0);
 
-	error = btrfs_update_inode(trans, new_root, inode);
-	if (error)
-		return error;
+	err = btrfs_update_inode(trans, new_root, inode);
+	BUG_ON(err);
 
-	d_instantiate(dentry, inode);
+	iput(inode);
 	return 0;
 }
 
@@ -4693,6 +4969,16 @@ void btrfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
+void btrfs_drop_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
+		generic_delete_inode(inode);
+	else
+		generic_drop_inode(inode);
+}
+
 static void init_once(void *foo)
 {
 	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
@@ -4761,31 +5047,32 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
+	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct timespec ctime = CURRENT_TIME;
 	u64 index = 0;
+	u64 root_objectid;
 	int ret;
 
-	/* we're not allowed to rename between subvolumes */
-	if (BTRFS_I(old_inode)->root->root_key.objectid !=
-	    BTRFS_I(new_dir)->root->root_key.objectid)
+	if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+		return -EPERM;
+
+	/* we only allow rename subvolume link between subvolumes */
+	if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
 		return -EXDEV;
 
-	if (S_ISDIR(old_inode->i_mode) && new_inode &&
-	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
+	if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+	    (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
 		return -ENOTEMPTY;
-	}
 
-	/* to rename a snapshot or subvolume, we need to juggle the
-	 * backrefs.  This isn't coded yet
-	 */
-	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
-		return -EXDEV;
+	if (S_ISDIR(old_inode->i_mode) && new_inode &&
+	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+		return -ENOTEMPTY;
 
 	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
 	/*
 	 * we're using rename to replace one file with another.
@@ -4796,8 +5083,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
 		filemap_flush(old_inode->i_mapping);
 
+	/* close the racy window with snapshot create/destroy ioctl */
+	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+		down_read(&root->fs_info->subvol_sem);
+
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, new_dir);
+
+	if (dest != root)
+		btrfs_record_root_in_trans(trans, dest);
 
+	ret = btrfs_set_inode_index(new_dir, &index);
+	if (ret)
+		goto out_fail;
+
+	if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		/* force full log commit if subvolume involved. */
+		root->fs_info->last_trans_log_full_commit = trans->transid;
+	} else {
+		ret = btrfs_insert_inode_ref(trans, dest,
+					     new_dentry->d_name.name,
+					     new_dentry->d_name.len,
+					     old_inode->i_ino,
+					     new_dir->i_ino, index);
+		if (ret)
+			goto out_fail;
+		/*
+		 * this is an ugly little race, but the rename is required
+		 * to make sure that if we crash, the inode is either at the
+		 * old name or the new one.  pinning the log transaction lets
+		 * us make sure we don't allow a log commit to come in after
+		 * we unlink the name but before we add the new name back in.
+		 */
+		btrfs_pin_log_trans(root);
+	}
 	/*
 	 * make sure the inode gets flushed if it is replacing
 	 * something.
@@ -4807,18 +5126,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		btrfs_add_ordered_operation(trans, root, old_inode);
 	}
 
-	/*
-	 * this is an ugly little race, but the rename is required to make
-	 * sure that if we crash, the inode is either at the old name
-	 * or the new one.  pinning the log transaction lets us make sure
-	 * we don't allow a log commit to come in after we unlink the
-	 * name but before we add the new name back in.
-	 */
-	btrfs_pin_log_trans(root);
-
-	btrfs_set_trans_block_group(trans, new_dir);
-
-	btrfs_inc_nlink(old_dentry->d_inode);
 	old_dir->i_ctime = old_dir->i_mtime = ctime;
 	new_dir->i_ctime = new_dir->i_mtime = ctime;
 	old_inode->i_ctime = ctime;
@@ -4826,47 +5133,58 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (old_dentry->d_parent != new_dentry->d_parent)
 		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
 
-	ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
-				 old_dentry->d_name.name,
-				 old_dentry->d_name.len);
-	if (ret)
-		goto out_fail;
+	if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
+					old_dentry->d_name.name,
+					old_dentry->d_name.len);
+	} else {
+		btrfs_inc_nlink(old_dentry->d_inode);
+		ret = btrfs_unlink_inode(trans, root, old_dir,
+					 old_dentry->d_inode,
+					 old_dentry->d_name.name,
+					 old_dentry->d_name.len);
+	}
+	BUG_ON(ret);
 
 	if (new_inode) {
 		new_inode->i_ctime = CURRENT_TIME;
-		ret = btrfs_unlink_inode(trans, root, new_dir,
-					 new_dentry->d_inode,
-					 new_dentry->d_name.name,
-					 new_dentry->d_name.len);
-		if (ret)
-			goto out_fail;
+		if (unlikely(new_inode->i_ino ==
+			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+			root_objectid = BTRFS_I(new_inode)->location.objectid;
+			ret = btrfs_unlink_subvol(trans, dest, new_dir,
+						root_objectid,
+						new_dentry->d_name.name,
+						new_dentry->d_name.len);
+			BUG_ON(new_inode->i_nlink == 0);
+		} else {
+			ret = btrfs_unlink_inode(trans, dest, new_dir,
+						 new_dentry->d_inode,
+						 new_dentry->d_name.name,
+						 new_dentry->d_name.len);
+		}
+		BUG_ON(ret);
 		if (new_inode->i_nlink == 0) {
 			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
-			if (ret)
-				goto out_fail;
+			BUG_ON(ret);
 		}
-
 	}
-	ret = btrfs_set_inode_index(new_dir, &index);
-	if (ret)
-		goto out_fail;
 
-	ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
-			     old_inode, new_dentry->d_name.name,
-			     new_dentry->d_name.len, 1, index);
-	if (ret)
-		goto out_fail;
+	ret = btrfs_add_link(trans, new_dir, old_inode,
+			     new_dentry->d_name.name,
+			     new_dentry->d_name.len, 0, index);
+	BUG_ON(ret);
 
-	btrfs_log_new_name(trans, old_inode, old_dir,
-				       new_dentry->d_parent);
+	if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+		btrfs_log_new_name(trans, old_inode, old_dir,
+				   new_dentry->d_parent);
+		btrfs_end_log_trans(root);
+	}
 out_fail:
-
-	/* this btrfs_end_log_trans just allows the current
-	 * log-sub transaction to complete
-	 */
-	btrfs_end_log_trans(root);
 	btrfs_end_transaction_throttle(trans, root);
-out_unlock:
+
+	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+		up_read(&root->fs_info->subvol_sem);
 	return ret;
 }
 
@@ -5058,6 +5376,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
 						  0, 0, 0,
 						  BTRFS_FILE_EXTENT_PREALLOC);
 		BUG_ON(ret);
+		btrfs_drop_extent_cache(inode, cur_offset,
+					cur_offset + ins.offset -1, 0);
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
 		alloc_hint = ins.objectid + ins.offset;
@@ -5223,6 +5543,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
 	.lookup		= btrfs_lookup,
 	.permission	= btrfs_permission,
 };
+
 static struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
@@ -5269,6 +5590,7 @@ static const struct address_space_operations btrfs_aops = {
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage	= btrfs_releasepage,
 	.set_page_dirty	= btrfs_set_page_dirty,
+	.error_remove_page = generic_error_remove_page,
 };
 
 static const struct address_space_operations btrfs_symlink_aops = {
@@ -5309,3 +5631,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 };
+
+struct dentry_operations btrfs_dentry_operations = {
+	.d_delete	= btrfs_dentry_delete,
+};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bd88f25889f..a8577a7f26a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root,
 	struct btrfs_root_item root_item;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
-	struct btrfs_root *new_root = root;
-	struct inode *dir;
+	struct btrfs_root *new_root;
+	struct inode *dir = dentry->d_parent->d_inode;
 	int ret;
 	int err;
 	u64 objectid;
@@ -241,7 +241,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 
 	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
-		goto fail_commit;
+		return ret;
 
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
@@ -304,11 +304,17 @@ static noinline int create_subvol(struct btrfs_root *root,
 	if (ret)
 		goto fail;
 
+	key.offset = (u64)-1;
+	new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
+	BUG_ON(IS_ERR(new_root));
+
+	btrfs_record_root_in_trans(trans, new_root);
+
+	ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
+				       BTRFS_I(dir)->block_group);
 	/*
 	 * insert the directory item
 	 */
-	key.offset = (u64)-1;
-	dir = dentry->d_parent->d_inode;
 	ret = btrfs_set_inode_index(dir, &index);
 	BUG_ON(ret);
 
@@ -322,44 +328,18 @@ static noinline int create_subvol(struct btrfs_root *root,
 	ret = btrfs_update_inode(trans, root, dir);
 	BUG_ON(ret);
 
-	/* add the backref first */
 	ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
-				 objectid, BTRFS_ROOT_BACKREF_KEY,
-				 root->root_key.objectid,
+				 objectid, root->root_key.objectid,
 				 dir->i_ino, index, name, namelen);
 
 	BUG_ON(ret);
 
-	/* now add the forward ref */
-	ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
-				 root->root_key.objectid, BTRFS_ROOT_REF_KEY,
-				 objectid,
-				 dir->i_ino, index, name, namelen);
-
-	BUG_ON(ret);
-
-	ret = btrfs_commit_transaction(trans, root);
-	if (ret)
-		goto fail_commit;
-
-	new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
-	BUG_ON(!new_root);
-
-	trans = btrfs_start_transaction(new_root, 1);
-	BUG_ON(!trans);
-
-	ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
-				       BTRFS_I(dir)->block_group);
-	if (ret)
-		goto fail;
-
+	d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
 fail:
 	nr = trans->blocks_used;
-	err = btrfs_commit_transaction(trans, new_root);
+	err = btrfs_commit_transaction(trans, root);
 	if (err && !ret)
 		ret = err;
-fail_commit:
-	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
 
@@ -420,14 +400,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
  * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
  * inside this filesystem so it's quite a bit simpler.
  */
-static noinline int btrfs_mksubvol(struct path *parent, char *name,
-				   int mode, int namelen,
+static noinline int btrfs_mksubvol(struct path *parent,
+				   char *name, int namelen,
 				   struct btrfs_root *snap_src)
 {
+	struct inode *dir  = parent->dentry->d_inode;
 	struct dentry *dentry;
 	int error;
 
-	mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
 
 	dentry = lookup_one_len(name, parent->dentry, namelen);
 	error = PTR_ERR(dentry);
@@ -438,99 +419,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 	if (dentry->d_inode)
 		goto out_dput;
 
-	if (!IS_POSIXACL(parent->dentry->d_inode))
-		mode &= ~current_umask();
-
 	error = mnt_want_write(parent->mnt);
 	if (error)
 		goto out_dput;
 
-	error = btrfs_may_create(parent->dentry->d_inode, dentry);
+	error = btrfs_may_create(dir, dentry);
 	if (error)
 		goto out_drop_write;
 
-	/*
-	 * Actually perform the low-level subvolume creation after all
-	 * this VFS fuzz.
-	 *
-	 * Eventually we want to pass in an inode under which we create this
-	 * subvolume, but for now all are under the filesystem root.
-	 *
-	 * Also we should pass on the mode eventually to allow creating new
-	 * subvolume with specific mode bits.
-	 */
+	down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+
+	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
+		goto out_up_read;
+
 	if (snap_src) {
-		struct dentry *dir = dentry->d_parent;
-		struct dentry *test = dir->d_parent;
-		struct btrfs_path *path = btrfs_alloc_path();
-		int ret;
-		u64 test_oid;
-		u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
-
-		test_oid = snap_src->root_key.objectid;
-
-		ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
-					  path, parent_oid, test_oid);
-		if (ret == 0)
-			goto create;
-		btrfs_release_path(snap_src->fs_info->tree_root, path);
-
-		/* we need to make sure we aren't creating a directory loop
-		 * by taking a snapshot of something that has our current
-		 * subvol in its directory tree.  So, this loops through
-		 * the dentries and checks the forward refs for each subvolume
-		 * to see if is references the subvolume where we are
-		 * placing this new snapshot.
-		 */
-		while (1) {
-			if (!test ||
-			    dir == snap_src->fs_info->sb->s_root ||
-			    test == snap_src->fs_info->sb->s_root ||
-			    test->d_inode->i_sb != snap_src->fs_info->sb) {
-				break;
-			}
-			if (S_ISLNK(test->d_inode->i_mode)) {
-				printk(KERN_INFO "Btrfs symlink in snapshot "
-				       "path, failed\n");
-				error = -EMLINK;
-				btrfs_free_path(path);
-				goto out_drop_write;
-			}
-			test_oid =
-				BTRFS_I(test->d_inode)->root->root_key.objectid;
-			ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
-				  path, test_oid, parent_oid);
-			if (ret == 0) {
-				printk(KERN_INFO "Btrfs snapshot creation "
-				       "failed, looping\n");
-				error = -EMLINK;
-				btrfs_free_path(path);
-				goto out_drop_write;
-			}
-			btrfs_release_path(snap_src->fs_info->tree_root, path);
-			test = test->d_parent;
-		}
-create:
-		btrfs_free_path(path);
-		error = create_snapshot(snap_src, dentry, name, namelen);
+		error = create_snapshot(snap_src, dentry,
+					name, namelen);
 	} else {
-		error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
-				      dentry, name, namelen);
+		error = create_subvol(BTRFS_I(dir)->root, dentry,
+				      name, namelen);
 	}
-	if (error)
-		goto out_drop_write;
-
-	fsnotify_mkdir(parent->dentry->d_inode, dentry);
+	if (!error)
+		fsnotify_mkdir(dir, dentry);
+out_up_read:
+	up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
 out_drop_write:
 	mnt_drop_write(parent->mnt);
 out_dput:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&parent->dentry->d_inode->i_mutex);
+	mutex_unlock(&dir->i_mutex);
 	return error;
 }
 
-
 static int btrfs_defrag_file(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
@@ -596,9 +517,8 @@ again:
 		clear_page_dirty_for_io(page);
 
 		btrfs_set_extent_delalloc(inode, page_start, page_end);
-
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		set_page_dirty(page);
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
@@ -609,7 +529,8 @@ out_unlock:
 	return 0;
 }
 
-static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
+static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+					void __user *arg)
 {
 	u64 new_size;
 	u64 old_size;
@@ -718,10 +639,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 {
 	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 	struct btrfs_ioctl_vol_args *vol_args;
-	struct btrfs_dir_item *di;
-	struct btrfs_path *path;
 	struct file *src_file;
-	u64 root_dirid;
 	int namelen;
 	int ret = 0;
 
@@ -739,32 +657,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 		goto out;
 	}
 
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
-	di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
-			    path, root_dirid,
-			    vol_args->name, namelen, 0);
-	btrfs_free_path(path);
-
-	if (di && !IS_ERR(di)) {
-		ret = -EEXIST;
-		goto out;
-	}
-
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
-		goto out;
-	}
-
 	if (subvol) {
-		ret = btrfs_mksubvol(&file->f_path, vol_args->name,
-				     file->f_path.dentry->d_inode->i_mode,
-				     namelen, NULL);
+		ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+				     NULL);
 	} else {
 		struct inode *src_inode;
 		src_file = fget(vol_args->fd);
@@ -781,17 +676,156 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 			fput(src_file);
 			goto out;
 		}
-		ret = btrfs_mksubvol(&file->f_path, vol_args->name,
-			     file->f_path.dentry->d_inode->i_mode,
-			     namelen, BTRFS_I(src_inode)->root);
+		ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+				     BTRFS_I(src_inode)->root);
 		fput(src_file);
 	}
-
 out:
 	kfree(vol_args);
 	return ret;
 }
 
+/*
+ * helper to check if the subvolume references other subvolumes
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = root->root_key.objectid;
+	key.type = BTRFS_ROOT_REF_KEY;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
+				&key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret == 0);
+
+	ret = 0;
+	if (path->slots[0] > 0) {
+		path->slots[0]--;
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (key.objectid == root->root_key.objectid &&
+		    key.type == BTRFS_ROOT_REF_KEY)
+			ret = -ENOTEMPTY;
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_snap_destroy(struct file *file,
+					     void __user *arg)
+{
+	struct dentry *parent = fdentry(file);
+	struct dentry *dentry;
+	struct inode *dir = parent->d_inode;
+	struct inode *inode;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_root *dest = NULL;
+	struct btrfs_ioctl_vol_args *vol_args;
+	struct btrfs_trans_handle *trans;
+	int namelen;
+	int ret;
+	int err = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
+
+	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+	namelen = strlen(vol_args->name);
+	if (strchr(vol_args->name, '/') ||
+	    strncmp(vol_args->name, "..", namelen) == 0) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = mnt_want_write(file->f_path.mnt);
+	if (err)
+		goto out;
+
+	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(vol_args->name, parent, namelen);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out_unlock_dir;
+	}
+
+	if (!dentry->d_inode) {
+		err = -ENOENT;
+		goto out_dput;
+	}
+
+	inode = dentry->d_inode;
+	if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+		err = -EINVAL;
+		goto out_dput;
+	}
+
+	dest = BTRFS_I(inode)->root;
+
+	mutex_lock(&inode->i_mutex);
+	err = d_invalidate(dentry);
+	if (err)
+		goto out_unlock;
+
+	down_write(&root->fs_info->subvol_sem);
+
+	err = may_destroy_subvol(dest);
+	if (err)
+		goto out_up_write;
+
+	trans = btrfs_start_transaction(root, 1);
+	ret = btrfs_unlink_subvol(trans, root, dir,
+				dest->root_key.objectid,
+				dentry->d_name.name,
+				dentry->d_name.len);
+	BUG_ON(ret);
+
+	btrfs_record_root_in_trans(trans, dest);
+
+	memset(&dest->root_item.drop_progress, 0,
+		sizeof(dest->root_item.drop_progress));
+	dest->root_item.drop_level = 0;
+	btrfs_set_root_refs(&dest->root_item, 0);
+
+	ret = btrfs_insert_orphan_item(trans,
+				root->fs_info->tree_root,
+				dest->root_key.objectid);
+	BUG_ON(ret);
+
+	ret = btrfs_commit_transaction(trans, root);
+	BUG_ON(ret);
+	inode->i_flags |= S_DEAD;
+out_up_write:
+	up_write(&root->fs_info->subvol_sem);
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	if (!err) {
+		btrfs_invalidate_inodes(dest);
+		d_delete(dentry);
+	}
+out_dput:
+	dput(dentry);
+out_unlock_dir:
+	mutex_unlock(&dir->i_mutex);
+	mnt_drop_write(file->f_path.mnt);
+out:
+	kfree(vol_args);
+	return err;
+}
+
 static int btrfs_ioctl_defrag(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
@@ -865,8 +899,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
 	return ret;
 }
 
-static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
-		u64 off, u64 olen, u64 destoff)
+static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+				       u64 off, u64 olen, u64 destoff)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -976,7 +1010,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
 	/* punch hole in destination first */
 	btrfs_drop_extents(trans, root, inode, off, off + len,
-			   off + len, 0, &hint_byte);
+			   off + len, 0, &hint_byte, 1);
 
 	/* clone data */
 	key.objectid = src->i_ino;
@@ -1071,8 +1105,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					datao += off - key.offset;
 					datal -= off - key.offset;
 				}
-				if (key.offset + datao + datal + key.offset >
-				    off + len)
+				if (key.offset + datao + datal > off + len)
 					datal = off + len - key.offset - datao;
 				/* disko == 0 means it's a hole */
 				if (!disko)
@@ -1258,6 +1291,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_snap_create(file, argp, 0);
 	case BTRFS_IOC_SUBVOL_CREATE:
 		return btrfs_ioctl_snap_create(file, argp, 1);
+	case BTRFS_IOC_SNAP_DESTROY:
+		return btrfs_ioctl_snap_destroy(file, argp);
 	case BTRFS_IOC_DEFRAG:
 		return btrfs_ioctl_defrag(file);
 	case BTRFS_IOC_RESIZE:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index b320b103fa1..bc49914475e 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args {
 
 #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
 				   struct btrfs_ioctl_vol_args)
-
+#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
+				struct btrfs_ioctl_vol_args)
 #endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7b2f401e604..b5d6d24726b 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  *
  * len is the length of the extent
  *
- * This also sets the EXTENT_ORDERED bit on the range in the inode.
- *
  * The tree is given a single reference on the ordered extent that was
  * inserted.
  */
@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->start = start;
 	entry->len = len;
 	entry->disk_len = disk_len;
+	entry->bytes_left = len;
 	entry->inode = inode;
 	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
 		set_bit(type, &entry->flags);
@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 			   &entry->rb_node);
 	BUG_ON(node);
 
-	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
-			   entry_end(entry) - 1, GFP_NOFS);
-
 	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 	list_add_tail(&entry->root_extent_list,
 		      &BTRFS_I(inode)->root->fs_info->ordered_extents);
@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	int ret;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
 	mutex_lock(&tree->mutex);
-	clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
-			     GFP_NOFS);
 	node = tree_search(tree, file_offset);
 	if (!node) {
 		ret = 1;
@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 		goto out;
 	}
 
-	ret = test_range_bit(io_tree, entry->file_offset,
-			     entry->file_offset + entry->len - 1,
-			     EXTENT_ORDERED, 0);
-	if (ret == 0)
+	if (io_size > entry->bytes_left) {
+		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
+		       (unsigned long long)entry->bytes_left,
+		       (unsigned long long)io_size);
+	}
+	entry->bytes_left -= io_size;
+	if (entry->bytes_left == 0)
 		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+	else
+		ret = 1;
 out:
 	mutex_unlock(&tree->mutex);
 	return ret == 0;
@@ -476,6 +474,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 	u64 orig_end;
 	u64 wait_end;
 	struct btrfs_ordered_extent *ordered;
+	int found;
 
 	if (start + len < start) {
 		orig_end = INT_LIMIT(loff_t);
@@ -502,6 +501,7 @@ again:
 					   orig_end >> PAGE_CACHE_SHIFT);
 
 	end = orig_end;
+	found = 0;
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, end);
 		if (!ordered)
@@ -514,6 +514,7 @@ again:
 			btrfs_put_ordered_extent(ordered);
 			break;
 		}
+		found++;
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		end = ordered->file_offset;
 		btrfs_put_ordered_extent(ordered);
@@ -521,8 +522,8 @@ again:
 			break;
 		end--;
 	}
-	if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
-			   EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
+	if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
+			   EXTENT_DELALLOC, 0, NULL)) {
 		schedule_timeout(1);
 		goto again;
 	}
@@ -613,7 +614,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 */
 	if (test_range_bit(io_tree, disk_i_size,
 			   ordered->file_offset + ordered->len - 1,
-			   EXTENT_DELALLOC, 0)) {
+			   EXTENT_DELALLOC, 0, NULL)) {
 		goto out;
 	}
 	/*
@@ -664,7 +665,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 */
 	if (i_size_test > entry_end(ordered) &&
 	    !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
-			   EXTENT_DELALLOC, 0)) {
+			   EXTENT_DELALLOC, 0, NULL)) {
 		new_i_size = min_t(u64, i_size_test, i_size_read(inode));
 	}
 	BTRFS_I(inode)->disk_i_size = new_i_size;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 3d31c8827b0..993a7ea45c7 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
 	/* extent length on disk */
 	u64 disk_len;
 
+	/* number of bytes that still need writing */
+	u64 bytes_left;
+
 	/* flags (described above) */
 	unsigned long flags;
 
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 3c0d52af4f8..79cba5fbc28 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -65,3 +65,23 @@ out:
 	btrfs_free_path(path);
 	return ret;
 }
+
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
+	key.offset = offset;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+	btrfs_free_path(path);
+	return ret;
+}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c04f7f21260..361ad323faa 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -121,6 +121,15 @@ struct inodevec {
 	int nr;
 };
 
+#define MAX_EXTENTS 128
+
+struct file_extent_cluster {
+	u64 start;
+	u64 end;
+	u64 boundary[MAX_EXTENTS];
+	unsigned int nr;
+};
+
 struct reloc_control {
 	/* block group to relocate */
 	struct btrfs_block_group_cache *block_group;
@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
 				struct reloc_control *rc)
 {
 	if (test_range_bit(&rc->processed_blocks, bytenr,
-			   bytenr + blocksize - 1, EXTENT_DIRTY, 1))
+			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
 		return 1;
 	return 0;
 }
@@ -2529,56 +2538,94 @@ out:
 }
 
 static noinline_for_stack
-int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
+int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
+			 u64 block_start)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	int ret = 0;
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em)
+		return -ENOMEM;
+
+	em->start = start;
+	em->len = end + 1 - start;
+	em->block_len = em->len;
+	em->block_start = block_start;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+	while (1) {
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		if (ret != -EEXIST) {
+			free_extent_map(em);
+			break;
+		}
+		btrfs_drop_extent_cache(inode, start, end, 0);
+	}
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+	return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+					struct file_extent_cluster *cluster)
 {
 	u64 page_start;
 	u64 page_end;
-	unsigned long i;
-	unsigned long first_index;
+	u64 offset = BTRFS_I(inode)->index_cnt;
+	unsigned long index;
 	unsigned long last_index;
-	unsigned int total_read = 0;
-	unsigned int total_dirty = 0;
+	unsigned int dirty_page = 0;
 	struct page *page;
 	struct file_ra_state *ra;
-	struct btrfs_ordered_extent *ordered;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int nr = 0;
 	int ret = 0;
 
+	if (!cluster->nr)
+		return 0;
+
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
 	if (!ra)
 		return -ENOMEM;
 
+	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+
 	mutex_lock(&inode->i_mutex);
-	first_index = start >> PAGE_CACHE_SHIFT;
-	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
 
-	/* make sure the dirty trick played by the caller work */
-	while (1) {
-		ret = invalidate_inode_pages2_range(inode->i_mapping,
-						    first_index, last_index);
-		if (ret != -EBUSY)
-			break;
-		schedule_timeout(HZ/10);
-	}
+	i_size_write(inode, cluster->end + 1 - offset);
+	ret = setup_extent_mapping(inode, cluster->start - offset,
+				   cluster->end - offset, cluster->start);
 	if (ret)
 		goto out_unlock;
 
 	file_ra_state_init(ra, inode->i_mapping);
 
-	for (i = first_index ; i <= last_index; i++) {
-		if (total_read % ra->ra_pages == 0) {
-			btrfs_force_ra(inode->i_mapping, ra, NULL, i,
-				min(last_index, ra->ra_pages + i - 1));
-		}
-		total_read++;
-again:
-		if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
-			BUG_ON(1);
-		page = grab_cache_page(inode->i_mapping, i);
+	WARN_ON(cluster->start != cluster->boundary[0]);
+	while (index <= last_index) {
+		page = find_lock_page(inode->i_mapping, index);
 		if (!page) {
-			ret = -ENOMEM;
-			goto out_unlock;
+			page_cache_sync_readahead(inode->i_mapping,
+						  ra, NULL, index,
+						  last_index + 1 - index);
+			page = grab_cache_page(inode->i_mapping, index);
+			if (!page) {
+				ret = -ENOMEM;
+				goto out_unlock;
+			}
+		}
+
+		if (PageReadahead(page)) {
+			page_cache_async_readahead(inode->i_mapping,
+						   ra, NULL, page, index,
+						   last_index + 1 - index);
 		}
+
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
 			lock_page(page);
@@ -2589,75 +2636,79 @@ again:
 				goto out_unlock;
 			}
 		}
-		wait_on_page_writeback(page);
 
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
-		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-		ordered = btrfs_lookup_ordered_extent(inode, page_start);
-		if (ordered) {
-			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-			unlock_page(page);
-			page_cache_release(page);
-			btrfs_start_ordered_extent(inode, ordered, 1);
-			btrfs_put_ordered_extent(ordered);
-			goto again;
-		}
+
+		lock_extent(&BTRFS_I(inode)->io_tree,
+			    page_start, page_end, GFP_NOFS);
+
 		set_page_extent_mapped(page);
 
-		if (i == first_index)
-			set_extent_bits(io_tree, page_start, page_end,
+		if (nr < cluster->nr &&
+		    page_start + offset == cluster->boundary[nr]) {
+			set_extent_bits(&BTRFS_I(inode)->io_tree,
+					page_start, page_end,
 					EXTENT_BOUNDARY, GFP_NOFS);
+			nr++;
+		}
 		btrfs_set_extent_delalloc(inode, page_start, page_end);
 
 		set_page_dirty(page);
-		total_dirty++;
+		dirty_page++;
 
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent(&BTRFS_I(inode)->io_tree,
+			      page_start, page_end, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
+
+		index++;
+		if (nr < cluster->nr &&
+		    page_end + 1 + offset == cluster->boundary[nr]) {
+			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+							   dirty_page);
+			dirty_page = 0;
+		}
+	}
+	if (dirty_page) {
+		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+						   dirty_page);
 	}
+	WARN_ON(nr != cluster->nr);
 out_unlock:
 	mutex_unlock(&inode->i_mutex);
 	kfree(ra);
-	balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
 	return ret;
 }
 
 static noinline_for_stack
-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
+int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
+			 struct file_extent_cluster *cluster)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct extent_map *em;
-	u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
-	u64 end = start + extent_key->offset - 1;
-
-	em = alloc_extent_map(GFP_NOFS);
-	em->start = start;
-	em->len = extent_key->offset;
-	em->block_len = extent_key->offset;
-	em->block_start = extent_key->objectid;
-	em->bdev = root->fs_info->fs_devices->latest_bdev;
-	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+	int ret;
 
-	/* setup extent map to cheat btrfs_readpage */
-	lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
-	while (1) {
-		int ret;
-		spin_lock(&em_tree->lock);
-		ret = add_extent_mapping(em_tree, em);
-		spin_unlock(&em_tree->lock);
-		if (ret != -EEXIST) {
-			free_extent_map(em);
-			break;
-		}
-		btrfs_drop_extent_cache(inode, start, end, 0);
+	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
+		ret = relocate_file_extent_cluster(inode, cluster);
+		if (ret)
+			return ret;
+		cluster->nr = 0;
 	}
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
 
-	return relocate_inode_pages(inode, start, extent_key->offset);
+	if (!cluster->nr)
+		cluster->start = extent_key->objectid;
+	else
+		BUG_ON(cluster->nr >= MAX_EXTENTS);
+	cluster->end = extent_key->objectid + extent_key->offset - 1;
+	cluster->boundary[cluster->nr] = extent_key->objectid;
+	cluster->nr++;
+
+	if (cluster->nr >= MAX_EXTENTS) {
+		ret = relocate_file_extent_cluster(inode, cluster);
+		if (ret)
+			return ret;
+		cluster->nr = 0;
+	}
+	return 0;
 }
 
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags)
 	return 0;
 }
 
+
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 {
 	struct rb_root blocks = RB_ROOT;
 	struct btrfs_key key;
+	struct file_extent_cluster *cluster;
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_path *path;
 	struct btrfs_extent_item *ei;
@@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	int ret;
 	int err = 0;
 
+	cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
+	if (!cluster)
+		return -ENOMEM;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
+	rc->extents_found = 0;
+	rc->extents_skipped = 0;
+
 	rc->search_start = rc->block_group->key.objectid;
 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
 			  GFP_NOFS);
@@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 		}
 
 		nr = trans->blocks_used;
-		btrfs_end_transaction_throttle(trans, rc->extent_root);
+		btrfs_end_transaction(trans, rc->extent_root);
 		trans = NULL;
 		btrfs_btree_balance_dirty(rc->extent_root, nr);
 
 		if (rc->stage == MOVE_DATA_EXTENTS &&
 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
 			rc->found_file_extent = 1;
-			ret = relocate_data_extent(rc->data_inode, &key);
+			ret = relocate_data_extent(rc->data_inode,
+						   &key, cluster);
 			if (ret < 0) {
 				err = ret;
 				break;
@@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 		btrfs_btree_balance_dirty(rc->extent_root, nr);
 	}
 
+	if (!err) {
+		ret = relocate_file_extent_cluster(rc->data_inode, cluster);
+		if (ret < 0)
+			err = ret;
+	}
+
+	kfree(cluster);
+
 	rc->create_reloc_root = 0;
 	smp_mb();
 
@@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 }
 
 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
-				 u64 objectid, u64 size)
+				 struct btrfs_root *root, u64 objectid)
 {
 	struct btrfs_path *path;
 	struct btrfs_inode_item *item;
@@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
 	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
 	btrfs_set_inode_generation(leaf, item, 1);
-	btrfs_set_inode_size(leaf, item, size);
+	btrfs_set_inode_size(leaf, item, 0);
 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
 	btrfs_mark_buffer_dirty(leaf);
@@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	if (err)
 		goto out;
 
-	err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
-	BUG_ON(err);
-
-	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-				       group->key.offset, 0, group->key.offset,
-				       0, 0, 0);
+	err = __insert_orphan_inode(trans, root, objectid);
 	BUG_ON(err);
 
 	key.objectid = objectid;
@@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	btrfs_wait_ordered_extents(fs_info->tree_root, 0);
 
 	while (1) {
-		mutex_lock(&fs_info->cleaner_mutex);
-		btrfs_clean_old_snapshots(fs_info->tree_root);
-		mutex_unlock(&fs_info->cleaner_mutex);
-
 		rc->extents_found = 0;
 		rc->extents_skipped = 0;
 
+		mutex_lock(&fs_info->cleaner_mutex);
+
+		btrfs_clean_old_snapshots(fs_info->tree_root);
 		ret = relocate_block_group(rc);
+
+		mutex_unlock(&fs_info->cleaner_mutex);
 		if (ret < 0) {
 			err = ret;
 			break;
@@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 		}
 	}
 
-	filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
-				 rc->block_group->key.objectid,
-				 rc->block_group->key.objectid +
-				 rc->block_group->key.offset - 1);
+	filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
+				     rc->block_group->key.objectid,
+				     rc->block_group->key.objectid +
+				     rc->block_group->key.offset - 1);
 
 	WARN_ON(rc->block_group->pinned > 0);
 	WARN_ON(rc->block_group->reserved > 0);
@@ -3530,6 +3594,26 @@ out:
 	return err;
 }
 
+static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
+
+	memset(&root->root_item.drop_progress, 0,
+		sizeof(root->root_item.drop_progress));
+	root->root_item.drop_level = 0;
+	btrfs_set_root_refs(&root->root_item, 0);
+	ret = btrfs_update_root(trans, root->fs_info->tree_root,
+				&root->root_key, &root->root_item);
+	BUG_ON(ret);
+
+	ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
+	BUG_ON(ret);
+	return 0;
+}
+
 /*
  * recover relocation interrupted by system crash.
  *
@@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 			fs_root = read_fs_root(root->fs_info,
 					       reloc_root->root_key.offset);
 			if (IS_ERR(fs_root)) {
-				err = PTR_ERR(fs_root);
-				goto out;
+				ret = PTR_ERR(fs_root);
+				if (ret != -ENOENT) {
+					err = ret;
+					goto out;
+				}
+				mark_garbage_root(reloc_root);
 			}
 		}
 
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0ddc6d61c55..9351428f30e 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 		goto out;
 
 	BUG_ON(ret == 0);
+	if (path->slots[0] == 0) {
+		ret = 1;
+		goto out;
+	}
 	l = path->nodes[0];
-	BUG_ON(path->slots[0] == 0);
 	slot = path->slots[0] - 1;
 	btrfs_item_key_to_cpu(l, &found_key, slot);
-	if (found_key.objectid != objectid) {
+	if (found_key.objectid != objectid ||
+	    found_key.type != BTRFS_ROOT_ITEM_KEY) {
 		ret = 1;
 		goto out;
 	}
-	read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
-			   sizeof(*item));
-	memcpy(key, &found_key, sizeof(found_key));
+	if (item)
+		read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
+				   sizeof(*item));
+	if (key)
+		memcpy(key, &found_key, sizeof(found_key));
 	ret = 0;
 out:
 	btrfs_free_path(path);
@@ -249,6 +255,59 @@ err:
 	return ret;
 }
 
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int err = 0;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
+	key.offset = 0;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+		if (ret < 0) {
+			err = ret;
+			break;
+		}
+
+		leaf = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(tree_root, path);
+			if (ret < 0)
+				err = ret;
+			if (ret != 0)
+				break;
+			leaf = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		btrfs_release_path(tree_root, path);
+
+		if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
+		    key.type != BTRFS_ORPHAN_ITEM_KEY)
+			break;
+
+		ret = btrfs_find_dead_roots(tree_root, key.offset);
+		if (ret) {
+			err = ret;
+			break;
+		}
+
+		key.offset++;
+	}
+
+	btrfs_free_path(path);
+	return err;
+}
+
 /* drop the root item for 'key' from 'root' */
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_key *key)
@@ -278,31 +337,57 @@ out:
 	return ret;
 }
 
-#if 0 /* this will get used when snapshot deletion is implemented */
 int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *tree_root,
-		       u64 root_id, u8 type, u64 ref_id)
+		       u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
+		       const char *name, int name_len)
+
 {
+	struct btrfs_path *path;
+	struct btrfs_root_ref *ref;
+	struct extent_buffer *leaf;
 	struct btrfs_key key;
+	unsigned long ptr;
+	int err = 0;
 	int ret;
-	struct btrfs_path *path;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
 	key.objectid = root_id;
-	key.type = type;
+	key.type = BTRFS_ROOT_BACKREF_KEY;
 	key.offset = ref_id;
-
+again:
 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
-	BUG_ON(ret);
-
-	ret = btrfs_del_item(trans, tree_root, path);
-	BUG_ON(ret);
+	BUG_ON(ret < 0);
+	if (ret == 0) {
+		leaf = path->nodes[0];
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_root_ref);
+
+		WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
+		WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
+		ptr = (unsigned long)(ref + 1);
+		WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
+		*sequence = btrfs_root_ref_sequence(leaf, ref);
+
+		ret = btrfs_del_item(trans, tree_root, path);
+		BUG_ON(ret);
+	} else
+		err = -ENOENT;
+
+	if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+		btrfs_release_path(tree_root, path);
+		key.objectid = ref_id;
+		key.type = BTRFS_ROOT_REF_KEY;
+		key.offset = root_id;
+		goto again;
+	}
 
 	btrfs_free_path(path);
-	return ret;
+	return err;
 }
-#endif
 
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
 		   struct btrfs_path *path,
@@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
 	return ret;
 }
 
-
 /*
  * add a btrfs_root_ref item.  type is either BTRFS_ROOT_REF_KEY
  * or BTRFS_ROOT_BACKREF_KEY.
@@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
  */
 int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *tree_root,
-		       u64 root_id, u8 type, u64 ref_id,
-		       u64 dirid, u64 sequence,
+		       u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
 		       const char *name, int name_len)
 {
 	struct btrfs_key key;
@@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	unsigned long ptr;
 
-
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
 	key.objectid = root_id;
-	key.type = type;
+	key.type = BTRFS_ROOT_BACKREF_KEY;
 	key.offset = ref_id;
-
+again:
 	ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
 				      sizeof(*ref) + name_len);
 	BUG_ON(ret);
@@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
 	write_extent_buffer(leaf, name, ptr, name_len);
 	btrfs_mark_buffer_dirty(leaf);
 
+	if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+		btrfs_release_path(tree_root, path);
+		key.objectid = ref_id;
+		key.type = BTRFS_ROOT_REF_KEY;
+		key.offset = root_id;
+		goto again;
+	}
+
 	btrfs_free_path(path);
-	return ret;
+	return 0;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2db17cd66fc..67035385444 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -676,6 +676,7 @@ static int btrfs_unfreeze(struct super_block *sb)
 }
 
 static const struct super_operations btrfs_super_ops = {
+	.drop_inode	= btrfs_drop_inode,
 	.delete_inode	= btrfs_delete_inode,
 	.put_super	= btrfs_put_super,
 	.sync_fs	= btrfs_sync_fs,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cdbb5022da5..88f866f85e7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
 {
 	if (root->ref_cows && root->last_trans < trans->transid) {
 		WARN_ON(root == root->fs_info->extent_root);
-		WARN_ON(root->root_item.refs == 0);
 		WARN_ON(root->commit_root != root->node);
 
 		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
@@ -720,7 +719,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 
 	key.objectid = objectid;
-	key.offset = 0;
+	/* record when the snapshot was created in key.offset */
+	key.offset = trans->transid;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
 	old = btrfs_lock_root_node(root);
@@ -778,24 +778,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
 	ret = btrfs_update_inode(trans, parent_root, parent_inode);
 	BUG_ON(ret);
 
-	/* add the backref first */
 	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
 				 pending->root_key.objectid,
-				 BTRFS_ROOT_BACKREF_KEY,
 				 parent_root->root_key.objectid,
 				 parent_inode->i_ino, index, pending->name,
 				 namelen);
 
 	BUG_ON(ret);
 
-	/* now add the forward ref */
-	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
-				 parent_root->root_key.objectid,
-				 BTRFS_ROOT_REF_KEY,
-				 pending->root_key.objectid,
-				 parent_inode->i_ino, index, pending->name,
-				 namelen);
-
 	inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
 	d_instantiate(pending->dentry, inode);
 fail:
@@ -874,7 +864,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	unsigned long timeout = 1;
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
-	struct extent_io_tree *pinned_copy;
 	DEFINE_WAIT(wait);
 	int ret;
 	int should_grow = 0;
@@ -915,13 +904,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
-	pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
-	if (!pinned_copy)
-		return -ENOMEM;
-
-	extent_io_tree_init(pinned_copy,
-			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);
-
 	trans->transaction->in_commit = 1;
 	trans->transaction->blocked = 1;
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
@@ -1019,6 +1001,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = commit_cowonly_roots(trans, root);
 	BUG_ON(ret);
 
+	btrfs_prepare_extent_commit(trans, root);
+
 	cur_trans = root->fs_info->running_transaction;
 	spin_lock(&root->fs_info->new_trans_lock);
 	root->fs_info->running_transaction = NULL;
@@ -1042,8 +1026,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
 	       sizeof(root->fs_info->super_copy));
 
-	btrfs_copy_pinned(root, pinned_copy);
-
 	trans->transaction->blocked = 0;
 
 	wake_up(&root->fs_info->transaction_wait);
@@ -1059,8 +1041,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 */
 	mutex_unlock(&root->fs_info->tree_log_mutex);
 
-	btrfs_finish_extent_commit(trans, root, pinned_copy);
-	kfree(pinned_copy);
+	btrfs_finish_extent_commit(trans, root);
 
 	/* do the directory inserts of any pending snapshot creations */
 	finish_pending_snapshots(trans, root->fs_info);
@@ -1096,8 +1077,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
 
 	while (!list_empty(&list)) {
 		root = list_entry(list.next, struct btrfs_root, root_list);
-		list_del_init(&root->root_list);
-		btrfs_drop_snapshot(root, 0);
+		list_del(&root->root_list);
+
+		if (btrfs_header_backref_rev(root->node) <
+		    BTRFS_MIXED_BACKREF_REV)
+			btrfs_drop_snapshot(root, 0);
+		else
+			btrfs_drop_snapshot(root, 1);
 	}
 	return 0;
 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 30c0d45c1b5..7827841b55c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -263,8 +263,8 @@ static int process_one_buffer(struct btrfs_root *log,
 			      struct walk_control *wc, u64 gen)
 {
 	if (wc->pin)
-		btrfs_update_pinned_extents(log->fs_info->extent_root,
-					    eb->start, eb->len, 1);
+		btrfs_pin_extent(log->fs_info->extent_root,
+				 eb->start, eb->len, 0);
 
 	if (btrfs_buffer_uptodate(eb, gen)) {
 		if (wc->write)
@@ -534,7 +534,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
 	ret = btrfs_drop_extents(trans, root, inode,
-			 start, extent_end, extent_end, start, &alloc_hint);
+			 start, extent_end, extent_end, start, &alloc_hint, 1);
 	BUG_ON(ret);
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -2841,7 +2841,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
 			break;
 
-		if (parent == sb->s_root)
+		if (IS_ROOT(parent))
 			break;
 
 		parent = parent->d_parent;
@@ -2880,6 +2880,12 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		goto end_no_trans;
 	}
 
+	if (root != BTRFS_I(inode)->root ||
+	    btrfs_root_refs(&root->root_item) == 0) {
+		ret = 1;
+		goto end_no_trans;
+	}
+
 	ret = check_parent_dirs_for_sync(trans, inode, parent,
 					 sb, last_committed);
 	if (ret)
@@ -2907,12 +2913,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 			break;
 
 		inode = parent->d_inode;
+		if (root != BTRFS_I(inode)->root)
+			break;
+
 		if (BTRFS_I(inode)->generation >
 		    root->fs_info->last_trans_committed) {
 			ret = btrfs_log_inode(trans, root, inode, inode_only);
 			BUG_ON(ret);
 		}
-		if (parent == sb->s_root)
+		if (IS_ROOT(parent))
 			break;
 
 		parent = parent->d_parent;
@@ -2951,7 +2960,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 	struct btrfs_key tmp_key;
 	struct btrfs_root *log;
 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
-	u64 highest_inode;
 	struct walk_control wc = {
 		.process_func = process_one_buffer,
 		.stage = 0,
@@ -3010,11 +3018,6 @@ again:
 						      path);
 			BUG_ON(ret);
 		}
-		ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
-		if (ret == 0) {
-			wc.replay_dest->highest_inode = highest_inode;
-			wc.replay_dest->last_inode_alloc = highest_inode;
-		}
 
 		key.offset = found_key.offset - 1;
 		wc.replay_dest->log_root = NULL;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cf405b0828..23e7d36ff32 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -276,7 +276,7 @@ loop_lock:
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
+		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
 		    fs_info->fs_devices->open_devices > 1) {
 			struct io_context *ioc;
 
@@ -719,10 +719,9 @@ error:
  * called very infrequently and that a given device has a small number
  * of extents
  */
-static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
-					 struct btrfs_device *device,
-					 u64 num_bytes, u64 *start,
-					 u64 *max_avail)
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+			 struct btrfs_device *device, u64 num_bytes,
+			 u64 *start, u64 *max_avail)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = device->dev_root;
@@ -1736,6 +1735,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 	extent_root = root->fs_info->extent_root;
 	em_tree = &root->fs_info->mapping_tree.map_tree;
 
+	ret = btrfs_can_relocate(extent_root, chunk_offset);
+	if (ret)
+		return -ENOSPC;
+
 	/* step one, relocate all the extents inside this chunk */
 	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
 	BUG_ON(ret);
@@ -1749,9 +1752,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 	 * step two, delete the device extents and the
 	 * chunk tree entries
 	 */
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	BUG_ON(em->start > chunk_offset ||
 	       em->start + em->len < chunk_offset);
@@ -1780,9 +1783,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
 	BUG_ON(ret);
 
-	spin_lock(&em_tree->lock);
+	write_lock(&em_tree->lock);
 	remove_extent_mapping(em_tree, em);
-	spin_unlock(&em_tree->lock);
+	write_unlock(&em_tree->lock);
 
 	kfree(map);
 	em->bdev = NULL;
@@ -1807,12 +1810,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
 	struct btrfs_key found_key;
 	u64 chunk_tree = chunk_root->root_key.objectid;
 	u64 chunk_type;
+	bool retried = false;
+	int failed = 0;
 	int ret;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
+again:
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -1842,7 +1848,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
 			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
 						   found_key.objectid,
 						   found_key.offset);
-			BUG_ON(ret);
+			if (ret == -ENOSPC)
+				failed++;
+			else if (ret)
+				BUG();
 		}
 
 		if (found_key.offset == 0)
@@ -1850,6 +1859,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
 		key.offset = found_key.offset - 1;
 	}
 	ret = 0;
+	if (failed && !retried) {
+		failed = 0;
+		retried = true;
+		goto again;
+	} else if (failed && retried) {
+		WARN_ON(1);
+		ret = -ENOSPC;
+	}
 error:
 	btrfs_free_path(path);
 	return ret;
@@ -1894,6 +1911,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
 			continue;
 
 		ret = btrfs_shrink_device(device, old_size - size_to_free);
+		if (ret == -ENOSPC)
+			break;
 		BUG_ON(ret);
 
 		trans = btrfs_start_transaction(dev_root, 1);
@@ -1938,9 +1957,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		chunk = btrfs_item_ptr(path->nodes[0],
 				       path->slots[0],
 				       struct btrfs_chunk);
-		key.offset = found_key.offset;
 		/* chunk zero is special */
-		if (key.offset == 0)
+		if (found_key.offset == 0)
 			break;
 
 		btrfs_release_path(chunk_root, path);
@@ -1948,7 +1966,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
 					   chunk_root->root_key.objectid,
 					   found_key.objectid,
 					   found_key.offset);
-		BUG_ON(ret);
+		BUG_ON(ret && ret != -ENOSPC);
+		key.offset = found_key.offset - 1;
 	}
 	ret = 0;
 error:
@@ -1974,10 +1993,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	u64 chunk_offset;
 	int ret;
 	int slot;
+	int failed = 0;
+	bool retried = false;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 	u64 old_total = btrfs_super_total_bytes(super_copy);
+	u64 old_size = device->total_bytes;
 	u64 diff = device->total_bytes - new_size;
 
 	if (new_size >= device->total_bytes)
@@ -1987,12 +2009,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	if (!path)
 		return -ENOMEM;
 
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		ret = -ENOMEM;
-		goto done;
-	}
-
 	path->reada = 2;
 
 	lock_chunks(root);
@@ -2001,8 +2017,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	if (device->writeable)
 		device->fs_devices->total_rw_bytes -= diff;
 	unlock_chunks(root);
-	btrfs_end_transaction(trans, root);
 
+again:
 	key.objectid = device->devid;
 	key.offset = (u64)-1;
 	key.type = BTRFS_DEV_EXTENT_KEY;
@@ -2017,6 +2033,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 			goto done;
 		if (ret) {
 			ret = 0;
+			btrfs_release_path(root, path);
 			break;
 		}
 
@@ -2024,14 +2041,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 
-		if (key.objectid != device->devid)
+		if (key.objectid != device->devid) {
+			btrfs_release_path(root, path);
 			break;
+		}
 
 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
 		length = btrfs_dev_extent_length(l, dev_extent);
 
-		if (key.offset + length <= new_size)
+		if (key.offset + length <= new_size) {
+			btrfs_release_path(root, path);
 			break;
+		}
 
 		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2040,8 +2061,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 
 		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
 					   chunk_offset);
-		if (ret)
+		if (ret && ret != -ENOSPC)
 			goto done;
+		if (ret == -ENOSPC)
+			failed++;
+		key.offset -= 1;
+	}
+
+	if (failed && !retried) {
+		failed = 0;
+		retried = true;
+		goto again;
+	} else if (failed && retried) {
+		ret = -ENOSPC;
+		lock_chunks(root);
+
+		device->total_bytes = old_size;
+		if (device->writeable)
+			device->fs_devices->total_rw_bytes += diff;
+		unlock_chunks(root);
+		goto done;
 	}
 
 	/* Shrinking succeeded, else we would be at "done". */
@@ -2294,9 +2333,9 @@ again:
 	em->block_len = em->len;
 
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
-	spin_lock(&em_tree->lock);
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
-	spin_unlock(&em_tree->lock);
+	write_unlock(&em_tree->lock);
 	BUG_ON(ret);
 	free_extent_map(em);
 
@@ -2491,9 +2530,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
 	int readonly = 0;
 	int i;
 
-	spin_lock(&map_tree->map_tree.lock);
+	read_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
-	spin_unlock(&map_tree->map_tree.lock);
+	read_unlock(&map_tree->map_tree.lock);
 	if (!em)
 		return 1;
 
@@ -2518,11 +2557,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 	struct extent_map *em;
 
 	while (1) {
-		spin_lock(&tree->map_tree.lock);
+		write_lock(&tree->map_tree.lock);
 		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
 		if (em)
 			remove_extent_mapping(&tree->map_tree, em);
-		spin_unlock(&tree->map_tree.lock);
+		write_unlock(&tree->map_tree.lock);
 		if (!em)
 			break;
 		kfree(em->bdev);
@@ -2540,9 +2579,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	struct extent_map_tree *em_tree = &map_tree->map_tree;
 	int ret;
 
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, len);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -2604,9 +2643,9 @@ again:
 		atomic_set(&multi->error, 0);
 	}
 
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	if (!em && unplug_page)
 		return 0;
@@ -2763,9 +2802,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	u64 stripe_nr;
 	int i, j, nr = 0;
 
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, chunk_start, 1);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	BUG_ON(!em || em->start != chunk_start);
 	map = (struct map_lookup *)em->bdev;
@@ -3053,9 +3092,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	logical = key->offset;
 	length = btrfs_chunk_length(leaf, chunk);
 
-	spin_lock(&map_tree->map_tree.lock);
+	read_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
-	spin_unlock(&map_tree->map_tree.lock);
+	read_unlock(&map_tree->map_tree.lock);
 
 	/* already mapped? */
 	if (em && em->start <= logical && em->start + em->len > logical) {
@@ -3114,9 +3153,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		map->stripes[i].dev->in_fs_metadata = 1;
 	}
 
-	spin_lock(&map_tree->map_tree.lock);
+	write_lock(&map_tree->map_tree.lock);
 	ret = add_extent_mapping(&map_tree->map_tree, em);
-	spin_unlock(&map_tree->map_tree.lock);
+	write_unlock(&map_tree->map_tree.lock);
 	BUG_ON(ret);
 	free_extent_map(em);
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5139a833f72..31b0fabdd2e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root);
 void btrfs_unlock_volumes(void);
 void btrfs_lock_volumes(void);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+			 struct btrfs_device *device, u64 num_bytes,
+			 u64 *start, u64 *max_avail);
 #endif
diff --git a/fs/buffer.c b/fs/buffer.c
index 209f7f15f5f..24afd7422ae 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2239,16 +2239,10 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	void *fsdata;
-	unsigned long limit;
 	int err;
 
-	err = -EFBIG;
-        limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
-		send_sig(SIGXFSZ, current, 0);
-		goto out;
-	}
-	if (size > inode->i_sb->s_maxbytes)
+	err = inode_newsize_ok(inode, size);
+	if (err)
 		goto out;
 
 	err = pagecache_write_begin(NULL, mapping, size, 0,
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3cbc57f932d..d6db933df2b 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -264,7 +264,6 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
 {
 	struct char_device_struct *cd;
 	struct cdev *cdev;
-	char *s;
 	int err = -ENOMEM;
 
 	cd = __register_chrdev_region(major, baseminor, count, name);
@@ -278,8 +277,6 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
 	cdev->owner = fops->owner;
 	cdev->ops = fops;
 	kobject_set_name(&cdev->kobj, "%s", name);
-	for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
-		*s = '!';
 		
 	err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
 	if (err)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d79ce2e95c2..90c5b39f031 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -185,8 +185,7 @@ out_mount_failed:
 			cifs_sb->mountdata = NULL;
 		}
 #endif
-		if (cifs_sb->local_nls)
-			unload_nls(cifs_sb->local_nls);
+		unload_nls(cifs_sb->local_nls);
 		kfree(cifs_sb);
 	}
 	return rc;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 1f09c761931..5e2492535da 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1557,57 +1557,24 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
 
 static int cifs_vmtruncate(struct inode *inode, loff_t offset)
 {
-	struct address_space *mapping = inode->i_mapping;
-	unsigned long limit;
+	loff_t oldsize;
+	int err;
 
 	spin_lock(&inode->i_lock);
-	if (inode->i_size < offset)
-		goto do_expand;
-	/*
-	 * truncation of in-use swapfiles is disallowed - it would cause
-	 * subsequent swapout to scribble on the now-freed blocks.
-	 */
-	if (IS_SWAPFILE(inode)) {
-		spin_unlock(&inode->i_lock);
-		goto out_busy;
-	}
-	i_size_write(inode, offset);
-	spin_unlock(&inode->i_lock);
-	/*
-	 * unmap_mapping_range is called twice, first simply for efficiency
-	 * so that truncate_inode_pages does fewer single-page unmaps. However
-	 * after this first call, and before truncate_inode_pages finishes,
-	 * it is possible for private pages to be COWed, which remain after
-	 * truncate_inode_pages finishes, hence the second unmap_mapping_range
-	 * call must be made for correctness.
-	 */
-	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-	truncate_inode_pages(mapping, offset);
-	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-	goto out_truncate;
-
-do_expand:
-	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-	if (limit != RLIM_INFINITY && offset > limit) {
+	err = inode_newsize_ok(inode, offset);
+	if (err) {
 		spin_unlock(&inode->i_lock);
-		goto out_sig;
-	}
-	if (offset > inode->i_sb->s_maxbytes) {
-		spin_unlock(&inode->i_lock);
-		goto out_big;
+		goto out;
 	}
+
+	oldsize = inode->i_size;
 	i_size_write(inode, offset);
 	spin_unlock(&inode->i_lock);
-out_truncate:
+	truncate_pagecache(inode, oldsize, offset);
 	if (inode->i_op->truncate)
 		inode->i_op->truncate(inode);
-	return 0;
-out_sig:
-	send_sig(SIGXFSZ, current, 0);
-out_big:
-	return -EFBIG;
-out_busy:
-	return -ETXTBSY;
+out:
+	return err;
 }
 
 static int
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index 8ccd5ed81d9..d99860a3389 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -2,6 +2,7 @@
 #define _CODA_INT_
 
 struct dentry;
+struct file;
 
 extern struct file_system_type coda_fs_type;
 extern unsigned long coda_timeout;
diff --git a/fs/compat.c b/fs/compat.c
index 3aa48834a22..d576b552e8e 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -768,13 +768,13 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
 				 char __user * type, unsigned long flags,
 				 void __user * data)
 {
-	unsigned long type_page;
+	char *kernel_type;
 	unsigned long data_page;
-	unsigned long dev_page;
+	char *kernel_dev;
 	char *dir_page;
 	int retval;
 
-	retval = copy_mount_options (type, &type_page);
+	retval = copy_mount_string(type, &kernel_type);
 	if (retval < 0)
 		goto out;
 
@@ -783,38 +783,38 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
 	if (IS_ERR(dir_page))
 		goto out1;
 
-	retval = copy_mount_options (dev_name, &dev_page);
+	retval = copy_mount_string(dev_name, &kernel_dev);
 	if (retval < 0)
 		goto out2;
 
-	retval = copy_mount_options (data, &data_page);
+	retval = copy_mount_options(data, &data_page);
 	if (retval < 0)
 		goto out3;
 
 	retval = -EINVAL;
 
-	if (type_page && data_page) {
-		if (!strcmp((char *)type_page, SMBFS_NAME)) {
+	if (kernel_type && data_page) {
+		if (!strcmp(kernel_type, SMBFS_NAME)) {
 			do_smb_super_data_conv((void *)data_page);
-		} else if (!strcmp((char *)type_page, NCPFS_NAME)) {
+		} else if (!strcmp(kernel_type, NCPFS_NAME)) {
 			do_ncp_super_data_conv((void *)data_page);
-		} else if (!strcmp((char *)type_page, NFS4_NAME)) {
+		} else if (!strcmp(kernel_type, NFS4_NAME)) {
 			if (do_nfs4_super_data_conv((void *) data_page))
 				goto out4;
 		}
 	}
 
-	retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
+	retval = do_mount(kernel_dev, dir_page, kernel_type,
 			flags, (void*)data_page);
 
  out4:
 	free_page(data_page);
  out3:
-	free_page(dev_page);
+	kfree(kernel_dev);
  out2:
 	putname(dir_page);
  out1:
-	free_page(type_page);
+	kfree(kernel_type);
  out:
 	return retval;
 }
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index a2edb791344..31f4b0e6d72 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -63,9 +63,9 @@ static void drop_slab(void)
 }
 
 int drop_caches_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (write) {
 		if (sysctl_drop_caches & 1)
 			drop_pagecache();
diff --git a/fs/exec.c b/fs/exec.c
index 5c833c18d0d..d49be6bc179 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -55,6 +55,7 @@
 #include <linux/kmod.h>
 #include <linux/fsnotify.h>
 #include <linux/fs_struct.h>
+#include <linux/pipe_fs_i.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -63,6 +64,7 @@
 
 int core_uses_pid;
 char core_pattern[CORENAME_MAX_SIZE] = "core";
+unsigned int core_pipe_limit;
 int suid_dumpable = 0;
 
 /* The maximal length of core_pattern is also specified in sysctl.c */
@@ -1393,18 +1395,16 @@ out_ret:
 	return retval;
 }
 
-int set_binfmt(struct linux_binfmt *new)
+void set_binfmt(struct linux_binfmt *new)
 {
-	struct linux_binfmt *old = current->binfmt;
+	struct mm_struct *mm = current->mm;
 
-	if (new) {
-		if (!try_module_get(new->module))
-			return -1;
-	}
-	current->binfmt = new;
-	if (old)
-		module_put(old->module);
-	return 0;
+	if (mm->binfmt)
+		module_put(mm->binfmt->module);
+
+	mm->binfmt = new;
+	if (new)
+		__module_get(new->module);
 }
 
 EXPORT_SYMBOL(set_binfmt);
@@ -1728,6 +1728,29 @@ int get_dumpable(struct mm_struct *mm)
 	return (ret >= 2) ? 2 : ret;
 }
 
+static void wait_for_dump_helpers(struct file *file)
+{
+	struct pipe_inode_info *pipe;
+
+	pipe = file->f_path.dentry->d_inode->i_pipe;
+
+	pipe_lock(pipe);
+	pipe->readers++;
+	pipe->writers--;
+
+	while ((pipe->readers > 1) && (!signal_pending(current))) {
+		wake_up_interruptible_sync(&pipe->wait);
+		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+		pipe_wait(pipe);
+	}
+
+	pipe->readers--;
+	pipe->writers++;
+	pipe_unlock(pipe);
+
+}
+
+
 void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 {
 	struct core_state core_state;
@@ -1744,11 +1767,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 	unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
 	char **helper_argv = NULL;
 	int helper_argc = 0;
-	char *delimit;
+	int dump_count = 0;
+	static atomic_t core_dump_count = ATOMIC_INIT(0);
 
 	audit_core_dumps(signr);
 
-	binfmt = current->binfmt;
+	binfmt = mm->binfmt;
 	if (!binfmt || !binfmt->core_dump)
 		goto fail;
 
@@ -1799,54 +1823,63 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
  	lock_kernel();
 	ispipe = format_corename(corename, signr);
 	unlock_kernel();
-	/*
-	 * Don't bother to check the RLIMIT_CORE value if core_pattern points
-	 * to a pipe.  Since we're not writing directly to the filesystem
-	 * RLIMIT_CORE doesn't really apply, as no actual core file will be
-	 * created unless the pipe reader choses to write out the core file
-	 * at which point file size limits and permissions will be imposed
-	 * as it does with any other process
-	 */
+
 	if ((!ispipe) && (core_limit < binfmt->min_coredump))
 		goto fail_unlock;
 
  	if (ispipe) {
+		if (core_limit == 0) {
+			/*
+			 * Normally core limits are irrelevant to pipes, since
+			 * we're not writing to the file system, but we use
+			 * core_limit of 0 here as a speacial value. Any
+			 * non-zero limit gets set to RLIM_INFINITY below, but
+			 * a limit of 0 skips the dump.  This is a consistent
+			 * way to catch recursive crashes.  We can still crash
+			 * if the core_pattern binary sets RLIM_CORE =  !0
+			 * but it runs as root, and can do lots of stupid things
+			 * Note that we use task_tgid_vnr here to grab the pid
+			 * of the process group leader.  That way we get the
+			 * right pid if a thread in a multi-threaded
+			 * core_pattern process dies.
+			 */
+			printk(KERN_WARNING
+				"Process %d(%s) has RLIMIT_CORE set to 0\n",
+				task_tgid_vnr(current), current->comm);
+			printk(KERN_WARNING "Aborting core\n");
+			goto fail_unlock;
+		}
+
+		dump_count = atomic_inc_return(&core_dump_count);
+		if (core_pipe_limit && (core_pipe_limit < dump_count)) {
+			printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
+			       task_tgid_vnr(current), current->comm);
+			printk(KERN_WARNING "Skipping core dump\n");
+			goto fail_dropcount;
+		}
+
 		helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
 		if (!helper_argv) {
 			printk(KERN_WARNING "%s failed to allocate memory\n",
 			       __func__);
-			goto fail_unlock;
-		}
-		/* Terminate the string before the first option */
-		delimit = strchr(corename, ' ');
-		if (delimit)
-			*delimit = '\0';
-		delimit = strrchr(helper_argv[0], '/');
-		if (delimit)
-			delimit++;
-		else
-			delimit = helper_argv[0];
-		if (!strcmp(delimit, current->comm)) {
-			printk(KERN_NOTICE "Recursive core dump detected, "
-					"aborting\n");
-			goto fail_unlock;
+			goto fail_dropcount;
 		}
 
 		core_limit = RLIM_INFINITY;
 
 		/* SIGPIPE can happen, but it's just never processed */
- 		if (call_usermodehelper_pipe(corename+1, helper_argv, NULL,
+		if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
 				&file)) {
  			printk(KERN_INFO "Core dump to %s pipe failed\n",
 			       corename);
- 			goto fail_unlock;
+			goto fail_dropcount;
  		}
  	} else
  		file = filp_open(corename,
 				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
 				 0600);
 	if (IS_ERR(file))
-		goto fail_unlock;
+		goto fail_dropcount;
 	inode = file->f_path.dentry->d_inode;
 	if (inode->i_nlink > 1)
 		goto close_fail;	/* multiple links - don't dump */
@@ -1875,7 +1908,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 	if (retval)
 		current->signal->group_exit_code |= 0x80;
 close_fail:
+	if (ispipe && core_pipe_limit)
+		wait_for_dump_helpers(file);
 	filp_close(file, NULL);
+fail_dropcount:
+	if (dump_count)
+		atomic_dec(&core_dump_count);
 fail_unlock:
 	if (helper_argv)
 		argv_free(helper_argv);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 5ab10c3bbeb..9f500dec3b5 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -214,7 +214,6 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 	}
 
 	lock_super(sb);
-	lock_kernel();
 	sbi = sb->s_fs_info;
 	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
 	fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
@@ -245,7 +244,6 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 out:
 	if (or)
 		osd_end_request(or);
-	unlock_kernel();
 	unlock_super(sb);
 	kfree(fscb);
 	return ret;
@@ -268,8 +266,6 @@ static void exofs_put_super(struct super_block *sb)
 	int num_pend;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
 
-	lock_kernel();
-
 	if (sb->s_dirt)
 		exofs_write_super(sb);
 
@@ -286,8 +282,6 @@ static void exofs_put_super(struct super_block *sb)
 	osduld_put_device(sbi->s_dev);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 /*
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 1c1638f873a..ade634076d0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -819,6 +819,7 @@ const struct address_space_operations ext2_aops = {
 	.writepages		= ext2_writepages,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate	= block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 const struct address_space_operations ext2_aops_xip = {
@@ -837,6 +838,7 @@ const struct address_space_operations ext2_nobh_aops = {
 	.direct_IO		= ext2_direct_IO,
 	.writepages		= ext2_writepages,
 	.migratepage		= buffer_migrate_page,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 /*
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index cd098a7b77f..acf1b142332 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1830,6 +1830,7 @@ static const struct address_space_operations ext3_ordered_aops = {
 	.direct_IO		= ext3_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 static const struct address_space_operations ext3_writeback_aops = {
@@ -1845,6 +1846,7 @@ static const struct address_space_operations ext3_writeback_aops = {
 	.direct_IO		= ext3_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 static const struct address_space_operations ext3_journalled_aops = {
@@ -1859,6 +1861,7 @@ static const struct address_space_operations ext3_journalled_aops = {
 	.invalidatepage		= ext3_invalidatepage,
 	.releasepage		= ext3_releasepage,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 void ext3_set_aops(struct inode *inode)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3a798737e30..064746fad58 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3386,6 +3386,7 @@ static const struct address_space_operations ext4_ordered_aops = {
 	.direct_IO		= ext4_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 static const struct address_space_operations ext4_writeback_aops = {
@@ -3401,6 +3402,7 @@ static const struct address_space_operations ext4_writeback_aops = {
 	.direct_IO		= ext4_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 static const struct address_space_operations ext4_journalled_aops = {
@@ -3415,6 +3417,7 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 static const struct address_space_operations ext4_da_aops = {
@@ -3431,6 +3434,7 @@ static const struct address_space_operations ext4_da_aops = {
 	.direct_IO		= ext4_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
 
 void ext4_set_aops(struct inode *inode)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8970d8c49bb..04629d1302f 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -470,19 +470,11 @@ static void fat_put_super(struct super_block *sb)
 
 	iput(sbi->fat_inode);
 
-	if (sbi->nls_disk) {
-		unload_nls(sbi->nls_disk);
-		sbi->nls_disk = NULL;
-		sbi->options.codepage = fat_default_codepage;
-	}
-	if (sbi->nls_io) {
-		unload_nls(sbi->nls_io);
-		sbi->nls_io = NULL;
-	}
-	if (sbi->options.iocharset != fat_default_iocharset) {
+	unload_nls(sbi->nls_disk);
+	unload_nls(sbi->nls_io);
+
+	if (sbi->options.iocharset != fat_default_iocharset)
 		kfree(sbi->options.iocharset);
-		sbi->options.iocharset = fat_default_iocharset;
-	}
 
 	sb->s_fs_info = NULL;
 	kfree(sbi);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ae413086db9..fc089f2f7f5 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -263,6 +263,79 @@ pid_t f_getown(struct file *filp)
 	return pid;
 }
 
+static int f_setown_ex(struct file *filp, unsigned long arg)
+{
+	struct f_owner_ex * __user owner_p = (void * __user)arg;
+	struct f_owner_ex owner;
+	struct pid *pid;
+	int type;
+	int ret;
+
+	ret = copy_from_user(&owner, owner_p, sizeof(owner));
+	if (ret)
+		return ret;
+
+	switch (owner.type) {
+	case F_OWNER_TID:
+		type = PIDTYPE_MAX;
+		break;
+
+	case F_OWNER_PID:
+		type = PIDTYPE_PID;
+		break;
+
+	case F_OWNER_GID:
+		type = PIDTYPE_PGID;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	rcu_read_lock();
+	pid = find_vpid(owner.pid);
+	if (owner.pid && !pid)
+		ret = -ESRCH;
+	else
+		ret = __f_setown(filp, pid, type, 1);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int f_getown_ex(struct file *filp, unsigned long arg)
+{
+	struct f_owner_ex * __user owner_p = (void * __user)arg;
+	struct f_owner_ex owner;
+	int ret = 0;
+
+	read_lock(&filp->f_owner.lock);
+	owner.pid = pid_vnr(filp->f_owner.pid);
+	switch (filp->f_owner.pid_type) {
+	case PIDTYPE_MAX:
+		owner.type = F_OWNER_TID;
+		break;
+
+	case PIDTYPE_PID:
+		owner.type = F_OWNER_PID;
+		break;
+
+	case PIDTYPE_PGID:
+		owner.type = F_OWNER_GID;
+		break;
+
+	default:
+		WARN_ON(1);
+		ret = -EINVAL;
+		break;
+	}
+	read_unlock(&filp->f_owner.lock);
+
+	if (!ret)
+		ret = copy_to_user(owner_p, &owner, sizeof(owner));
+	return ret;
+}
+
 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 		struct file *filp)
 {
@@ -313,6 +386,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	case F_SETOWN:
 		err = f_setown(filp, arg, 1);
 		break;
+	case F_GETOWN_EX:
+		err = f_getown_ex(filp, arg);
+		break;
+	case F_SETOWN_EX:
+		err = f_setown_ex(filp, arg);
+		break;
 	case F_GETSIG:
 		err = filp->f_owner.signum;
 		break;
@@ -428,8 +507,7 @@ static inline int sigio_perm(struct task_struct *p,
 
 static void send_sigio_to_task(struct task_struct *p,
 			       struct fown_struct *fown,
-			       int fd,
-			       int reason)
+			       int fd, int reason, int group)
 {
 	/*
 	 * F_SETSIG can change ->signum lockless in parallel, make
@@ -461,11 +539,11 @@ static void send_sigio_to_task(struct task_struct *p,
 			else
 				si.si_band = band_table[reason - POLL_IN];
 			si.si_fd    = fd;
-			if (!group_send_sig_info(signum, &si, p))
+			if (!do_send_sig_info(signum, &si, p, group))
 				break;
 		/* fall-through: fall back on the old plain SIGIO signal */
 		case 0:
-			group_send_sig_info(SIGIO, SEND_SIG_PRIV, p);
+			do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group);
 	}
 }
 
@@ -474,16 +552,23 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
 	struct task_struct *p;
 	enum pid_type type;
 	struct pid *pid;
+	int group = 1;
 	
 	read_lock(&fown->lock);
+
 	type = fown->pid_type;
+	if (type == PIDTYPE_MAX) {
+		group = 0;
+		type = PIDTYPE_PID;
+	}
+
 	pid = fown->pid;
 	if (!pid)
 		goto out_unlock_fown;
 	
 	read_lock(&tasklist_lock);
 	do_each_pid_task(pid, type, p) {
-		send_sigio_to_task(p, fown, fd, band);
+		send_sigio_to_task(p, fown, fd, band, group);
 	} while_each_pid_task(pid, type, p);
 	read_unlock(&tasklist_lock);
  out_unlock_fown:
@@ -491,10 +576,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
 }
 
 static void send_sigurg_to_task(struct task_struct *p,
-                                struct fown_struct *fown)
+				struct fown_struct *fown, int group)
 {
 	if (sigio_perm(p, fown, SIGURG))
-		group_send_sig_info(SIGURG, SEND_SIG_PRIV, p);
+		do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group);
 }
 
 int send_sigurg(struct fown_struct *fown)
@@ -502,10 +587,17 @@ int send_sigurg(struct fown_struct *fown)
 	struct task_struct *p;
 	enum pid_type type;
 	struct pid *pid;
+	int group = 1;
 	int ret = 0;
 	
 	read_lock(&fown->lock);
+
 	type = fown->pid_type;
+	if (type == PIDTYPE_MAX) {
+		group = 0;
+		type = PIDTYPE_PID;
+	}
+
 	pid = fown->pid;
 	if (!pid)
 		goto out_unlock_fown;
@@ -514,7 +606,7 @@ int send_sigurg(struct fown_struct *fown)
 	
 	read_lock(&tasklist_lock);
 	do_each_pid_task(pid, type, p) {
-		send_sigurg_to_task(p, fown);
+		send_sigurg_to_task(p, fown, group);
 	} while_each_pid_task(pid, type, p);
 	read_unlock(&tasklist_lock);
  out_unlock_fown:
diff --git a/fs/file_table.c b/fs/file_table.c
index 334ce39881f..8eb44042e00 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -74,14 +74,14 @@ EXPORT_SYMBOL_GPL(get_max_files);
  * Handle nr_files sysctl
  */
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-int proc_nr_files(ctl_table *table, int write, struct file *filp,
+int proc_nr_files(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	files_stat.nr_files = get_nr_files();
-	return proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #else
-int proc_nr_files(ctl_table *table, int write, struct file *filp,
+int proc_nr_files(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index e703654e7f4..992f6c9410b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1276,14 +1276,9 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
 		return 0;
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		unsigned long limit;
-		if (IS_SWAPFILE(inode))
-			return -ETXTBSY;
-		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-		if (limit != RLIM_INFINITY && attr->ia_size > (loff_t) limit) {
-			send_sig(SIGXFSZ, current, 0);
-			return -EFBIG;
-		}
+		err = inode_newsize_ok(inode, attr->ia_size);
+		if (err)
+			return err;
 		is_truncate = true;
 	}
 
@@ -1350,8 +1345,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
 	 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
 	 */
 	if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
-		if (outarg.attr.size < oldsize)
-			fuse_truncate(inode->i_mapping, outarg.attr.size);
+		truncate_pagecache(inode, oldsize, outarg.attr.size);
 		invalidate_inode_pages2(inode->i_mapping);
 	}
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fc9c79feb5f..01cc462ff45 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -606,8 +606,6 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 				   u64 attr_valid);
 
-void fuse_truncate(struct address_space *mapping, loff_t offset);
-
 /**
  * Initialize the client device
  */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6da947daabd..1a822ce2b24 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -140,14 +140,6 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-void fuse_truncate(struct address_space *mapping, loff_t offset)
-{
-	/* See vmtruncate() */
-	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-	truncate_inode_pages(mapping, offset);
-	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-}
-
 void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 				   u64 attr_valid)
 {
@@ -205,8 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 	spin_unlock(&fc->lock);
 
 	if (S_ISREG(inode->i_mode) && oldsize != attr->size) {
-		if (attr->size < oldsize)
-			fuse_truncate(inode->i_mapping, attr->size);
+		truncate_pagecache(inode, oldsize, attr->size);
 		invalidate_inode_pages2(inode->i_mapping);
 	}
 }
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7ebae9a4ecc..694b5d48f03 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1135,6 +1135,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
 	.direct_IO = gfs2_direct_IO,
 	.migratepage = buffer_migrate_page,
 	.is_partially_uptodate = block_is_partially_uptodate,
+	.error_remove_page = generic_error_remove_page,
 };
 
 static const struct address_space_operations gfs2_ordered_aops = {
@@ -1151,6 +1152,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
 	.direct_IO = gfs2_direct_IO,
 	.migratepage = buffer_migrate_page,
 	.is_partially_uptodate = block_is_partially_uptodate,
+	.error_remove_page = generic_error_remove_page,
 };
 
 static const struct address_space_operations gfs2_jdata_aops = {
@@ -1166,6 +1168,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
 	.invalidatepage = gfs2_invalidatepage,
 	.releasepage = gfs2_releasepage,
 	.is_partially_uptodate = block_is_partially_uptodate,
+	.error_remove_page = generic_error_remove_page,
 };
 
 void gfs2_set_aops(struct inode *inode)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index c3ac1805405..247436c10de 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -12,7 +12,6 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/namei.h>
-#include <linux/utsname.h>
 #include <linux/mm.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 7b6165f25fb..8bbe03c3f6d 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -344,10 +344,8 @@ void hfs_mdb_put(struct super_block *sb)
 	brelse(HFS_SB(sb)->mdb_bh);
 	brelse(HFS_SB(sb)->alt_mdb_bh);
 
-	if (HFS_SB(sb)->nls_io)
-		unload_nls(HFS_SB(sb)->nls_io);
-	if (HFS_SB(sb)->nls_disk)
-		unload_nls(HFS_SB(sb)->nls_disk);
+	unload_nls(HFS_SB(sb)->nls_io);
+	unload_nls(HFS_SB(sb)->nls_disk);
 
 	free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
 	kfree(HFS_SB(sb));
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index c0759fe0855..43022f3d514 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -229,8 +229,7 @@ static void hfsplus_put_super(struct super_block *sb)
 	iput(HFSPLUS_SB(sb).alloc_file);
 	iput(HFSPLUS_SB(sb).hidden_dir);
 	brelse(HFSPLUS_SB(sb).s_vhbh);
-	if (HFSPLUS_SB(sb).nls)
-		unload_nls(HFSPLUS_SB(sb).nls);
+	unload_nls(HFSPLUS_SB(sb).nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 
@@ -464,8 +463,7 @@ out:
 
 cleanup:
 	hfsplus_put_super(sb);
-	if (nls)
-		unload_nls(nls);
+	unload_nls(nls);
 	return err;
 }
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index eba6d552d9c..87a1258953b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -380,36 +380,11 @@ static void hugetlbfs_delete_inode(struct inode *inode)
 
 static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
 {
-	struct super_block *sb = inode->i_sb;
-
-	if (!hlist_unhashed(&inode->i_hash)) {
-		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-			list_move(&inode->i_list, &inode_unused);
-		inodes_stat.nr_unused++;
-		if (!sb || (sb->s_flags & MS_ACTIVE)) {
-			spin_unlock(&inode_lock);
-			return;
-		}
-		inode->i_state |= I_WILL_FREE;
-		spin_unlock(&inode_lock);
-		/*
-		 * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
-		 * in our backing_dev_info.
-		 */
-		write_inode_now(inode, 1);
-		spin_lock(&inode_lock);
-		inode->i_state &= ~I_WILL_FREE;
-		inodes_stat.nr_unused--;
-		hlist_del_init(&inode->i_hash);
+	if (generic_detach_inode(inode)) {
+		truncate_hugepages(inode, 0);
+		clear_inode(inode);
+		destroy_inode(inode);
 	}
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
-	inode->i_state |= I_FREEING;
-	inodes_stat.nr_inodes--;
-	spin_unlock(&inode_lock);
-	truncate_hugepages(inode, 0);
-	clear_inode(inode);
-	destroy_inode(inode);
 }
 
 static void hugetlbfs_drop_inode(struct inode *inode)
@@ -936,15 +911,9 @@ static struct file_system_type hugetlbfs_fs_type = {
 
 static struct vfsmount *hugetlbfs_vfsmount;
 
-static int can_do_hugetlb_shm(int creat_flags)
+static int can_do_hugetlb_shm(void)
 {
-	if (creat_flags != HUGETLB_SHMFS_INODE)
-		return 0;
-	if (capable(CAP_IPC_LOCK))
-		return 1;
-	if (in_group_p(sysctl_hugetlb_shm_group))
-		return 1;
-	return 0;
+	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
 }
 
 struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
@@ -960,7 +929,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
 	if (!hugetlbfs_vfsmount)
 		return ERR_PTR(-ENOENT);
 
-	if (!can_do_hugetlb_shm(creat_flags)) {
+	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
 		*user = current_user();
 		if (user_shm_lock(size, *user)) {
 			WARN_ONCE(1,
diff --git a/fs/inode.c b/fs/inode.c
index 76582b06ab9..4d8e3be5597 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1241,7 +1241,16 @@ void generic_delete_inode(struct inode *inode)
 }
 EXPORT_SYMBOL(generic_delete_inode);
 
-static void generic_forget_inode(struct inode *inode)
+/**
+ *	generic_detach_inode - remove inode from inode lists
+ *	@inode: inode to remove
+ *
+ *	Remove inode from inode lists, write it if it's dirty. This is just an
+ *	internal VFS helper exported for hugetlbfs. Do not use!
+ *
+ *	Returns 1 if inode should be completely destroyed.
+ */
+int generic_detach_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 
@@ -1251,7 +1260,7 @@ static void generic_forget_inode(struct inode *inode)
 		inodes_stat.nr_unused++;
 		if (sb->s_flags & MS_ACTIVE) {
 			spin_unlock(&inode_lock);
-			return;
+			return 0;
 		}
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_WILL_FREE;
@@ -1269,6 +1278,14 @@ static void generic_forget_inode(struct inode *inode)
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(generic_detach_inode);
+
+static void generic_forget_inode(struct inode *inode)
+{
+	if (!generic_detach_inode(inode))
+		return;
 	if (inode->i_data.nrpages)
 		truncate_inode_pages(&inode->i_data, 0);
 	clear_inode(inode);
@@ -1399,31 +1416,31 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	struct timespec now;
 
-	if (mnt_want_write(mnt))
-		return;
 	if (inode->i_flags & S_NOATIME)
-		goto out;
+		return;
 	if (IS_NOATIME(inode))
-		goto out;
+		return;
 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
-		goto out;
+		return;
 
 	if (mnt->mnt_flags & MNT_NOATIME)
-		goto out;
+		return;
 	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
-		goto out;
+		return;
 
 	now = current_fs_time(inode->i_sb);
 
 	if (!relatime_need_update(mnt, inode, now))
-		goto out;
+		return;
 
 	if (timespec_equal(&inode->i_atime, &now))
-		goto out;
+		return;
+
+	if (mnt_want_write(mnt))
+		return;
 
 	inode->i_atime = now;
 	mark_inode_dirty_sync(inode);
-out:
 	mnt_drop_write(mnt);
 }
 EXPORT_SYMBOL(touch_atime);
@@ -1444,34 +1461,37 @@ void file_update_time(struct file *file)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct timespec now;
-	int sync_it = 0;
-	int err;
+	enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
 
+	/* First try to exhaust all avenues to not sync */
 	if (IS_NOCMTIME(inode))
 		return;
 
-	err = mnt_want_write_file(file);
-	if (err)
-		return;
-
 	now = current_fs_time(inode->i_sb);
-	if (!timespec_equal(&inode->i_mtime, &now)) {
-		inode->i_mtime = now;
-		sync_it = 1;
-	}
+	if (!timespec_equal(&inode->i_mtime, &now))
+		sync_it = S_MTIME;
 
-	if (!timespec_equal(&inode->i_ctime, &now)) {
-		inode->i_ctime = now;
-		sync_it = 1;
-	}
+	if (!timespec_equal(&inode->i_ctime, &now))
+		sync_it |= S_CTIME;
 
-	if (IS_I_VERSION(inode)) {
-		inode_inc_iversion(inode);
-		sync_it = 1;
-	}
+	if (IS_I_VERSION(inode))
+		sync_it |= S_VERSION;
+
+	if (!sync_it)
+		return;
 
-	if (sync_it)
-		mark_inode_dirty_sync(inode);
+	/* Finally allowed to write? Takes lock. */
+	if (mnt_want_write_file(file))
+		return;
+
+	/* Only change inode inside the lock region */
+	if (sync_it & S_VERSION)
+		inode_inc_iversion(inode);
+	if (sync_it & S_CTIME)
+		inode->i_ctime = now;
+	if (sync_it & S_MTIME)
+		inode->i_mtime = now;
+	mark_inode_dirty_sync(inode);
 	mnt_drop_write(file->f_path.mnt);
 }
 EXPORT_SYMBOL(file_update_time);
@@ -1599,7 +1619,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
 	else if (S_ISSOCK(mode))
 		inode->i_fop = &bad_sock_fops;
 	else
-		printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
-		       mode);
+		printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
+				  " inode %s:%lu\n", mode, inode->i_sb->s_id,
+				  inode->i_ino);
 }
 EXPORT_SYMBOL(init_special_inode);
diff --git a/fs/internal.h b/fs/internal.h
index d55ef562f0b..515175b8b72 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -57,6 +57,7 @@ extern int check_unsafe_exec(struct linux_binprm *);
  * namespace.c
  */
 extern int copy_mount_options(const void __user *, unsigned long *);
+extern int copy_mount_string(const void __user *, char **);
 
 extern void free_vfsmnt(struct vfsmount *);
 extern struct vfsmount *alloc_vfsmnt(const char *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5612880fcbe..7b17a14396f 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -162,20 +162,21 @@ EXPORT_SYMBOL(fiemap_check_flags);
 static int fiemap_check_ranges(struct super_block *sb,
 			       u64 start, u64 len, u64 *new_len)
 {
+	u64 maxbytes = (u64) sb->s_maxbytes;
+
 	*new_len = len;
 
 	if (len == 0)
 		return -EINVAL;
 
-	if (start > sb->s_maxbytes)
+	if (start > maxbytes)
 		return -EFBIG;
 
 	/*
 	 * Shrink request scope to what the fs can actually handle.
 	 */
-	if ((len > sb->s_maxbytes) ||
-	    (sb->s_maxbytes - len) < start)
-		*new_len = sb->s_maxbytes - start;
+	if (len > maxbytes || (maxbytes - len) < start)
+		*new_len = maxbytes - start;
 
 	return 0;
 }
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 85f96bc651c..6b4dcd4f294 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -46,10 +46,7 @@ static void isofs_put_super(struct super_block *sb)
 #ifdef CONFIG_JOLIET
 	lock_kernel();
 
-	if (sbi->s_nls_iocharset) {
-		unload_nls(sbi->s_nls_iocharset);
-		sbi->s_nls_iocharset = NULL;
-	}
+	unload_nls(sbi->s_nls_iocharset);
 
 	unlock_kernel();
 #endif
@@ -912,8 +909,7 @@ out_no_root:
 		printk(KERN_WARNING "%s: get root inode failed\n", __func__);
 out_no_inode:
 #ifdef CONFIG_JOLIET
-	if (sbi->s_nls_iocharset)
-		unload_nls(sbi->s_nls_iocharset);
+	unload_nls(sbi->s_nls_iocharset);
 #endif
 	goto out_freesbi;
 out_no_read:
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 37e6dcda8fc..2234c73fc57 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -178,13 +178,11 @@ static void jfs_put_super(struct super_block *sb)
 	rc = jfs_umount(sb);
 	if (rc)
 		jfs_err("jfs_umount failed with return code %d", rc);
-	if (sbi->nls_tab)
-		unload_nls(sbi->nls_tab);
-	sbi->nls_tab = NULL;
+
+	unload_nls(sbi->nls_tab);
 
 	truncate_inode_pages(sbi->direct_inode->i_mapping, 0);
 	iput(sbi->direct_inode);
-	sbi->direct_inode = NULL;
 
 	kfree(sbi);
 
@@ -347,8 +345,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
 
 	if (nls_map != (void *) -1) {
 		/* Discard old (if remount) */
-		if (sbi->nls_tab)
-			unload_nls(sbi->nls_tab);
+		unload_nls(sbi->nls_tab);
 		sbi->nls_tab = nls_map;
 	}
 	return 1;
diff --git a/fs/libfs.c b/fs/libfs.c
index dcec3d3ea64..219576c52d8 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -527,14 +527,18 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
 				const void *from, size_t available)
 {
 	loff_t pos = *ppos;
+	size_t ret;
+
 	if (pos < 0)
 		return -EINVAL;
-	if (pos >= available)
+	if (pos >= available || !count)
 		return 0;
 	if (count > available - pos)
 		count = available - pos;
-	if (copy_to_user(to, from + pos, count))
+	ret = copy_to_user(to, from + pos, count);
+	if (ret == count)
 		return -EFAULT;
+	count -= ret;
 	*ppos = pos + count;
 	return count;
 }
@@ -735,10 +739,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
 	if (copy_from_user(attr->set_buf, buf, size))
 		goto out;
 
-	ret = len; /* claim we got the whole input */
 	attr->set_buf[size] = '\0';
 	val = simple_strtol(attr->set_buf, NULL, 0);
-	attr->set(attr->data, val);
+	ret = attr->set(attr->data, val);
+	if (ret == 0)
+		ret = len; /* on success, claim we got the whole input */
 out:
 	mutex_unlock(&attr->mutex);
 	return ret;
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 0336f2beacd..b583ab0a4cb 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -8,7 +8,6 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
-#include <linux/utsname.h>
 #include <linux/nfs.h>
 
 #include <linux/sunrpc/xdr.h>
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index e1d52865319..ad9dbbc9145 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -9,7 +9,6 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
-#include <linux/utsname.h>
 #include <linux/nfs.h>
 
 #include <linux/sunrpc/xdr.h>
diff --git a/fs/namespace.c b/fs/namespace.c
index 7230787d18b..bdc3cb4fd22 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1640,7 +1640,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
 {
 	struct vfsmount *mnt;
 
-	if (!type || !memchr(type, 0, PAGE_SIZE))
+	if (!type)
 		return -EINVAL;
 
 	/* we need capabilities... */
@@ -1871,6 +1871,23 @@ int copy_mount_options(const void __user * data, unsigned long *where)
 	return 0;
 }
 
+int copy_mount_string(const void __user *data, char **where)
+{
+	char *tmp;
+
+	if (!data) {
+		*where = NULL;
+		return 0;
+	}
+
+	tmp = strndup_user(data, PAGE_SIZE);
+	if (IS_ERR(tmp))
+		return PTR_ERR(tmp);
+
+	*where = tmp;
+	return 0;
+}
+
 /*
  * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
  * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
@@ -1900,8 +1917,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 
 	if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
 		return -EINVAL;
-	if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
-		return -EINVAL;
 
 	if (data_page)
 		((char *)data_page)[PAGE_SIZE - 1] = 0;
@@ -2070,40 +2085,42 @@ EXPORT_SYMBOL(create_mnt_ns);
 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 		char __user *, type, unsigned long, flags, void __user *, data)
 {
-	int retval;
+	int ret;
+	char *kernel_type;
+	char *kernel_dir;
+	char *kernel_dev;
 	unsigned long data_page;
-	unsigned long type_page;
-	unsigned long dev_page;
-	char *dir_page;
 
-	retval = copy_mount_options(type, &type_page);
-	if (retval < 0)
-		return retval;
+	ret = copy_mount_string(type, &kernel_type);
+	if (ret < 0)
+		goto out_type;
 
-	dir_page = getname(dir_name);
-	retval = PTR_ERR(dir_page);
-	if (IS_ERR(dir_page))
-		goto out1;
+	kernel_dir = getname(dir_name);
+	if (IS_ERR(kernel_dir)) {
+		ret = PTR_ERR(kernel_dir);
+		goto out_dir;
+	}
 
-	retval = copy_mount_options(dev_name, &dev_page);
-	if (retval < 0)
-		goto out2;
+	ret = copy_mount_string(dev_name, &kernel_dev);
+	if (ret < 0)
+		goto out_dev;
 
-	retval = copy_mount_options(data, &data_page);
-	if (retval < 0)
-		goto out3;
+	ret = copy_mount_options(data, &data_page);
+	if (ret < 0)
+		goto out_data;
 
-	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
-			  flags, (void *)data_page);
-	free_page(data_page);
+	ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,
+		(void *) data_page);
 
-out3:
-	free_page(dev_page);
-out2:
-	putname(dir_page);
-out1:
-	free_page(type_page);
-	return retval;
+	free_page(data_page);
+out_data:
+	kfree(kernel_dev);
+out_dev:
+	putname(kernel_dir);
+out_dir:
+	kfree(kernel_type);
+out_type:
+	return ret;
 }
 
 /*
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index b99ce205b1b..cf98da1be23 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -746,16 +746,8 @@ static void ncp_put_super(struct super_block *sb)
 
 #ifdef CONFIG_NCPFS_NLS
 	/* unload the NLS charsets */
-	if (server->nls_vol)
-	{
-		unload_nls(server->nls_vol);
-		server->nls_vol = NULL;
-	}
-	if (server->nls_io)
-	{
-		unload_nls(server->nls_io);
-		server->nls_io = NULL;
-	}
+	unload_nls(server->nls_vol);
+	unload_nls(server->nls_io);
 #endif /* CONFIG_NCPFS_NLS */
 
 	if (server->info_filp)
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 53a7ed7eb9c..0d58caf4a6e 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -223,10 +223,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	oldset_io = server->nls_io;
 	server->nls_io = iocharset;
 
-	if (oldset_cp)
-		unload_nls(oldset_cp);
-	if (oldset_io)
-		unload_nls(oldset_io);
+	unload_nls(oldset_cp);
+	unload_nls(oldset_io);
 
 	return 0;
 }
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 152025358da..63976c0ccc2 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -648,8 +648,6 @@ static int nfs_start_lockd(struct nfs_server *server)
 		.hostname	= clp->cl_hostname,
 		.address	= (struct sockaddr *)&clp->cl_addr,
 		.addrlen	= clp->cl_addrlen,
-		.protocol	= server->flags & NFS_MOUNT_TCP ?
-						IPPROTO_TCP : IPPROTO_UDP,
 		.nfs_version	= clp->rpc_ops->version,
 		.noresvport	= server->flags & NFS_MOUNT_NORESVPORT ?
 					1 : 0,
@@ -660,6 +658,14 @@ static int nfs_start_lockd(struct nfs_server *server)
 	if (server->flags & NFS_MOUNT_NONLM)
 		return 0;
 
+	switch (clp->cl_proto) {
+		default:
+			nlm_init.protocol = IPPROTO_TCP;
+			break;
+		case XPRT_TRANSPORT_UDP:
+			nlm_init.protocol = IPPROTO_UDP;
+	}
+
 	host = nlmclnt_init(&nlm_init);
 	if (IS_ERR(host))
 		return PTR_ERR(host);
@@ -787,7 +793,7 @@ static int nfs_init_server(struct nfs_server *server,
 	dprintk("--> nfs_init_server()\n");
 
 #ifdef CONFIG_NFS_V3
-	if (data->flags & NFS_MOUNT_VER3)
+	if (data->version == 3)
 		cl_init.rpc_ops = &nfs_v3_clientops;
 #endif
 
@@ -964,6 +970,7 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
 	target->acdirmin = source->acdirmin;
 	target->acdirmax = source->acdirmax;
 	target->caps = source->caps;
+	target->options = source->options;
 }
 
 /*
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5021b75d2d1..86d6b4db109 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -525,6 +525,7 @@ const struct address_space_operations nfs_file_aops = {
 	.direct_IO = nfs_direct_IO,
 	.migratepage = nfs_migrate_page,
 	.launder_page = nfs_launder_page,
+	.error_remove_page = generic_error_remove_page,
 };
 
 /*
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 379be678cb7..70fad69eb95 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -58,17 +58,34 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp)
 /*
  * Get the cache cookie for an NFS superblock.  We have to handle
  * uniquification here because the cache doesn't do it for us.
+ *
+ * The default uniquifier is just an empty string, but it may be overridden
+ * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
+ * superblock across an automount point of some nature.
  */
-void nfs_fscache_get_super_cookie(struct super_block *sb,
-				  struct nfs_parsed_mount_data *data)
+void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq,
+				  struct nfs_clone_mount *mntdata)
 {
 	struct nfs_fscache_key *key, *xkey;
 	struct nfs_server *nfss = NFS_SB(sb);
 	struct rb_node **p, *parent;
-	const char *uniq = data->fscache_uniq ?: "";
 	int diff, ulen;
 
-	ulen = strlen(uniq);
+	if (uniq) {
+		ulen = strlen(uniq);
+	} else if (mntdata) {
+		struct nfs_server *mnt_s = NFS_SB(mntdata->sb);
+		if (mnt_s->fscache_key) {
+			uniq = mnt_s->fscache_key->key.uniquifier;
+			ulen = mnt_s->fscache_key->key.uniq_len;
+		}
+	}
+
+	if (!uniq) {
+		uniq = "";
+		ulen = 1;
+	}
+
 	key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
 	if (!key)
 		return;
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 6e809bb0ff0..b9c572d0679 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -74,7 +74,8 @@ extern void nfs_fscache_get_client_cookie(struct nfs_client *);
 extern void nfs_fscache_release_client_cookie(struct nfs_client *);
 
 extern void nfs_fscache_get_super_cookie(struct super_block *,
-					 struct nfs_parsed_mount_data *);
+					 const char *,
+					 struct nfs_clone_mount *);
 extern void nfs_fscache_release_super_cookie(struct super_block *);
 
 extern void nfs_fscache_init_inode_cookie(struct inode *);
@@ -173,7 +174,8 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
 
 static inline void nfs_fscache_get_super_cookie(
 	struct super_block *sb,
-	struct nfs_parsed_mount_data *data)
+	const char *uniq,
+	struct nfs_clone_mount *mntdata)
 {
 }
 static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 060022b4651..faa091865ad 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -458,49 +458,21 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
  */
 static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 {
-	if (i_size_read(inode) < offset) {
-		unsigned long limit;
-
-		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-		if (limit != RLIM_INFINITY && offset > limit)
-			goto out_sig;
-		if (offset > inode->i_sb->s_maxbytes)
-			goto out_big;
-		spin_lock(&inode->i_lock);
-		i_size_write(inode, offset);
-		spin_unlock(&inode->i_lock);
-	} else {
-		struct address_space *mapping = inode->i_mapping;
+	loff_t oldsize;
+	int err;
 
-		/*
-		 * truncation of in-use swapfiles is disallowed - it would
-		 * cause subsequent swapout to scribble on the now-freed
-		 * blocks.
-		 */
-		if (IS_SWAPFILE(inode))
-			return -ETXTBSY;
-		spin_lock(&inode->i_lock);
-		i_size_write(inode, offset);
-		spin_unlock(&inode->i_lock);
+	err = inode_newsize_ok(inode, offset);
+	if (err)
+		goto out;
 
-		/*
-		 * unmap_mapping_range is called twice, first simply for
-		 * efficiency so that truncate_inode_pages does fewer
-		 * single-page unmaps.  However after this first call, and
-		 * before truncate_inode_pages finishes, it is possible for
-		 * private pages to be COWed, which remain after
-		 * truncate_inode_pages finishes, hence the second
-		 * unmap_mapping_range call must be made for correctness.
-		 */
-		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-		truncate_inode_pages(mapping, offset);
-		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-	}
-	return 0;
-out_sig:
-	send_sig(SIGXFSZ, current, 0);
-out_big:
-	return -EFBIG;
+	spin_lock(&inode->i_lock);
+	oldsize = inode->i_size;
+	i_size_write(inode, offset);
+	spin_unlock(&inode->i_lock);
+
+	truncate_pagecache(inode, oldsize, offset);
+out:
+	return err;
 }
 
 /**
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index c862c9340f9..5e078b222b4 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -13,7 +13,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ee6a13f0544..3f8881d1a05 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -7,7 +7,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 35869a4921f..5fe5492fbd2 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -10,7 +10,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index be6544aef41..ed7c269e251 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -36,7 +36,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/string.h>
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index cfc30d362f9..83ad47cbdd8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -39,7 +39,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 7be72d90d49..ef583854d8d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -32,7 +32,6 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1cc0587cfe..810770f9681 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -728,6 +728,27 @@ static void nfs_umount_begin(struct super_block *sb)
 	unlock_kernel();
 }
 
+static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags)
+{
+	struct nfs_parsed_mount_data *data;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data) {
+		data->flags		= flags;
+		data->rsize		= NFS_MAX_FILE_IO_SIZE;
+		data->wsize		= NFS_MAX_FILE_IO_SIZE;
+		data->acregmin		= NFS_DEF_ACREGMIN;
+		data->acregmax		= NFS_DEF_ACREGMAX;
+		data->acdirmin		= NFS_DEF_ACDIRMIN;
+		data->acdirmax		= NFS_DEF_ACDIRMAX;
+		data->nfs_server.port	= NFS_UNSPEC_PORT;
+		data->auth_flavors[0]	= RPC_AUTH_UNIX;
+		data->auth_flavor_len	= 1;
+		data->minorversion	= 0;
+	}
+	return data;
+}
+
 /*
  * Sanity-check a server address provided by the mount command.
  *
@@ -1430,10 +1451,13 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	int status;
 
 	if (args->mount_server.version == 0) {
-		if (args->flags & NFS_MOUNT_VER3)
-			args->mount_server.version = NFS_MNT3_VERSION;
-		else
-			args->mount_server.version = NFS_MNT_VERSION;
+		switch (args->version) {
+			default:
+				args->mount_server.version = NFS_MNT3_VERSION;
+				break;
+			case 2:
+				args->mount_server.version = NFS_MNT_VERSION;
+		}
 	}
 	request.version = args->mount_server.version;
 
@@ -1634,20 +1658,6 @@ static int nfs_validate_mount_data(void *options,
 	if (data == NULL)
 		goto out_no_data;
 
-	args->flags		= (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
-	args->rsize		= NFS_MAX_FILE_IO_SIZE;
-	args->wsize		= NFS_MAX_FILE_IO_SIZE;
-	args->acregmin		= NFS_DEF_ACREGMIN;
-	args->acregmax		= NFS_DEF_ACREGMAX;
-	args->acdirmin		= NFS_DEF_ACDIRMIN;
-	args->acdirmax		= NFS_DEF_ACDIRMAX;
-	args->mount_server.port	= NFS_UNSPEC_PORT;
-	args->nfs_server.port	= NFS_UNSPEC_PORT;
-	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-	args->auth_flavors[0]	= RPC_AUTH_UNIX;
-	args->auth_flavor_len	= 1;
-	args->minorversion	= 0;
-
 	switch (data->version) {
 	case 1:
 		data->namlen = 0;
@@ -1778,7 +1788,7 @@ static int nfs_validate_mount_data(void *options,
 	}
 
 #ifndef CONFIG_NFS_V3
-	if (args->flags & NFS_MOUNT_VER3)
+	if (args->version == 3)
 		goto out_v3_not_compiled;
 #endif /* !CONFIG_NFS_V3 */
 
@@ -1936,7 +1946,7 @@ static void nfs_fill_super(struct super_block *sb,
 	if (data->bsize)
 		sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
 
-	if (server->flags & NFS_MOUNT_VER3) {
+	if (server->nfs_client->rpc_ops->version == 3) {
 		/* The VFS shouldn't apply the umask to mode bits. We will do
 		 * so ourselves when necessary.
 		 */
@@ -1960,7 +1970,7 @@ static void nfs_clone_super(struct super_block *sb,
 	sb->s_blocksize = old_sb->s_blocksize;
 	sb->s_maxbytes = old_sb->s_maxbytes;
 
-	if (server->flags & NFS_MOUNT_VER3) {
+	if (server->nfs_client->rpc_ops->version == 3) {
 		/* The VFS shouldn't apply the umask to mode bits. We will do
 		 * so ourselves when necessary.
 		 */
@@ -2094,7 +2104,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	};
 	int error = -ENOMEM;
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
 	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
@@ -2144,7 +2154,8 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs_fill_super(s, data);
-		nfs_fscache_get_super_cookie(s, data);
+		nfs_fscache_get_super_cookie(
+			s, data ? data->fscache_uniq : NULL, NULL);
 	}
 
 	mntroot = nfs_get_root(s, mntfh);
@@ -2245,6 +2256,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs_clone_super(s, data->sb);
+		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
 	mntroot = nfs_get_root(s, data->fh);
@@ -2362,18 +2374,7 @@ static int nfs4_validate_mount_data(void *options,
 	if (data == NULL)
 		goto out_no_data;
 
-	args->rsize		= NFS_MAX_FILE_IO_SIZE;
-	args->wsize		= NFS_MAX_FILE_IO_SIZE;
-	args->acregmin		= NFS_DEF_ACREGMIN;
-	args->acregmax		= NFS_DEF_ACREGMAX;
-	args->acdirmin		= NFS_DEF_ACDIRMIN;
-	args->acdirmax		= NFS_DEF_ACDIRMAX;
-	args->nfs_server.port	= NFS_UNSPEC_PORT;
-	args->auth_flavors[0]	= RPC_AUTH_UNIX;
-	args->auth_flavor_len	= 1;
 	args->version		= 4;
-	args->minorversion	= 0;
-
 	switch (data->version) {
 	case 1:
 		if (data->host_addrlen > sizeof(args->nfs_server.address))
@@ -2508,7 +2509,8 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs4_fill_super(s);
-		nfs_fscache_get_super_cookie(s, data);
+		nfs_fscache_get_super_cookie(
+			s, data ? data->fscache_uniq : NULL, NULL);
 	}
 
 	mntroot = nfs4_get_root(s, mntfh);
@@ -2656,7 +2658,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	struct nfs_parsed_mount_data *data;
 	int error = -ENOMEM;
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = nfs_alloc_parsed_mount_data(0);
 	if (data == NULL)
 		goto out_free_data;
 
@@ -2741,6 +2743,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs4_clone_super(s, data->sb);
+		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
 	mntroot = nfs4_get_root(s, data->fh);
@@ -2822,6 +2825,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs4_fill_super(s);
+		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
 	mntroot = nfs4_get_root(s, &mntfh);
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index cdfa86fa147..ba2c199592f 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -38,7 +38,6 @@
 #include <linux/init.h>
 
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 477d37d83b3..2224b4d07bf 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -270,7 +270,8 @@ struct nls_table *load_nls(char *charset)
 
 void unload_nls(struct nls_table *nls)
 {
-	module_put(nls->owner);
+	if (nls)
+		module_put(nls->owner);
 }
 
 static const wchar_t charset2uni[256] = {
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index b38f944f066..cfce53cb65d 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1550,6 +1550,7 @@ const struct address_space_operations ntfs_aops = {
 	.migratepage	= buffer_migrate_page,	/* Move a page cache page from
 						   one physical page to an
 						   other. */
+	.error_remove_page = generic_error_remove_page,
 };
 
 /**
@@ -1569,6 +1570,7 @@ const struct address_space_operations ntfs_mst_aops = {
 	.migratepage	= buffer_migrate_page,	/* Move a page cache page from
 						   one physical page to an
 						   other. */
+	.error_remove_page = generic_error_remove_page,
 };
 
 #ifdef NTFS_RW
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index abaaa1cbf8d..80b04770e8e 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -201,8 +201,7 @@ use_utf8:
 						v, old_nls->charset);
 				nls_map = old_nls;
 			} else /* nls_map */ {
-				if (old_nls)
-					unload_nls(old_nls);
+				unload_nls(old_nls);
 			}
 		} else if (!strcmp(p, "utf8")) {
 			bool val = false;
@@ -2427,10 +2426,9 @@ static void ntfs_put_super(struct super_block *sb)
 		ntfs_free(vol->upcase);
 		vol->upcase = NULL;
 	}
-	if (vol->nls_map) {
-		unload_nls(vol->nls_map);
-		vol->nls_map = NULL;
-	}
+
+	unload_nls(vol->nls_map);
+
 	sb->s_fs_info = NULL;
 	kfree(vol);
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 72e76062a90..deb2b132ae5 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2022,4 +2022,5 @@ const struct address_space_operations ocfs2_aops = {
 	.releasepage		= ocfs2_releasepage,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate	= block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 81eff8e5832..01cf8cc3d28 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 75997b4deaf..ca96bce50e1 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index c5c88124096..ca46002ec10 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -27,7 +27,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/sysctl.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 4d9e6b288dd..0334000676d 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -28,7 +28,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/delay.h>
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 83a9f2972ac..437698e9465 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f8b653fcd4d..83bcaf266b3 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 43e6e328056..d9fa3d22e17 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 98569e86c61..52ec020ea78 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 756f5b0998e..00f53b2aea7 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/random.h>
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 24feb449a1d..4cc3c890a2c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -28,7 +28,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/random.h>
 #include <linux/statfs.h>
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 579dd1b1110..e3421030a69 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -38,7 +38,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/utsname.h>
 #include <linux/namei.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 171e052c07b..c7bff4f603f 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -97,7 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		"Committed_AS:   %8lu kB\n"
 		"VmallocTotal:   %8lu kB\n"
 		"VmallocUsed:    %8lu kB\n"
-		"VmallocChunk:   %8lu kB\n",
+		"VmallocChunk:   %8lu kB\n"
+#ifdef CONFIG_MEMORY_FAILURE
+		"HardwareCorrupted: %8lu kB\n"
+#endif
+		,
 		K(i.totalram),
 		K(i.freeram),
 		K(i.bufferram),
@@ -144,6 +148,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		(unsigned long)VMALLOC_TOTAL >> 10,
 		vmi.used >> 10,
 		vmi.largest_chunk >> 10
+#ifdef CONFIG_MEMORY_FAILURE
+		,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+#endif
 		);
 
 	hugetlb_report_meminfo(m);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 9b1e4e9a16b..f667e8aeabd 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -153,7 +153,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 
 	/* careful: calling conventions are nasty here */
 	res = count;
-	error = table->proc_handler(table, write, filp, buf, &res, ppos);
+	error = table->proc_handler(table, write, buf, &res, ppos);
 	if (!error)
 		error = res;
 out:
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 0c10a0b3f14..766b1d45605 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -4,13 +4,18 @@
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/time.h>
+#include <linux/kernel_stat.h>
 #include <asm/cputime.h>
 
 static int uptime_proc_show(struct seq_file *m, void *v)
 {
 	struct timespec uptime;
 	struct timespec idle;
-	cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
+	int i;
+	cputime_t idletime = cputime_zero;
+
+	for_each_possible_cpu(i)
+		idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle);
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	monotonic_to_bootbased(&uptime);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 11f0c06316d..32fae4040eb 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -69,14 +69,11 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 	/* make various checks */
 	order = get_order(newsize);
 	if (unlikely(order >= MAX_ORDER))
-		goto too_big;
+		return -EFBIG;
 
-	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-	if (limit != RLIM_INFINITY && newsize > limit)
-		goto fsize_exceeded;
-
-	if (newsize > inode->i_sb->s_maxbytes)
-		goto too_big;
+	ret = inode_newsize_ok(inode, newsize);
+	if (ret)
+		return ret;
 
 	i_size_write(inode, newsize);
 
@@ -118,12 +115,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 
 	return 0;
 
- fsize_exceeded:
-	send_sig(SIGXFSZ, current, 0);
- too_big:
-	return -EFBIG;
-
- add_error:
+add_error:
 	while (loop < npages)
 		__free_page(pages + loop++);
 	return ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index 6c8c55dec2b..3ac28987f22 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -839,9 +839,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
 
 	pos = *ppos;
-	retval = -EINVAL;
-	if (unlikely(pos < 0))
-		goto fput_out;
 	if (unlikely(pos + count > max)) {
 		retval = -EOVERFLOW;
 		if (pos >= max)
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 47f132df0c3..c117fa80d1e 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -528,7 +528,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
 	pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
 
 	root = romfs_iget(sb, pos);
-	if (!root)
+	if (IS_ERR(root))
 		goto error;
 
 	sb->s_root = d_alloc_root(root);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 6c959275f2d..eae7d9dbf3f 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -429,20 +429,21 @@ EXPORT_SYMBOL(mangle_path);
  */
 int seq_path(struct seq_file *m, struct path *path, char *esc)
 {
-	if (m->count < m->size) {
-		char *s = m->buf + m->count;
-		char *p = d_path(path, s, m->size - m->count);
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int res = -1;
+
+	if (size) {
+		char *p = d_path(path, buf, size);
 		if (!IS_ERR(p)) {
-			s = mangle_path(s, p, esc);
-			if (s) {
-				p = m->buf + m->count;
-				m->count = s - m->buf;
-				return s - p;
-			}
+			char *end = mangle_path(buf, p, esc);
+			if (end)
+				res = end - buf;
 		}
 	}
-	m->count = m->size;
-	return -1;
+	seq_commit(m, res);
+
+	return res;
 }
 EXPORT_SYMBOL(seq_path);
 
@@ -454,26 +455,28 @@ EXPORT_SYMBOL(seq_path);
 int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
 		  char *esc)
 {
-	int err = -ENAMETOOLONG;
-	if (m->count < m->size) {
-		char *s = m->buf + m->count;
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int res = -ENAMETOOLONG;
+
+	if (size) {
 		char *p;
 
 		spin_lock(&dcache_lock);
-		p = __d_path(path, root, s, m->size - m->count);
+		p = __d_path(path, root, buf, size);
 		spin_unlock(&dcache_lock);
-		err = PTR_ERR(p);
+		res = PTR_ERR(p);
 		if (!IS_ERR(p)) {
-			s = mangle_path(s, p, esc);
-			if (s) {
-				p = m->buf + m->count;
-				m->count = s - m->buf;
-				return 0;
-			}
+			char *end = mangle_path(buf, p, esc);
+			if (end)
+				res = end - buf;
+			else
+				res = -ENAMETOOLONG;
 		}
 	}
-	m->count = m->size;
-	return err;
+	seq_commit(m, res);
+
+	return res < 0 ? res : 0;
 }
 
 /*
@@ -481,20 +484,21 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
  */
 int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
 {
-	if (m->count < m->size) {
-		char *s = m->buf + m->count;
-		char *p = dentry_path(dentry, s, m->size - m->count);
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int res = -1;
+
+	if (size) {
+		char *p = dentry_path(dentry, buf, size);
 		if (!IS_ERR(p)) {
-			s = mangle_path(s, p, esc);
-			if (s) {
-				p = m->buf + m->count;
-				m->count = s - m->buf;
-				return s - p;
-			}
+			char *end = mangle_path(buf, p, esc);
+			if (end)
+				res = end - buf;
 		}
 	}
-	m->count = m->size;
-	return -1;
+	seq_commit(m, res);
+
+	return res;
 }
 
 int seq_bitmap(struct seq_file *m, const unsigned long *bits,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1402d2d54f5..1c4c8f08997 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -459,14 +459,8 @@ smb_show_options(struct seq_file *s, struct vfsmount *m)
 static void
 smb_unload_nls(struct smb_sb_info *server)
 {
-	if (server->remote_nls) {
-		unload_nls(server->remote_nls);
-		server->remote_nls = NULL;
-	}
-	if (server->local_nls) {
-		unload_nls(server->local_nls);
-		server->local_nls = NULL;
-	}
+	unload_nls(server->remote_nls);
+	unload_nls(server->local_nls);
 }
 
 static void
diff --git a/fs/super.c b/fs/super.c
index 0e7207b9815..19eb70b374b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -465,6 +465,48 @@ rescan:
 }
 
 EXPORT_SYMBOL(get_super);
+
+/**
+ * get_active_super - get an active reference to the superblock of a device
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device given.  Returns the superblock with an active
+ * reference and s_umount held exclusively or %NULL if none was found.
+ */
+struct super_block *get_active_super(struct block_device *bdev)
+{
+	struct super_block *sb;
+
+	if (!bdev)
+		return NULL;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (sb->s_bdev != bdev)
+			continue;
+
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_write(&sb->s_umount);
+		if (sb->s_root) {
+			spin_lock(&sb_lock);
+			if (sb->s_count > S_BIAS) {
+				atomic_inc(&sb->s_active);
+				sb->s_count--;
+				spin_unlock(&sb_lock);
+				return sb;
+			}
+			spin_unlock(&sb_lock);
+		}
+		up_write(&sb->s_umount);
+		put_super(sb);
+		yield();
+		spin_lock(&sb_lock);
+	}
+	spin_unlock(&sb_lock);
+	return NULL;
+}
  
 struct super_block * user_get_super(dev_t dev)
 {
@@ -527,11 +569,15 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
 	int retval;
 	int remount_rw;
-	
+
+	if (sb->s_frozen != SB_UNFROZEN)
+		return -EBUSY;
+
 #ifdef CONFIG_BLOCK
 	if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
 		return -EACCES;
 #endif
+
 	if (flags & MS_RDONLY)
 		acct_auto_close(sb);
 	shrink_dcache_sb(sb);
@@ -743,9 +789,14 @@ int get_sb_bdev(struct file_system_type *fs_type,
 	 * will protect the lockfs code from trying to start a snapshot
 	 * while we are mounting
 	 */
-	down(&bdev->bd_mount_sem);
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (bdev->bd_fsfreeze_count > 0) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		error = -EBUSY;
+		goto error_bdev;
+	}
 	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
-	up(&bdev->bd_mount_sem);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	if (IS_ERR(s))
 		goto error_s;
 
@@ -892,6 +943,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
  	if (error)
  		goto out_sb;
 
+	/*
+	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
+	 * but s_maxbytes was an unsigned long long for many releases. Throw
+	 * this warning for a little while to try and catch filesystems that
+	 * violate this rule. This warning should be either removed or
+	 * converted to a BUG() in 2.6.34.
+	 */
+	WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
+		"negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);
+
 	mnt->mnt_mountpoint = mnt->mnt_root;
 	mnt->mnt_parent = mnt;
 	up_write(&mnt->mnt_sb->s_umount);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d5e5559e31d..381854461b2 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1635,4 +1635,5 @@ const struct address_space_operations xfs_address_space_operations = {
 	.direct_IO		= xfs_vm_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
 };
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 916c0ffb608..c5bc67c4e3b 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -26,7 +26,6 @@ STATIC int
 xfs_stats_clear_proc_handler(
 	ctl_table	*ctl,
 	int		write,
-	struct file	*filp,
 	void		__user *buffer,
 	size_t		*lenp,
 	loff_t		*ppos)
@@ -34,7 +33,7 @@ xfs_stats_clear_proc_handler(
 	int		c, ret, *valp = ctl->data;
 	__uint32_t	vn_active;
 
-	ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
 
 	if (!ret && write && *valp) {
 		printk("XFS Clearing xfsstats\n");
diff --git a/include/acpi/button.h b/include/acpi/button.h
new file mode 100644
index 00000000000..97eea0e4c01
--- /dev/null
+++ b/include/acpi/button.h
@@ -0,0 +1,25 @@
+#ifndef ACPI_BUTTON_H
+#define ACPI_BUTTON_H
+
+#include <linux/notifier.h>
+
+#if defined(CONFIG_ACPI_BUTTON) || defined(CONFIG_ACPI_BUTTON_MODULE)
+extern int acpi_lid_notifier_register(struct notifier_block *nb);
+extern int acpi_lid_notifier_unregister(struct notifier_block *nb);
+extern int acpi_lid_open(void);
+#else
+static inline int acpi_lid_notifier_register(struct notifier_block *nb)
+{
+	return 0;
+}
+static inline int acpi_lid_notifier_unregister(struct notifier_block *nb)
+{
+	return 0;
+}
+static inline int acpi_lid_open(void)
+{
+	return 1;
+}
+#endif /* defined(CONFIG_ACPI_BUTTON) || defined(CONFIG_ACPI_BUTTON_MODULE) */
+
+#endif /* ACPI_BUTTON_H */
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index 4d3e48373e7..0c3dd860392 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -73,6 +73,19 @@
 #define F_SETSIG	10	/* for sockets. */
 #define F_GETSIG	11	/* for sockets. */
 #endif
+#ifndef F_SETOWN_EX
+#define F_SETOWN_EX	12
+#define F_GETOWN_EX	13
+#endif
+
+#define F_OWNER_TID	0
+#define F_OWNER_PID	1
+#define F_OWNER_GID	2
+
+struct f_owner_ex {
+	int	type;
+	pid_t	pid;
+};
 
 /* for F_[GET|SET]FL */
 #define FD_CLOEXEC	1	/* actually anything with low bit set goes */
diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h
index dd63bd38864..5ee13b2fd22 100644
--- a/include/asm-generic/mman-common.h
+++ b/include/asm-generic/mman-common.h
@@ -34,6 +34,7 @@
 #define MADV_REMOVE	9		/* remove these pages & resources */
 #define MADV_DONTFORK	10		/* don't inherit across fork */
 #define MADV_DOFORK	11		/* do inherit across fork */
+#define MADV_HWPOISON	100		/* poison a page for testing */
 
 #define MADV_MERGEABLE   12		/* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 13		/* KSM may not merge identical pages */
diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index c840719a8c5..942d30b5aab 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -82,6 +82,7 @@ typedef struct siginfo {
 #ifdef __ARCH_SI_TRAPNO
 			int _trapno;	/* TRAP # which caused the signal */
 #endif
+			short _addr_lsb; /* LSB of the reported address */
 		} _sigfault;
 
 		/* SIGPOLL */
@@ -112,6 +113,7 @@ typedef struct siginfo {
 #ifdef __ARCH_SI_TRAPNO
 #define si_trapno	_sifields._sigfault._trapno
 #endif
+#define si_addr_lsb	_sifields._sigfault._addr_lsb
 #define si_band		_sifields._sigpoll._band
 #define si_fd		_sifields._sigpoll._fd
 
@@ -192,7 +194,11 @@ typedef struct siginfo {
 #define BUS_ADRALN	(__SI_FAULT|1)	/* invalid address alignment */
 #define BUS_ADRERR	(__SI_FAULT|2)	/* non-existant physical address */
 #define BUS_OBJERR	(__SI_FAULT|3)	/* object specific hardware error */
-#define NSIGBUS		3
+/* hardware memory error consumed on a machine check: action required */
+#define BUS_MCEERR_AR	(__SI_FAULT|4)
+/* hardware memory error detected in process but not consumed: action optional*/
+#define BUS_MCEERR_AO	(__SI_FAULT|5)
+#define NSIGBUS		5
 
 /*
  * SIGTRAP si_codes
diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h
index 88bada2ebc4..510df36dd5d 100644
--- a/include/asm-generic/topology.h
+++ b/include/asm-generic/topology.h
@@ -37,9 +37,6 @@
 #ifndef parent_node
 #define parent_node(node)	((void)(node),0)
 #endif
-#ifndef node_to_cpumask
-#define node_to_cpumask(node)	((void)node, cpu_online_map)
-#endif
 #ifndef cpumask_of_node
 #define cpumask_of_node(node)	((void)node, cpu_online_mask)
 #endif
@@ -55,18 +52,4 @@
 
 #endif	/* CONFIG_NUMA */
 
-/*
- * returns pointer to cpumask for specified node
- * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
- */
-#ifndef node_to_cpumask_ptr
-
-#define	node_to_cpumask_ptr(v, node) 					\
-		cpumask_t _##v = node_to_cpumask(node);			\
-		const cpumask_t *v = &_##v
-
-#define node_to_cpumask_ptr_next(v, node)				\
-			  _##v = node_to_cpumask(node)
-#endif
-
 #endif /* _ASM_GENERIC_TOPOLOGY_H */
diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h
index 853508499d2..3f6e545609b 100644
--- a/include/drm/drm_pciids.h
+++ b/include/drm/drm_pciids.h
@@ -552,6 +552,7 @@
 	{0x8086, 0x2e12, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \
 	{0x8086, 0x2e22, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \
 	{0x8086, 0x2e32, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \
+	{0x8086, 0x2e42, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \
 	{0x8086, 0xa001, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \
 	{0x8086, 0xa011, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \
 	{0x8086, 0x35e8, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \
diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
index 8e1e92583fb..7e0cb1da92e 100644
--- a/include/drm/i915_drm.h
+++ b/include/drm/i915_drm.h
@@ -185,6 +185,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_GEM_GET_APERTURE 0x23
 #define DRM_I915_GEM_MMAP_GTT	0x24
 #define DRM_I915_GET_PIPE_FROM_CRTC_ID	0x25
+#define DRM_I915_GEM_MADVISE	0x26
 
 #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
 #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
@@ -221,6 +222,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GEM_GET_TILING	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_GET_TILING, struct drm_i915_gem_get_tiling)
 #define DRM_IOCTL_I915_GEM_GET_APERTURE	DRM_IOR  (DRM_COMMAND_BASE + DRM_I915_GEM_GET_APERTURE, struct drm_i915_gem_get_aperture)
 #define DRM_IOCTL_I915_GET_PIPE_FROM_CRTC_ID DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GET_PIPE_FROM_CRTC_ID, struct drm_intel_get_pipe_from_crtc_id)
+#define DRM_IOCTL_I915_GEM_MADVISE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MADVISE, struct drm_i915_gem_madvise)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -667,4 +669,21 @@ struct drm_i915_get_pipe_from_crtc_id {
 	__u32 pipe;
 };
 
+#define I915_MADV_WILLNEED 0
+#define I915_MADV_DONTNEED 1
+#define __I915_MADV_PURGED 2 /* internal state */
+
+struct drm_i915_gem_madvise {
+	/** Handle of the buffer to change the backing store advice */
+	__u32 handle;
+
+	/* Advice: either the buffer will be needed again in the near future,
+	 *         or wont be and could be discarded under memory pressure.
+	 */
+	__u32 madv;
+
+	/** Whether the backing store still exists. */
+	__u32 retained;
+};
+
 #endif				/* _I915_DRM_H_ */
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 5fc2ef8d97f..a1c486a88e8 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -58,25 +58,60 @@ struct dma_chan_ref {
  * array.
  * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
  * dependency chain
- * @ASYNC_TX_DEP_ACK: ack the dependency descriptor.  Useful for chaining.
+ * @ASYNC_TX_FENCE: specify that the next operation in the dependency
+ * chain uses this operation's result as an input
  */
 enum async_tx_flags {
 	ASYNC_TX_XOR_ZERO_DST	 = (1 << 0),
 	ASYNC_TX_XOR_DROP_DST	 = (1 << 1),
-	ASYNC_TX_ACK		 = (1 << 3),
-	ASYNC_TX_DEP_ACK	 = (1 << 4),
+	ASYNC_TX_ACK		 = (1 << 2),
+	ASYNC_TX_FENCE		 = (1 << 3),
+};
+
+/**
+ * struct async_submit_ctl - async_tx submission/completion modifiers
+ * @flags: submission modifiers
+ * @depend_tx: parent dependency of the current operation being submitted
+ * @cb_fn: callback routine to run at operation completion
+ * @cb_param: parameter for the callback routine
+ * @scribble: caller provided space for dma/page address conversions
+ */
+struct async_submit_ctl {
+	enum async_tx_flags flags;
+	struct dma_async_tx_descriptor *depend_tx;
+	dma_async_tx_callback cb_fn;
+	void *cb_param;
+	void *scribble;
 };
 
 #ifdef CONFIG_DMA_ENGINE
 #define async_tx_issue_pending_all dma_issue_pending_all
+
+/**
+ * async_tx_issue_pending - send pending descriptor to the hardware channel
+ * @tx: descriptor handle to retrieve hardware context
+ *
+ * Note: any dependent operations will have already been issued by
+ * async_tx_channel_switch, or (in the case of no channel switch) will
+ * be already pending on this channel.
+ */
+static inline void async_tx_issue_pending(struct dma_async_tx_descriptor *tx)
+{
+	if (likely(tx)) {
+		struct dma_chan *chan = tx->chan;
+		struct dma_device *dma = chan->device;
+
+		dma->device_issue_pending(chan);
+	}
+}
 #ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL
 #include <asm/async_tx.h>
 #else
 #define async_tx_find_channel(dep, type, dst, dst_count, src, src_count, len) \
 	 __async_tx_find_channel(dep, type)
 struct dma_chan *
-__async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
-	enum dma_transaction_type tx_type);
+__async_tx_find_channel(struct async_submit_ctl *submit,
+			enum dma_transaction_type tx_type);
 #endif /* CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL */
 #else
 static inline void async_tx_issue_pending_all(void)
@@ -84,10 +119,16 @@ static inline void async_tx_issue_pending_all(void)
 	do { } while (0);
 }
 
+static inline void async_tx_issue_pending(struct dma_async_tx_descriptor *tx)
+{
+	do { } while (0);
+}
+
 static inline struct dma_chan *
-async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
-	enum dma_transaction_type tx_type, struct page **dst, int dst_count,
-	struct page **src, int src_count, size_t len)
+async_tx_find_channel(struct async_submit_ctl *submit,
+		      enum dma_transaction_type tx_type, struct page **dst,
+		      int dst_count, struct page **src, int src_count,
+		      size_t len)
 {
 	return NULL;
 }
@@ -99,46 +140,70 @@ async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
  * @cb_fn_param: parameter to pass to the callback routine
  */
 static inline void
-async_tx_sync_epilog(dma_async_tx_callback cb_fn, void *cb_fn_param)
+async_tx_sync_epilog(struct async_submit_ctl *submit)
 {
-	if (cb_fn)
-		cb_fn(cb_fn_param);
+	if (submit->cb_fn)
+		submit->cb_fn(submit->cb_param);
 }
 
-void
-async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
-	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_fn_param);
+typedef union {
+	unsigned long addr;
+	struct page *page;
+	dma_addr_t dma;
+} addr_conv_t;
+
+static inline void
+init_async_submit(struct async_submit_ctl *args, enum async_tx_flags flags,
+		  struct dma_async_tx_descriptor *tx,
+		  dma_async_tx_callback cb_fn, void *cb_param,
+		  addr_conv_t *scribble)
+{
+	args->flags = flags;
+	args->depend_tx = tx;
+	args->cb_fn = cb_fn;
+	args->cb_param = cb_param;
+	args->scribble = scribble;
+}
+
+void async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
+		     struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
 async_xor(struct page *dest, struct page **src_list, unsigned int offset,
-	int src_cnt, size_t len, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_fn_param);
+	  int src_cnt, size_t len, struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
-async_xor_zero_sum(struct page *dest, struct page **src_list,
-	unsigned int offset, int src_cnt, size_t len,
-	u32 *result, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_fn_param);
+async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
+	      int src_cnt, size_t len, enum sum_check_flags *result,
+	      struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
 async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
-	unsigned int src_offset, size_t len, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_fn_param);
+	     unsigned int src_offset, size_t len,
+	     struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
 async_memset(struct page *dest, int val, unsigned int offset,
-	size_t len, enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_fn_param);
+	     size_t len, struct async_submit_ctl *submit);
+
+struct dma_async_tx_descriptor *async_trigger_callback(struct async_submit_ctl *submit);
+
+struct dma_async_tx_descriptor *
+async_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt,
+		   size_t len, struct async_submit_ctl *submit);
+
+struct dma_async_tx_descriptor *
+async_syndrome_val(struct page **blocks, unsigned int offset, int src_cnt,
+		   size_t len, enum sum_check_flags *pqres, struct page *spare,
+		   struct async_submit_ctl *submit);
+
+struct dma_async_tx_descriptor *
+async_raid6_2data_recov(int src_num, size_t bytes, int faila, int failb,
+			struct page **ptrs, struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
-async_trigger_callback(enum async_tx_flags flags,
-	struct dma_async_tx_descriptor *depend_tx,
-	dma_async_tx_callback cb_fn, void *cb_fn_param);
+async_raid6_datap_recov(int src_num, size_t bytes, int faila,
+			struct page **ptrs, struct async_submit_ctl *submit);
 
 void async_tx_quiesce(struct dma_async_tx_descriptor **tx);
 #endif /* _ASYNC_TX_H_ */
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 2046b5b8af4..aece486ac73 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -120,7 +120,7 @@ extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm);
 extern int prepare_bprm_creds(struct linux_binprm *bprm);
 extern void install_exec_creds(struct linux_binprm *bprm);
 extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
-extern int set_binfmt(struct linux_binfmt *new);
+extern void set_binfmt(struct linux_binfmt *new);
 extern void free_bprm(struct linux_binprm *);
 
 #endif /* __KERNEL__ */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 90bba9e6228..b62bb9294d0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -141,6 +141,38 @@ enum {
 	CGRP_WAIT_ON_RMDIR,
 };
 
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+	CGROUP_FILE_PROCS,
+	CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+	/*
+	 * used to find which pidlist is wanted. doesn't change as long as
+	 * this particular list stays in the list.
+	 */
+	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+	/* array of xids */
+	pid_t *list;
+	/* how many elements the above list has */
+	int length;
+	/* how many files are using the current array */
+	int use_count;
+	/* each of these stored in a list by its cgroup */
+	struct list_head links;
+	/* pointer to the cgroup we belong to, for list removal purposes */
+	struct cgroup *owner;
+	/* protects the other fields */
+	struct rw_semaphore mutex;
+};
+
 struct cgroup {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 
@@ -179,11 +211,12 @@ struct cgroup {
 	 */
 	struct list_head release_list;
 
-	/* pids_mutex protects pids_list and cached pid arrays. */
-	struct rw_semaphore pids_mutex;
-
-	/* Linked list of struct cgroup_pids */
-	struct list_head pids_list;
+	/*
+	 * list of pidlists, up to two for each namespace (one for procs, one
+	 * for tasks); created on demand.
+	 */
+	struct list_head pidlists;
+	struct mutex pidlist_mutex;
 
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
@@ -227,6 +260,9 @@ struct css_set {
 	 * during subsystem registration (at boot time).
 	 */
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+	/* For RCU-protected deletion */
+	struct rcu_head rcu_head;
 };
 
 /*
@@ -389,10 +425,11 @@ struct cgroup_subsys {
 						  struct cgroup *cgrp);
 	int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 	void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
-	int (*can_attach)(struct cgroup_subsys *ss,
-			  struct cgroup *cgrp, struct task_struct *tsk);
+	int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
+			  struct task_struct *tsk, bool threadgroup);
 	void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			struct cgroup *old_cgrp, struct task_struct *tsk);
+			struct cgroup *old_cgrp, struct task_struct *tsk,
+			bool threadgroup);
 	void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
 	void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
 	int (*populate)(struct cgroup_subsys *ss,
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 7f627775c94..ddb7a97c78c 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -27,8 +27,8 @@
  *
  * configfs Copyright (C) 2005 Oracle.  All rights reserved.
  *
- * Please read Documentation/filesystems/configfs.txt before using the
- * configfs interface, ESPECIALLY the parts about reference counts and
+ * Please read Documentation/filesystems/configfs/configfs.txt before using
+ * the configfs interface, ESPECIALLY the parts about reference counts and
  * item destructors.
  */
 
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 9b1d458aac6..789cf5f920c 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -3,444 +3,37 @@
 
 /*
  * Cpumasks provide a bitmap suitable for representing the
- * set of CPU's in a system, one bit position per CPU number.
- *
- * The new cpumask_ ops take a "struct cpumask *"; the old ones
- * use cpumask_t.
- *
- * See detailed comments in the file linux/bitmap.h describing the
- * data type on which these cpumasks are based.
- *
- * For details of cpumask_scnprintf() and cpumask_parse_user(),
- * see bitmap_scnprintf() and bitmap_parse_user() in lib/bitmap.c.
- * For details of cpulist_scnprintf() and cpulist_parse(), see
- * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
- * For details of cpu_remap(), see bitmap_bitremap in lib/bitmap.c
- * For details of cpus_remap(), see bitmap_remap in lib/bitmap.c.
- * For details of cpus_onto(), see bitmap_onto in lib/bitmap.c.
- * For details of cpus_fold(), see bitmap_fold in lib/bitmap.c.
- *
- * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
- * Note: The alternate operations with the suffix "_nr" are used
- *       to limit the range of the loop to nr_cpu_ids instead of
- *       NR_CPUS when NR_CPUS > 64 for performance reasons.
- *       If NR_CPUS is <= 64 then most assembler bitmask
- *       operators execute faster with a constant range, so
- *       the operator will continue to use NR_CPUS.
- *
- *       Another consideration is that nr_cpu_ids is initialized
- *       to NR_CPUS and isn't lowered until the possible cpus are
- *       discovered (including any disabled cpus).  So early uses
- *       will span the entire range of NR_CPUS.
- * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
- *
- * The obsolescent cpumask operations are:
- *
- * void cpu_set(cpu, mask)		turn on bit 'cpu' in mask
- * void cpu_clear(cpu, mask)		turn off bit 'cpu' in mask
- * void cpus_setall(mask)		set all bits
- * void cpus_clear(mask)		clear all bits
- * int cpu_isset(cpu, mask)		true iff bit 'cpu' set in mask
- * int cpu_test_and_set(cpu, mask)	test and set bit 'cpu' in mask
- *
- * int cpus_and(dst, src1, src2)	dst = src1 & src2  [intersection]
- * void cpus_or(dst, src1, src2)	dst = src1 | src2  [union]
- * void cpus_xor(dst, src1, src2)	dst = src1 ^ src2
- * int cpus_andnot(dst, src1, src2)	dst = src1 & ~src2
- * void cpus_complement(dst, src)	dst = ~src
- *
- * int cpus_equal(mask1, mask2)		Does mask1 == mask2?
- * int cpus_intersects(mask1, mask2)	Do mask1 and mask2 intersect?
- * int cpus_subset(mask1, mask2)	Is mask1 a subset of mask2?
- * int cpus_empty(mask)			Is mask empty (no bits sets)?
- * int cpus_full(mask)			Is mask full (all bits sets)?
- * int cpus_weight(mask)		Hamming weigh - number of set bits
- * int cpus_weight_nr(mask)		Same using nr_cpu_ids instead of NR_CPUS
- *
- * void cpus_shift_right(dst, src, n)	Shift right
- * void cpus_shift_left(dst, src, n)	Shift left
- *
- * int first_cpu(mask)			Number lowest set bit, or NR_CPUS
- * int next_cpu(cpu, mask)		Next cpu past 'cpu', or NR_CPUS
- * int next_cpu_nr(cpu, mask)		Next cpu past 'cpu', or nr_cpu_ids
- *
- * cpumask_t cpumask_of_cpu(cpu)	Return cpumask with bit 'cpu' set
- *					(can be used as an lvalue)
- * CPU_MASK_ALL				Initializer - all bits set
- * CPU_MASK_NONE			Initializer - no bits set
- * unsigned long *cpus_addr(mask)	Array of unsigned long's in mask
- *
- * CPUMASK_ALLOC kmalloc's a structure that is a composite of many cpumask_t
- * variables, and CPUMASK_PTR provides pointers to each field.
- *
- * The structure should be defined something like this:
- * struct my_cpumasks {
- *	cpumask_t mask1;
- *	cpumask_t mask2;
- * };
- *
- * Usage is then:
- *	CPUMASK_ALLOC(my_cpumasks);
- *	CPUMASK_PTR(mask1, my_cpumasks);
- *	CPUMASK_PTR(mask2, my_cpumasks);
- *
- *	--- DO NOT reference cpumask_t pointers until this check ---
- *	if (my_cpumasks == NULL)
- *		"kmalloc failed"...
- *
- * References are now pointers to the cpumask_t variables (*mask1, ...)
- *
- *if NR_CPUS > BITS_PER_LONG
- *   CPUMASK_ALLOC(m)			Declares and allocates struct m *m =
- *						kmalloc(sizeof(*m), GFP_KERNEL)
- *   CPUMASK_FREE(m)			Macro for kfree(m)
- *else
- *   CPUMASK_ALLOC(m)			Declares struct m _m, *m = &_m
- *   CPUMASK_FREE(m)			Nop
- *endif
- *   CPUMASK_PTR(v, m)			Declares cpumask_t *v = &(m->v)
- * ------------------------------------------------------------------------
- *
- * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing
- * int cpumask_parse_user(ubuf, ulen, mask)	Parse ascii string as cpumask
- * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing
- * int cpulist_parse(buf, map)		Parse ascii string as cpulist
- * int cpu_remap(oldbit, old, new)	newbit = map(old, new)(oldbit)
- * void cpus_remap(dst, src, old, new)	*dst = map(old, new)(src)
- * void cpus_onto(dst, orig, relmap)	*dst = orig relative to relmap
- * void cpus_fold(dst, orig, sz)	dst bits = orig bits mod sz
- *
- * for_each_cpu_mask(cpu, mask)		for-loop cpu over mask using NR_CPUS
- * for_each_cpu_mask_nr(cpu, mask)	for-loop cpu over mask using nr_cpu_ids
- *
- * int num_online_cpus()		Number of online CPUs
- * int num_possible_cpus()		Number of all possible CPUs
- * int num_present_cpus()		Number of present CPUs
- *
- * int cpu_online(cpu)			Is some cpu online?
- * int cpu_possible(cpu)		Is some cpu possible?
- * int cpu_present(cpu)			Is some cpu present (can schedule)?
- *
- * int any_online_cpu(mask)		First online cpu in mask
- *
- * for_each_possible_cpu(cpu)		for-loop cpu over cpu_possible_map
- * for_each_online_cpu(cpu)		for-loop cpu over cpu_online_map
- * for_each_present_cpu(cpu)		for-loop cpu over cpu_present_map
- *
- * Subtlety:
- * 1) The 'type-checked' form of cpu_isset() causes gcc (3.3.2, anyway)
- *    to generate slightly worse code.  Note for example the additional
- *    40 lines of assembly code compiling the "for each possible cpu"
- *    loops buried in the disk_stat_read() macros calls when compiling
- *    drivers/block/genhd.c (arch i386, CONFIG_SMP=y).  So use a simple
- *    one-line #define for cpu_isset(), instead of wrapping an inline
- *    inside a macro, the way we do the other calls.
+ * set of CPU's in a system, one bit position per CPU number.  In general,
+ * only nr_cpu_ids (<= NR_CPUS) bits are valid.
  */
-
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/bitmap.h>
 
 typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
-extern cpumask_t _unused_cpumask_arg_;
-
-#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
-#define cpu_set(cpu, dst) __cpu_set((cpu), &(dst))
-static inline void __cpu_set(int cpu, volatile cpumask_t *dstp)
-{
-	set_bit(cpu, dstp->bits);
-}
-
-#define cpu_clear(cpu, dst) __cpu_clear((cpu), &(dst))
-static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp)
-{
-	clear_bit(cpu, dstp->bits);
-}
-
-#define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS)
-static inline void __cpus_setall(cpumask_t *dstp, int nbits)
-{
-	bitmap_fill(dstp->bits, nbits);
-}
-
-#define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS)
-static inline void __cpus_clear(cpumask_t *dstp, int nbits)
-{
-	bitmap_zero(dstp->bits, nbits);
-}
-
-/* No static inline type checking - see Subtlety (1) above. */
-#define cpu_isset(cpu, cpumask) test_bit((cpu), (cpumask).bits)
-
-#define cpu_test_and_set(cpu, cpumask) __cpu_test_and_set((cpu), &(cpumask))
-static inline int __cpu_test_and_set(int cpu, cpumask_t *addr)
-{
-	return test_and_set_bit(cpu, addr->bits);
-}
-
-#define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS)
-static inline int __cpus_and(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS)
-static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS)
-static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_andnot(dst, src1, src2) \
-				__cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS)
-static inline int __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_complement(dst, src) __cpus_complement(&(dst), &(src), NR_CPUS)
-static inline void __cpus_complement(cpumask_t *dstp,
-					const cpumask_t *srcp, int nbits)
-{
-	bitmap_complement(dstp->bits, srcp->bits, nbits);
-}
-
-#define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS)
-static inline int __cpus_equal(const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	return bitmap_equal(src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS)
-static inline int __cpus_intersects(const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	return bitmap_intersects(src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS)
-static inline int __cpus_subset(const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	return bitmap_subset(src1p->bits, src2p->bits, nbits);
-}
-
-#define cpus_empty(src) __cpus_empty(&(src), NR_CPUS)
-static inline int __cpus_empty(const cpumask_t *srcp, int nbits)
-{
-	return bitmap_empty(srcp->bits, nbits);
-}
-
-#define cpus_full(cpumask) __cpus_full(&(cpumask), NR_CPUS)
-static inline int __cpus_full(const cpumask_t *srcp, int nbits)
-{
-	return bitmap_full(srcp->bits, nbits);
-}
-
-#define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS)
-static inline int __cpus_weight(const cpumask_t *srcp, int nbits)
-{
-	return bitmap_weight(srcp->bits, nbits);
-}
-
-#define cpus_shift_right(dst, src, n) \
-			__cpus_shift_right(&(dst), &(src), (n), NR_CPUS)
-static inline void __cpus_shift_right(cpumask_t *dstp,
-					const cpumask_t *srcp, int n, int nbits)
-{
-	bitmap_shift_right(dstp->bits, srcp->bits, n, nbits);
-}
-
-#define cpus_shift_left(dst, src, n) \
-			__cpus_shift_left(&(dst), &(src), (n), NR_CPUS)
-static inline void __cpus_shift_left(cpumask_t *dstp,
-					const cpumask_t *srcp, int n, int nbits)
-{
-	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
-}
-#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 
 /**
- * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
- * @bitmap: the bitmap
- *
- * There are a few places where cpumask_var_t isn't appropriate and
- * static cpumasks must be used (eg. very early boot), yet we don't
- * expose the definition of 'struct cpumask'.
- *
- * This does the conversion, and can be used as a constant initializer.
- */
-#define to_cpumask(bitmap)						\
-	((struct cpumask *)(1 ? (bitmap)				\
-			    : (void *)sizeof(__check_is_bitmap(bitmap))))
-
-static inline int __check_is_bitmap(const unsigned long *bitmap)
-{
-	return 1;
-}
-
-/*
- * Special-case data structure for "single bit set only" constant CPU masks.
+ * cpumask_bits - get the bits in a cpumask
+ * @maskp: the struct cpumask *
  *
- * We pre-generate all the 64 (or 32) possible bit positions, with enough
- * padding to the left and the right, and return the constant pointer
- * appropriately offset.
- */
-extern const unsigned long
-	cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];
-
-static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
-{
-	const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
-	p -= cpu / BITS_PER_LONG;
-	return to_cpumask(p);
-}
-
-#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
-/*
- * In cases where we take the address of the cpumask immediately,
- * gcc optimizes it out (it's a constant) and there's no huge stack
- * variable created:
+ * You should only assume nr_cpu_ids bits of this mask are valid.  This is
+ * a macro so it's const-correct.
  */
-#define cpumask_of_cpu(cpu) (*get_cpu_mask(cpu))
-
-
-#define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS)
-
-#if NR_CPUS <= BITS_PER_LONG
-
-#define CPU_MASK_ALL							\
-(cpumask_t) { {								\
-	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
-} }
-
-#define CPU_MASK_ALL_PTR	(&CPU_MASK_ALL)
-
-#else
-
-#define CPU_MASK_ALL							\
-(cpumask_t) { {								\
-	[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,			\
-	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
-} }
-
-/* cpu_mask_all is in init/main.c */
-extern cpumask_t cpu_mask_all;
-#define CPU_MASK_ALL_PTR	(&cpu_mask_all)
-
-#endif
-
-#define CPU_MASK_NONE							\
-(cpumask_t) { {								\
-	[0 ... BITS_TO_LONGS(NR_CPUS)-1] =  0UL				\
-} }
-
-#define CPU_MASK_CPU0							\
-(cpumask_t) { {								\
-	[0] =  1UL							\
-} }
-
-#define cpus_addr(src) ((src).bits)
-
-#if NR_CPUS > BITS_PER_LONG
-#define	CPUMASK_ALLOC(m)	struct m *m = kmalloc(sizeof(*m), GFP_KERNEL)
-#define	CPUMASK_FREE(m)		kfree(m)
-#else
-#define	CPUMASK_ALLOC(m)	struct m _m, *m = &_m
-#define	CPUMASK_FREE(m)
-#endif
-#define	CPUMASK_PTR(v, m) 	cpumask_t *v = &(m->v)
-
-#define cpu_remap(oldbit, old, new) \
-		__cpu_remap((oldbit), &(old), &(new), NR_CPUS)
-static inline int __cpu_remap(int oldbit,
-		const cpumask_t *oldp, const cpumask_t *newp, int nbits)
-{
-	return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits);
-}
-
-#define cpus_remap(dst, src, old, new) \
-		__cpus_remap(&(dst), &(src), &(old), &(new), NR_CPUS)
-static inline void __cpus_remap(cpumask_t *dstp, const cpumask_t *srcp,
-		const cpumask_t *oldp, const cpumask_t *newp, int nbits)
-{
-	bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
-}
-
-#define cpus_onto(dst, orig, relmap) \
-		__cpus_onto(&(dst), &(orig), &(relmap), NR_CPUS)
-static inline void __cpus_onto(cpumask_t *dstp, const cpumask_t *origp,
-		const cpumask_t *relmapp, int nbits)
-{
-	bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits);
-}
-
-#define cpus_fold(dst, orig, sz) \
-		__cpus_fold(&(dst), &(orig), sz, NR_CPUS)
-static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
-		int sz, int nbits)
-{
-	bitmap_fold(dstp->bits, origp->bits, sz, nbits);
-}
-#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
+#define cpumask_bits(maskp) ((maskp)->bits)
 
 #if NR_CPUS == 1
-
 #define nr_cpu_ids		1
-#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
-#define first_cpu(src)		({ (void)(src); 0; })
-#define next_cpu(n, src)	({ (void)(src); 1; })
-#define any_online_cpu(mask)	0
-#define for_each_cpu_mask(cpu, mask)	\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
-#else /* NR_CPUS > 1 */
-
+#else
 extern int nr_cpu_ids;
-#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
-int __first_cpu(const cpumask_t *srcp);
-int __next_cpu(int n, const cpumask_t *srcp);
-int __any_online_cpu(const cpumask_t *mask);
-
-#define first_cpu(src)		__first_cpu(&(src))
-#define next_cpu(n, src)	__next_cpu((n), &(src))
-#define any_online_cpu(mask) __any_online_cpu(&(mask))
-#define for_each_cpu_mask(cpu, mask)			\
-	for ((cpu) = -1;				\
-		(cpu) = next_cpu((cpu), (mask)),	\
-		(cpu) < NR_CPUS; )
-#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 #endif
 
-#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
-#if NR_CPUS <= 64
-
-#define next_cpu_nr(n, src)		next_cpu(n, src)
-#define cpus_weight_nr(cpumask)		cpus_weight(cpumask)
-#define for_each_cpu_mask_nr(cpu, mask)	for_each_cpu_mask(cpu, mask)
-
-#else /* NR_CPUS > 64 */
-
-int __next_cpu_nr(int n, const cpumask_t *srcp);
-#define next_cpu_nr(n, src)	__next_cpu_nr((n), &(src))
-#define cpus_weight_nr(cpumask)	__cpus_weight(&(cpumask), nr_cpu_ids)
-#define for_each_cpu_mask_nr(cpu, mask)			\
-	for ((cpu) = -1;				\
-		(cpu) = next_cpu_nr((cpu), (mask)),	\
-		(cpu) < nr_cpu_ids; )
-
-#endif /* NR_CPUS > 64 */
-#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
+#ifdef CONFIG_CPUMASK_OFFSTACK
+/* Assuming NR_CPUS is huge, a runtime limit is more efficient.  Also,
+ * not all bits may be allocated. */
+#define nr_cpumask_bits	nr_cpu_ids
+#else
+#define nr_cpumask_bits	NR_CPUS
+#endif
 
 /*
  * The following particular system cpumasks and operations manage
@@ -487,12 +80,6 @@ extern const struct cpumask *const cpu_online_mask;
 extern const struct cpumask *const cpu_present_mask;
 extern const struct cpumask *const cpu_active_mask;
 
-/* These strip const, as traditionally they weren't const. */
-#define cpu_possible_map	(*(cpumask_t *)cpu_possible_mask)
-#define cpu_online_map		(*(cpumask_t *)cpu_online_mask)
-#define cpu_present_map		(*(cpumask_t *)cpu_present_mask)
-#define cpu_active_map		(*(cpumask_t *)cpu_active_mask)
-
 #if NR_CPUS > 1
 #define num_online_cpus()	cpumask_weight(cpu_online_mask)
 #define num_possible_cpus()	cpumask_weight(cpu_possible_mask)
@@ -511,35 +98,6 @@ extern const struct cpumask *const cpu_active_mask;
 #define cpu_active(cpu)		((cpu) == 0)
 #endif
 
-#define cpu_is_offline(cpu)	unlikely(!cpu_online(cpu))
-
-/* These are the new versions of the cpumask operators: passed by pointer.
- * The older versions will be implemented in terms of these, then deleted. */
-#define cpumask_bits(maskp) ((maskp)->bits)
-
-#if NR_CPUS <= BITS_PER_LONG
-#define CPU_BITS_ALL						\
-{								\
-	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD	\
-}
-
-#else /* NR_CPUS > BITS_PER_LONG */
-
-#define CPU_BITS_ALL						\
-{								\
-	[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,		\
-	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD		\
-}
-#endif /* NR_CPUS > BITS_PER_LONG */
-
-#ifdef CONFIG_CPUMASK_OFFSTACK
-/* Assuming NR_CPUS is huge, a runtime limit is more efficient.  Also,
- * not all bits may be allocated. */
-#define nr_cpumask_bits	nr_cpu_ids
-#else
-#define nr_cpumask_bits	NR_CPUS
-#endif
-
 /* verify cpu argument to cpumask_* operators */
 static inline unsigned int cpumask_check(unsigned int cpu)
 {
@@ -1100,4 +658,241 @@ void set_cpu_active(unsigned int cpu, bool active);
 void init_cpu_present(const struct cpumask *src);
 void init_cpu_possible(const struct cpumask *src);
 void init_cpu_online(const struct cpumask *src);
+
+/**
+ * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
+ * @bitmap: the bitmap
+ *
+ * There are a few places where cpumask_var_t isn't appropriate and
+ * static cpumasks must be used (eg. very early boot), yet we don't
+ * expose the definition of 'struct cpumask'.
+ *
+ * This does the conversion, and can be used as a constant initializer.
+ */
+#define to_cpumask(bitmap)						\
+	((struct cpumask *)(1 ? (bitmap)				\
+			    : (void *)sizeof(__check_is_bitmap(bitmap))))
+
+static inline int __check_is_bitmap(const unsigned long *bitmap)
+{
+	return 1;
+}
+
+/*
+ * Special-case data structure for "single bit set only" constant CPU masks.
+ *
+ * We pre-generate all the 64 (or 32) possible bit positions, with enough
+ * padding to the left and the right, and return the constant pointer
+ * appropriately offset.
+ */
+extern const unsigned long
+	cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];
+
+static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
+{
+	const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
+	p -= cpu / BITS_PER_LONG;
+	return to_cpumask(p);
+}
+
+#define cpu_is_offline(cpu)	unlikely(!cpu_online(cpu))
+
+#if NR_CPUS <= BITS_PER_LONG
+#define CPU_BITS_ALL						\
+{								\
+	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD	\
+}
+
+#else /* NR_CPUS > BITS_PER_LONG */
+
+#define CPU_BITS_ALL						\
+{								\
+	[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,		\
+	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD		\
+}
+#endif /* NR_CPUS > BITS_PER_LONG */
+
+/*
+ *
+ * From here down, all obsolete.  Use cpumask_ variants!
+ *
+ */
+#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
+/* These strip const, as traditionally they weren't const. */
+#define cpu_possible_map	(*(cpumask_t *)cpu_possible_mask)
+#define cpu_online_map		(*(cpumask_t *)cpu_online_mask)
+#define cpu_present_map		(*(cpumask_t *)cpu_present_mask)
+#define cpu_active_map		(*(cpumask_t *)cpu_active_mask)
+
+#define cpumask_of_cpu(cpu) (*get_cpu_mask(cpu))
+
+#define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS)
+
+#if NR_CPUS <= BITS_PER_LONG
+
+#define CPU_MASK_ALL							\
+(cpumask_t) { {								\
+	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
+} }
+
+#else
+
+#define CPU_MASK_ALL							\
+(cpumask_t) { {								\
+	[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,			\
+	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
+} }
+
+#endif
+
+#define CPU_MASK_NONE							\
+(cpumask_t) { {								\
+	[0 ... BITS_TO_LONGS(NR_CPUS)-1] =  0UL				\
+} }
+
+#define CPU_MASK_CPU0							\
+(cpumask_t) { {								\
+	[0] =  1UL							\
+} }
+
+#if NR_CPUS == 1
+#define first_cpu(src)		({ (void)(src); 0; })
+#define next_cpu(n, src)	({ (void)(src); 1; })
+#define any_online_cpu(mask)	0
+#define for_each_cpu_mask(cpu, mask)	\
+	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
+#else /* NR_CPUS > 1 */
+int __first_cpu(const cpumask_t *srcp);
+int __next_cpu(int n, const cpumask_t *srcp);
+int __any_online_cpu(const cpumask_t *mask);
+
+#define first_cpu(src)		__first_cpu(&(src))
+#define next_cpu(n, src)	__next_cpu((n), &(src))
+#define any_online_cpu(mask) __any_online_cpu(&(mask))
+#define for_each_cpu_mask(cpu, mask)			\
+	for ((cpu) = -1;				\
+		(cpu) = next_cpu((cpu), (mask)),	\
+		(cpu) < NR_CPUS; )
+#endif /* SMP */
+
+#if NR_CPUS <= 64
+
+#define for_each_cpu_mask_nr(cpu, mask)	for_each_cpu_mask(cpu, mask)
+
+#else /* NR_CPUS > 64 */
+
+int __next_cpu_nr(int n, const cpumask_t *srcp);
+#define for_each_cpu_mask_nr(cpu, mask)			\
+	for ((cpu) = -1;				\
+		(cpu) = __next_cpu_nr((cpu), &(mask)),	\
+		(cpu) < nr_cpu_ids; )
+
+#endif /* NR_CPUS > 64 */
+
+#define cpus_addr(src) ((src).bits)
+
+#define cpu_set(cpu, dst) __cpu_set((cpu), &(dst))
+static inline void __cpu_set(int cpu, volatile cpumask_t *dstp)
+{
+	set_bit(cpu, dstp->bits);
+}
+
+#define cpu_clear(cpu, dst) __cpu_clear((cpu), &(dst))
+static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp)
+{
+	clear_bit(cpu, dstp->bits);
+}
+
+#define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS)
+static inline void __cpus_setall(cpumask_t *dstp, int nbits)
+{
+	bitmap_fill(dstp->bits, nbits);
+}
+
+#define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS)
+static inline void __cpus_clear(cpumask_t *dstp, int nbits)
+{
+	bitmap_zero(dstp->bits, nbits);
+}
+
+/* No static inline type checking - see Subtlety (1) above. */
+#define cpu_isset(cpu, cpumask) test_bit((cpu), (cpumask).bits)
+
+#define cpu_test_and_set(cpu, cpumask) __cpu_test_and_set((cpu), &(cpumask))
+static inline int __cpu_test_and_set(int cpu, cpumask_t *addr)
+{
+	return test_and_set_bit(cpu, addr->bits);
+}
+
+#define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS)
+static inline int __cpus_and(cpumask_t *dstp, const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS)
+static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS)
+static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_andnot(dst, src1, src2) \
+				__cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS)
+static inline int __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS)
+static inline int __cpus_equal(const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_equal(src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS)
+static inline int __cpus_intersects(const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_intersects(src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS)
+static inline int __cpus_subset(const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_subset(src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_empty(src) __cpus_empty(&(src), NR_CPUS)
+static inline int __cpus_empty(const cpumask_t *srcp, int nbits)
+{
+	return bitmap_empty(srcp->bits, nbits);
+}
+
+#define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS)
+static inline int __cpus_weight(const cpumask_t *srcp, int nbits)
+{
+	return bitmap_weight(srcp->bits, nbits);
+}
+
+#define cpus_shift_left(dst, src, n) \
+			__cpus_shift_left(&(dst), &(src), (n), NR_CPUS)
+static inline void __cpus_shift_left(cpumask_t *dstp,
+					const cpumask_t *srcp, int n, int nbits)
+{
+	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
+}
+#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
+
 #endif /* __LINUX_CPUMASK_H */
diff --git a/include/linux/cred.h b/include/linux/cred.h
index fb371601a3b..4e3387a89cb 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -176,23 +176,7 @@ extern void __invalid_creds(const struct cred *, const char *, unsigned);
 extern void __validate_process_creds(struct task_struct *,
 				     const char *, unsigned);
 
-static inline bool creds_are_invalid(const struct cred *cred)
-{
-	if (cred->magic != CRED_MAGIC)
-		return true;
-	if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
-		return true;
-#ifdef CONFIG_SECURITY_SELINUX
-	if (selinux_is_enabled()) {
-		if ((unsigned long) cred->security < PAGE_SIZE)
-			return true;
-		if ((*(u32 *)cred->security & 0xffffff00) ==
-		    (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
-			return true;
-	}
-#endif
-	return false;
-}
+extern bool creds_are_invalid(const struct cred *cred);
 
 static inline void __validate_creds(const struct cred *cred,
 				    const char *file, unsigned line)
diff --git a/include/linux/dca.h b/include/linux/dca.h
index 9c20c7e87d0..d27a7a05718 100644
--- a/include/linux/dca.h
+++ b/include/linux/dca.h
@@ -20,6 +20,9 @@
  */
 #ifndef DCA_H
 #define DCA_H
+
+#include <linux/pci.h>
+
 /* DCA Provider API */
 
 /* DCA Notifier Interface */
@@ -36,6 +39,12 @@ struct dca_provider {
 	int			 id;
 };
 
+struct dca_domain {
+	struct list_head	node;
+	struct list_head	dca_providers;
+	struct pci_bus		*pci_rc;
+};
+
 struct dca_ops {
 	int	(*add_requester)    (struct dca_provider *, struct device *);
 	int	(*remove_requester) (struct dca_provider *, struct device *);
@@ -47,7 +56,7 @@ struct dca_ops {
 struct dca_provider *alloc_dca_provider(struct dca_ops *ops, int priv_size);
 void free_dca_provider(struct dca_provider *dca);
 int register_dca_provider(struct dca_provider *dca, struct device *dev);
-void unregister_dca_provider(struct dca_provider *dca);
+void unregister_dca_provider(struct dca_provider *dca, struct device *dev);
 
 static inline void *dca_priv(struct dca_provider *dca)
 {
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index eb5c2ba2f81..fc1b930f246 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -9,7 +9,7 @@
  *	2 as published by the Free Software Foundation.
  *
  *  debugfs is for people to use instead of /proc or /sys.
- *  See Documentation/DocBook/kernel-api for more details.
+ *  See Documentation/DocBook/filesystems for more details.
  */
 
 #ifndef _DEBUGFS_H_
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index ffefba81c81..2b9f2ac7ed6 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -48,19 +48,20 @@ enum dma_status {
 
 /**
  * enum dma_transaction_type - DMA transaction types/indexes
+ *
+ * Note: The DMA_ASYNC_TX capability is not to be set by drivers.  It is
+ * automatically set as dma devices are registered.
  */
 enum dma_transaction_type {
 	DMA_MEMCPY,
 	DMA_XOR,
-	DMA_PQ_XOR,
-	DMA_DUAL_XOR,
-	DMA_PQ_UPDATE,
-	DMA_ZERO_SUM,
-	DMA_PQ_ZERO_SUM,
+	DMA_PQ,
+	DMA_XOR_VAL,
+	DMA_PQ_VAL,
 	DMA_MEMSET,
-	DMA_MEMCPY_CRC32C,
 	DMA_INTERRUPT,
 	DMA_PRIVATE,
+	DMA_ASYNC_TX,
 	DMA_SLAVE,
 };
 
@@ -70,18 +71,25 @@ enum dma_transaction_type {
 
 /**
  * enum dma_ctrl_flags - DMA flags to augment operation preparation,
- * 	control completion, and communicate status.
+ *  control completion, and communicate status.
  * @DMA_PREP_INTERRUPT - trigger an interrupt (callback) upon completion of
- * 	this transaction
+ *  this transaction
  * @DMA_CTRL_ACK - the descriptor cannot be reused until the client
- * 	acknowledges receipt, i.e. has has a chance to establish any
- * 	dependency chains
+ *  acknowledges receipt, i.e. has has a chance to establish any dependency
+ *  chains
  * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s)
  * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s)
  * @DMA_COMPL_SRC_UNMAP_SINGLE - set to do the source dma-unmapping as single
  * 	(if not set, do the source dma-unmapping as page)
  * @DMA_COMPL_DEST_UNMAP_SINGLE - set to do the destination dma-unmapping as single
  * 	(if not set, do the destination dma-unmapping as page)
+ * @DMA_PREP_PQ_DISABLE_P - prevent generation of P while generating Q
+ * @DMA_PREP_PQ_DISABLE_Q - prevent generation of Q while generating P
+ * @DMA_PREP_CONTINUE - indicate to a driver that it is reusing buffers as
+ *  sources that were the result of a previous operation, in the case of a PQ
+ *  operation it continues the calculation with new sources
+ * @DMA_PREP_FENCE - tell the driver that subsequent operations depend
+ *  on the result of this operation
  */
 enum dma_ctrl_flags {
 	DMA_PREP_INTERRUPT = (1 << 0),
@@ -90,9 +98,32 @@ enum dma_ctrl_flags {
 	DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3),
 	DMA_COMPL_SRC_UNMAP_SINGLE = (1 << 4),
 	DMA_COMPL_DEST_UNMAP_SINGLE = (1 << 5),
+	DMA_PREP_PQ_DISABLE_P = (1 << 6),
+	DMA_PREP_PQ_DISABLE_Q = (1 << 7),
+	DMA_PREP_CONTINUE = (1 << 8),
+	DMA_PREP_FENCE = (1 << 9),
 };
 
 /**
+ * enum sum_check_bits - bit position of pq_check_flags
+ */
+enum sum_check_bits {
+	SUM_CHECK_P = 0,
+	SUM_CHECK_Q = 1,
+};
+
+/**
+ * enum pq_check_flags - result of async_{xor,pq}_zero_sum operations
+ * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise
+ * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise
+ */
+enum sum_check_flags {
+	SUM_CHECK_P_RESULT = (1 << SUM_CHECK_P),
+	SUM_CHECK_Q_RESULT = (1 << SUM_CHECK_Q),
+};
+
+
+/**
  * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t.
  * See linux/cpumask.h
  */
@@ -180,8 +211,6 @@ typedef void (*dma_async_tx_callback)(void *dma_async_param);
  * @flags: flags to augment operation preparation, control completion, and
  * 	communicate status
  * @phys: physical address of the descriptor
- * @tx_list: driver common field for operations that require multiple
- *	descriptors
  * @chan: target channel for this operation
  * @tx_submit: set the prepared descriptor(s) to be executed by the engine
  * @callback: routine to call after this operation is complete
@@ -195,7 +224,6 @@ struct dma_async_tx_descriptor {
 	dma_cookie_t cookie;
 	enum dma_ctrl_flags flags; /* not a 'long' to pack with cookie */
 	dma_addr_t phys;
-	struct list_head tx_list;
 	struct dma_chan *chan;
 	dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx);
 	dma_async_tx_callback callback;
@@ -213,6 +241,11 @@ struct dma_async_tx_descriptor {
  * @global_node: list_head for global dma_device_list
  * @cap_mask: one or more dma_capability flags
  * @max_xor: maximum number of xor sources, 0 if no capability
+ * @max_pq: maximum number of PQ sources and PQ-continue capability
+ * @copy_align: alignment shift for memcpy operations
+ * @xor_align: alignment shift for xor operations
+ * @pq_align: alignment shift for pq operations
+ * @fill_align: alignment shift for memset operations
  * @dev_id: unique device ID
  * @dev: struct device reference for dma mapping api
  * @device_alloc_chan_resources: allocate resources and return the
@@ -220,7 +253,9 @@ struct dma_async_tx_descriptor {
  * @device_free_chan_resources: release DMA channel's resources
  * @device_prep_dma_memcpy: prepares a memcpy operation
  * @device_prep_dma_xor: prepares a xor operation
- * @device_prep_dma_zero_sum: prepares a zero_sum operation
+ * @device_prep_dma_xor_val: prepares a xor validation operation
+ * @device_prep_dma_pq: prepares a pq operation
+ * @device_prep_dma_pq_val: prepares a pqzero_sum operation
  * @device_prep_dma_memset: prepares a memset operation
  * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
  * @device_prep_slave_sg: prepares a slave dma operation
@@ -235,7 +270,13 @@ struct dma_device {
 	struct list_head channels;
 	struct list_head global_node;
 	dma_cap_mask_t  cap_mask;
-	int max_xor;
+	unsigned short max_xor;
+	unsigned short max_pq;
+	u8 copy_align;
+	u8 xor_align;
+	u8 pq_align;
+	u8 fill_align;
+	#define DMA_HAS_PQ_CONTINUE (1 << 15)
 
 	int dev_id;
 	struct device *dev;
@@ -249,9 +290,17 @@ struct dma_device {
 	struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
 		struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
 		unsigned int src_cnt, size_t len, unsigned long flags);
-	struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)(
+	struct dma_async_tx_descriptor *(*device_prep_dma_xor_val)(
 		struct dma_chan *chan, dma_addr_t *src,	unsigned int src_cnt,
-		size_t len, u32 *result, unsigned long flags);
+		size_t len, enum sum_check_flags *result, unsigned long flags);
+	struct dma_async_tx_descriptor *(*device_prep_dma_pq)(
+		struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
+		unsigned int src_cnt, const unsigned char *scf,
+		size_t len, unsigned long flags);
+	struct dma_async_tx_descriptor *(*device_prep_dma_pq_val)(
+		struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
+		unsigned int src_cnt, const unsigned char *scf, size_t len,
+		enum sum_check_flags *pqres, unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
 		struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
 		unsigned long flags);
@@ -270,6 +319,96 @@ struct dma_device {
 	void (*device_issue_pending)(struct dma_chan *chan);
 };
 
+static inline bool dmaengine_check_align(u8 align, size_t off1, size_t off2, size_t len)
+{
+	size_t mask;
+
+	if (!align)
+		return true;
+	mask = (1 << align) - 1;
+	if (mask & (off1 | off2 | len))
+		return false;
+	return true;
+}
+
+static inline bool is_dma_copy_aligned(struct dma_device *dev, size_t off1,
+				       size_t off2, size_t len)
+{
+	return dmaengine_check_align(dev->copy_align, off1, off2, len);
+}
+
+static inline bool is_dma_xor_aligned(struct dma_device *dev, size_t off1,
+				      size_t off2, size_t len)
+{
+	return dmaengine_check_align(dev->xor_align, off1, off2, len);
+}
+
+static inline bool is_dma_pq_aligned(struct dma_device *dev, size_t off1,
+				     size_t off2, size_t len)
+{
+	return dmaengine_check_align(dev->pq_align, off1, off2, len);
+}
+
+static inline bool is_dma_fill_aligned(struct dma_device *dev, size_t off1,
+				       size_t off2, size_t len)
+{
+	return dmaengine_check_align(dev->fill_align, off1, off2, len);
+}
+
+static inline void
+dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue)
+{
+	dma->max_pq = maxpq;
+	if (has_pq_continue)
+		dma->max_pq |= DMA_HAS_PQ_CONTINUE;
+}
+
+static inline bool dmaf_continue(enum dma_ctrl_flags flags)
+{
+	return (flags & DMA_PREP_CONTINUE) == DMA_PREP_CONTINUE;
+}
+
+static inline bool dmaf_p_disabled_continue(enum dma_ctrl_flags flags)
+{
+	enum dma_ctrl_flags mask = DMA_PREP_CONTINUE | DMA_PREP_PQ_DISABLE_P;
+
+	return (flags & mask) == mask;
+}
+
+static inline bool dma_dev_has_pq_continue(struct dma_device *dma)
+{
+	return (dma->max_pq & DMA_HAS_PQ_CONTINUE) == DMA_HAS_PQ_CONTINUE;
+}
+
+static unsigned short dma_dev_to_maxpq(struct dma_device *dma)
+{
+	return dma->max_pq & ~DMA_HAS_PQ_CONTINUE;
+}
+
+/* dma_maxpq - reduce maxpq in the face of continued operations
+ * @dma - dma device with PQ capability
+ * @flags - to check if DMA_PREP_CONTINUE and DMA_PREP_PQ_DISABLE_P are set
+ *
+ * When an engine does not support native continuation we need 3 extra
+ * source slots to reuse P and Q with the following coefficients:
+ * 1/ {00} * P : remove P from Q', but use it as a source for P'
+ * 2/ {01} * Q : use Q to continue Q' calculation
+ * 3/ {00} * Q : subtract Q from P' to cancel (2)
+ *
+ * In the case where P is disabled we only need 1 extra source:
+ * 1/ {01} * Q : use Q to continue Q' calculation
+ */
+static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags)
+{
+	if (dma_dev_has_pq_continue(dma) || !dmaf_continue(flags))
+		return dma_dev_to_maxpq(dma);
+	else if (dmaf_p_disabled_continue(flags))
+		return dma_dev_to_maxpq(dma) - 1;
+	else if (dmaf_continue(flags))
+		return dma_dev_to_maxpq(dma) - 3;
+	BUG();
+}
+
 /* --- public DMA engine API --- */
 
 #ifdef CONFIG_DMA_ENGINE
@@ -299,7 +438,11 @@ static inline void net_dmaengine_put(void)
 #ifdef CONFIG_ASYNC_TX_DMA
 #define async_dmaengine_get()	dmaengine_get()
 #define async_dmaengine_put()	dmaengine_put()
+#ifdef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH
+#define async_dma_find_channel(type) dma_find_channel(DMA_ASYNC_TX)
+#else
 #define async_dma_find_channel(type) dma_find_channel(type)
+#endif /* CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH */
 #else
 static inline void async_dmaengine_get(void)
 {
@@ -312,7 +455,7 @@ async_dma_find_channel(enum dma_transaction_type type)
 {
 	return NULL;
 }
-#endif
+#endif /* CONFIG_ASYNC_TX_DMA */
 
 dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
 	void *dest, void *src, size_t len);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 51803528b09..2adaa2529f1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -595,6 +595,7 @@ struct address_space_operations {
 	int (*launder_page) (struct page *);
 	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
 					unsigned long);
+	int (*error_remove_page)(struct address_space *, struct page *);
 };
 
 /*
@@ -640,7 +641,6 @@ struct block_device {
 	struct super_block *	bd_super;
 	int			bd_openers;
 	struct mutex		bd_mutex;	/* open/close mutex */
-	struct semaphore	bd_mount_sem;
 	struct list_head	bd_inodes;
 	void *			bd_holder;
 	int			bd_holders;
@@ -1315,7 +1315,7 @@ struct super_block {
 	unsigned long		s_blocksize;
 	unsigned char		s_blocksize_bits;
 	unsigned char		s_dirt;
-	unsigned long long	s_maxbytes;	/* Max file size */
+	loff_t			s_maxbytes;	/* Max file size */
 	struct file_system_type	*s_type;
 	const struct super_operations	*s_op;
 	const struct dquot_operations	*dq_op;
@@ -2156,6 +2156,7 @@ extern ino_t iunique(struct super_block *, ino_t);
 extern int inode_needs_sync(struct inode *inode);
 extern void generic_delete_inode(struct inode *inode);
 extern void generic_drop_inode(struct inode *inode);
+extern int generic_detach_inode(struct inode *inode);
 
 extern struct inode *ilookup5_nowait(struct super_block *sb,
 		unsigned long hashval, int (*test)(struct inode *, void *),
@@ -2334,6 +2335,7 @@ extern void get_filesystem(struct file_system_type *fs);
 extern void put_filesystem(struct file_system_type *fs);
 extern struct file_system_type *get_fs_type(const char *name);
 extern struct super_block *get_super(struct block_device *);
+extern struct super_block *get_active_super(struct block_device *bdev);
 extern struct super_block *user_get_super(dev_t);
 extern void drop_super(struct super_block *sb);
 
@@ -2381,7 +2383,8 @@ extern int buffer_migrate_page(struct address_space *,
 #define buffer_migrate_page NULL
 #endif
 
-extern int inode_change_ok(struct inode *, struct iattr *);
+extern int inode_change_ok(const struct inode *, struct iattr *);
+extern int inode_newsize_ok(const struct inode *, loff_t offset);
 extern int __must_check inode_setattr(struct inode *, struct iattr *);
 
 extern void file_update_time(struct file *file);
@@ -2467,7 +2470,7 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
 			  size_t len, loff_t *ppos);
 
 struct ctl_table;
-int proc_nr_files(struct ctl_table *table, int write, struct file *filp,
+int proc_nr_files(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
 
 int __init get_filesystem_list(char *buf);
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 3c0924a18da..cd3d2abaf30 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -19,7 +19,7 @@
 extern int ftrace_enabled;
 extern int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
-		     struct file *filp, void __user *buffer, size_t *lenp,
+		     void __user *buffer, size_t *lenp,
 		     loff_t *ppos);
 
 typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
@@ -94,7 +94,7 @@ static inline void ftrace_start(void) { }
 extern int stack_tracer_enabled;
 int
 stack_trace_sysctl(struct ctl_table *table, int write,
-		   struct file *file, void __user *buffer, size_t *lenp,
+		   void __user *buffer, size_t *lenp,
 		   loff_t *ppos);
 #endif
 
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 34956c8fdeb..8ec17997d94 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -4,11 +4,6 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 
-struct inode;
-struct mm_struct;
-struct task_struct;
-union ktime;
-
 /* Second argument to futex syscall */
 
 
@@ -129,6 +124,11 @@ struct robust_list_head {
 #define FUTEX_BITSET_MATCH_ANY	0xffffffff
 
 #ifdef __KERNEL__
+struct inode;
+struct mm_struct;
+struct task_struct;
+union ktime;
+
 long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 176e7ee73ef..11ab19ac6b3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -20,9 +20,9 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 }
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
-int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
-int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
-int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
+int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			struct page **, struct vm_area_struct **,
diff --git a/include/linux/i2c/adp5588.h b/include/linux/i2c/adp5588.h
new file mode 100644
index 00000000000..fc5db826b48
--- /dev/null
+++ b/include/linux/i2c/adp5588.h
@@ -0,0 +1,92 @@
+/*
+ * Analog Devices ADP5588 I/O Expander and QWERTY Keypad Controller
+ *
+ * Copyright 2009 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#ifndef _ADP5588_H
+#define _ADP5588_H
+
+#define DEV_ID 0x00		/* Device ID */
+#define CFG 0x01		/* Configuration Register1 */
+#define INT_STAT 0x02		/* Interrupt Status Register */
+#define KEY_LCK_EC_STAT 0x03	/* Key Lock and Event Counter Register */
+#define Key_EVENTA 0x04		/* Key Event Register A */
+#define Key_EVENTB 0x05		/* Key Event Register B */
+#define Key_EVENTC 0x06		/* Key Event Register C */
+#define Key_EVENTD 0x07		/* Key Event Register D */
+#define Key_EVENTE 0x08		/* Key Event Register E */
+#define Key_EVENTF 0x09		/* Key Event Register F */
+#define Key_EVENTG 0x0A		/* Key Event Register G */
+#define Key_EVENTH 0x0B		/* Key Event Register H */
+#define Key_EVENTI 0x0C		/* Key Event Register I */
+#define Key_EVENTJ 0x0D		/* Key Event Register J */
+#define KP_LCK_TMR 0x0E		/* Keypad Lock1 to Lock2 Timer */
+#define UNLOCK1 0x0F		/* Unlock Key1 */
+#define UNLOCK2 0x10		/* Unlock Key2 */
+#define GPIO_INT_STAT1 0x11	/* GPIO Interrupt Status */
+#define GPIO_INT_STAT2 0x12	/* GPIO Interrupt Status */
+#define GPIO_INT_STAT3 0x13	/* GPIO Interrupt Status */
+#define GPIO_DAT_STAT1 0x14	/* GPIO Data Status, Read twice to clear */
+#define GPIO_DAT_STAT2 0x15	/* GPIO Data Status, Read twice to clear */
+#define GPIO_DAT_STAT3 0x16	/* GPIO Data Status, Read twice to clear */
+#define GPIO_DAT_OUT1 0x17	/* GPIO DATA OUT */
+#define GPIO_DAT_OUT2 0x18	/* GPIO DATA OUT */
+#define GPIO_DAT_OUT3 0x19	/* GPIO DATA OUT */
+#define GPIO_INT_EN1 0x1A	/* GPIO Interrupt Enable */
+#define GPIO_INT_EN2 0x1B	/* GPIO Interrupt Enable */
+#define GPIO_INT_EN3 0x1C	/* GPIO Interrupt Enable */
+#define KP_GPIO1 0x1D		/* Keypad or GPIO Selection */
+#define KP_GPIO2 0x1E		/* Keypad or GPIO Selection */
+#define KP_GPIO3 0x1F		/* Keypad or GPIO Selection */
+#define GPI_EM1 0x20		/* GPI Event Mode 1 */
+#define GPI_EM2 0x21		/* GPI Event Mode 2 */
+#define GPI_EM3 0x22		/* GPI Event Mode 3 */
+#define GPIO_DIR1 0x23		/* GPIO Data Direction */
+#define GPIO_DIR2 0x24		/* GPIO Data Direction */
+#define GPIO_DIR3 0x25		/* GPIO Data Direction */
+#define GPIO_INT_LVL1 0x26	/* GPIO Edge/Level Detect */
+#define GPIO_INT_LVL2 0x27	/* GPIO Edge/Level Detect */
+#define GPIO_INT_LVL3 0x28	/* GPIO Edge/Level Detect */
+#define Debounce_DIS1 0x29	/* Debounce Disable */
+#define Debounce_DIS2 0x2A	/* Debounce Disable */
+#define Debounce_DIS3 0x2B	/* Debounce Disable */
+#define GPIO_PULL1 0x2C		/* GPIO Pull Disable */
+#define GPIO_PULL2 0x2D		/* GPIO Pull Disable */
+#define GPIO_PULL3 0x2E		/* GPIO Pull Disable */
+#define CMP_CFG_STAT 0x30	/* Comparator Configuration and Status Register */
+#define CMP_CONFG_SENS1 0x31	/* Sensor1 Comparator Configuration Register */
+#define CMP_CONFG_SENS2 0x32	/* L2 Light Sensor Reference Level, Output Falling for Sensor 1 */
+#define CMP1_LVL2_TRIP 0x33	/* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 1 */
+#define CMP1_LVL2_HYS 0x34	/* L3 Light Sensor Reference Level, Output Falling For Sensor 1 */
+#define CMP1_LVL3_TRIP 0x35	/* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 1 */
+#define CMP1_LVL3_HYS 0x36	/* Sensor 2 Comparator Configuration Register */
+#define CMP2_LVL2_TRIP 0x37	/* L2 Light Sensor Reference Level, Output Falling for Sensor 2 */
+#define CMP2_LVL2_HYS 0x38	/* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 2 */
+#define CMP2_LVL3_TRIP 0x39	/* L3 Light Sensor Reference Level, Output Falling For Sensor 2 */
+#define CMP2_LVL3_HYS 0x3A	/* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 2 */
+#define CMP1_ADC_DAT_R1 0x3B	/* Comparator 1 ADC data Register1 */
+#define CMP1_ADC_DAT_R2 0x3C	/* Comparator 1 ADC data Register2 */
+#define CMP2_ADC_DAT_R1 0x3D	/* Comparator 2 ADC data Register1 */
+#define CMP2_ADC_DAT_R2 0x3E	/* Comparator 2 ADC data Register2 */
+
+#define ADP5588_DEVICE_ID_MASK	0xF
+
+/* Put one of these structures in i2c_board_info platform_data */
+
+#define ADP5588_KEYMAPSIZE	80
+
+struct adp5588_kpad_platform_data {
+	int rows;			/* Number of rows */
+	int cols;			/* Number of columns */
+	const unsigned short *keymap;	/* Pointer to keymap */
+	unsigned short keymapsize;	/* Keymap size */
+	unsigned repeat:1;		/* Enable key repeat */
+	unsigned en_keylock:1;		/* Enable Key Lock feature */
+	unsigned short unlock_key1;	/* Unlock Key 1 */
+	unsigned short unlock_key2;	/* Unlock Key 2 */
+};
+
+#endif
diff --git a/include/linux/i2c/mcs5000_ts.h b/include/linux/i2c/mcs5000_ts.h
new file mode 100644
index 00000000000..5a117b5ca15
--- /dev/null
+++ b/include/linux/i2c/mcs5000_ts.h
@@ -0,0 +1,24 @@
+/*
+ * mcs5000_ts.h
+ *
+ * Copyright (C) 2009 Samsung Electronics Co.Ltd
+ * Author: Joonyoung Shim <jy0922.shim@samsung.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __LINUX_MCS5000_TS_H
+#define __LINUX_MCS5000_TS_H
+
+/* platform data for the MELFAS MCS-5000 touchscreen driver */
+struct mcs5000_ts_platform_data {
+	void (*cfg_pin)(void);
+	int x_size;
+	int y_size;
+};
+
+#endif	/* __LINUX_MCS5000_TS_H */
diff --git a/include/linux/i8042.h b/include/linux/i8042.h
index 7907a72403e..60c3360ef6a 100644
--- a/include/linux/i8042.h
+++ b/include/linux/i8042.h
@@ -7,6 +7,7 @@
  * the Free Software Foundation.
  */
 
+#include <linux/types.h>
 
 /*
  * Standard commands.
@@ -30,6 +31,35 @@
 #define I8042_CMD_MUX_PFX	0x0090
 #define I8042_CMD_MUX_SEND	0x1090
 
+struct serio;
+
+#if defined(CONFIG_SERIO_I8042) || defined(CONFIG_SERIO_I8042_MODULE)
+
+void i8042_lock_chip(void);
+void i8042_unlock_chip(void);
 int i8042_command(unsigned char *param, int command);
+bool i8042_check_port_owner(const struct serio *);
+
+#else
+
+void i8042_lock_chip(void)
+{
+}
+
+void i8042_unlock_chip(void)
+{
+}
+
+int i8042_command(unsigned char *param, int command)
+{
+	return -ENOSYS;
+}
+
+bool i8042_check_port_owner(const struct serio *serio)
+{
+	return false;
+}
+
+#endif
 
 #endif
diff --git a/include/linux/input.h b/include/linux/input.h
index 8b3bc3e0d14..0ccfc30cd40 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -1123,7 +1123,7 @@ struct input_dev {
 	struct mutex mutex;
 
 	unsigned int users;
-	int going_away;
+	bool going_away;
 
 	struct device dev;
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 8e9e151f811..b78cf819495 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -84,7 +84,6 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
  * struct irqaction - per interrupt action descriptor
  * @handler:	interrupt handler function
  * @flags:	flags (see IRQF_* above)
- * @mask:	no comment as it is useless and about to be removed
  * @name:	name of the device
  * @dev_id:	cookie to identify the device
  * @next:	pointer to the next irqaction for shared interrupts
@@ -97,7 +96,6 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
 struct irqaction {
 	irq_handler_t handler;
 	unsigned long flags;
-	cpumask_t mask;
 	const char *name;
 	void *dev_id;
 	struct irqaction *next;
diff --git a/include/linux/libps2.h b/include/linux/libps2.h
index fcf5fbe6a50..79603a6c356 100644
--- a/include/linux/libps2.h
+++ b/include/linux/libps2.h
@@ -44,6 +44,8 @@ struct ps2dev {
 void ps2_init(struct ps2dev *ps2dev, struct serio *serio);
 int ps2_sendbyte(struct ps2dev *ps2dev, unsigned char byte, int timeout);
 void ps2_drain(struct ps2dev *ps2dev, int maxbytes, int timeout);
+void ps2_begin_command(struct ps2dev *ps2dev);
+void ps2_end_command(struct ps2dev *ps2dev);
 int __ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command);
 int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command);
 int ps2_handle_ack(struct ps2dev *ps2dev, unsigned char data);
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 691f59171c6..5126cceb6ae 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -57,6 +57,7 @@
 
 #ifdef __ASSEMBLY__
 
+#ifndef LINKER_SCRIPT
 #define ALIGN __ALIGN
 #define ALIGN_STR __ALIGN_STR
 
@@ -66,6 +67,7 @@
   ALIGN; \
   name:
 #endif
+#endif /* LINKER_SCRIPT */
 
 #ifndef WEAK
 #define WEAK(name)	   \
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e46a0734ab6..bf9213b2db8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -118,6 +118,9 @@ static inline bool mem_cgroup_disabled(void)
 
 extern bool mem_cgroup_oom_called(struct task_struct *task);
 void mem_cgroup_update_mapped_file_stat(struct page *page, int val);
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask, int nid,
+						int zid);
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct mem_cgroup;
 
@@ -276,6 +279,13 @@ static inline void mem_cgroup_update_mapped_file_stat(struct page *page,
 {
 }
 
+static inline
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask, int nid, int zid)
+{
+	return 0;
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b6eae5e3144..df08551cb0a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -695,11 +695,12 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_SIGBUS	0x0002
 #define VM_FAULT_MAJOR	0x0004
 #define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */
+#define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned page */
 
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 
-#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS)
+#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
 
 /*
  * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
@@ -791,8 +792,14 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 	unmap_mapping_range(mapping, holebegin, holelen, 0);
 }
 
-extern int vmtruncate(struct inode * inode, loff_t offset);
-extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
+extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
+extern int vmtruncate(struct inode *inode, loff_t offset);
+extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end);
+
+int truncate_inode_page(struct address_space *mapping, struct page *page);
+int generic_error_remove_page(struct address_space *mapping, struct page *page);
+
+int invalidate_inode_page(struct page *page);
 
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -1279,7 +1286,7 @@ int in_gate_area_no_task(unsigned long addr);
 #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
 #endif	/* __HAVE_ARCH_GATE_AREA */
 
-int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
+int drop_caches_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 			unsigned long lru_pages);
@@ -1308,5 +1315,12 @@ void vmemmap_populate_print_last(void);
 extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
 				 size_t size);
 extern void refund_locked_memory(struct mm_struct *mm, size_t size);
+
+extern void memory_failure(unsigned long pfn, int trapno);
+extern int __memory_failure(unsigned long pfn, int trapno, int ref);
+extern int sysctl_memory_failure_early_kill;
+extern int sysctl_memory_failure_recovery;
+extern atomic_long_t mce_bad_pages;
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0042090a4d7..21d6aa45206 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -240,6 +240,8 @@ struct mm_struct {
 
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
+	struct linux_binfmt *binfmt;
+
 	cpumask_t cpu_vm_mask;
 
 	/* Architecture-specific MM context */
@@ -259,11 +261,10 @@ struct mm_struct {
 	unsigned long flags; /* Must use atomic bitops to access the bits */
 
 	struct core_state *core_state; /* coredumping support */
-
-	/* aio bits */
+#ifdef CONFIG_AIO
 	spinlock_t		ioctx_lock;
 	struct hlist_head	ioctx_list;
-
+#endif
 #ifdef CONFIG_MM_OWNER
 	/*
 	 * "owner" points to a task that is regarded as the canonical
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 652ef01be58..6f7561730d8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -755,21 +755,20 @@ static inline int is_dma(struct zone *zone)
 
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
-struct file;
-int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 
+int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
-int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
-int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
+int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
-			struct file *, void __user *, size_t *, loff_t *);
+			void __user *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
-			struct file *, void __user *, size_t *, loff_t *);
+			void __user *, size_t *, loff_t *);
 
 extern int numa_zonelist_order_handler(struct ctl_table *, int,
-			struct file *, void __user *, size_t *, loff_t *);
+			void __user *, size_t *, loff_t *);
 extern char numa_zonelist_order[];
 #define NUMA_ZONELIST_ORDER_LEN 16	/* string buffer size */
 
diff --git a/include/linux/module.h b/include/linux/module.h
index 1c755b2f937..482efc865ac 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -128,7 +128,10 @@ extern struct module __this_module;
  */
 #define MODULE_LICENSE(_license) MODULE_INFO(license, _license)
 
-/* Author, ideally of form NAME[, NAME]*[ and NAME] */
+/*
+ * Author(s), use "Name <email>" or just "Name", for multiple
+ * authors use multiple MODULE_AUTHOR() statements/lines.
+ */
 #define MODULE_AUTHOR(_author) MODULE_INFO(author, _author)
   
 /* What your module does. */
@@ -308,10 +311,14 @@ struct module
 #endif
 
 #ifdef CONFIG_KALLSYMS
-	/* We keep the symbol and string tables for kallsyms. */
-	Elf_Sym *symtab;
-	unsigned int num_symtab;
-	char *strtab;
+	/*
+	 * We keep the symbol and string tables for kallsyms.
+	 * The core_* fields below are temporary, loader-only (they
+	 * could really be discarded after module init).
+	 */
+	Elf_Sym *symtab, *core_symtab;
+	unsigned int num_symtab, core_num_syms;
+	char *strtab, *core_strtab;
 
 	/* Section attributes */
 	struct module_sect_attrs *sect_attrs;
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 080f6ba9e73..ab5d3126831 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -187,6 +187,7 @@ extern struct sock *netlink_kernel_create(struct net *net,
 extern void netlink_kernel_release(struct sock *sk);
 extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
 extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
+extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_has_listeners(struct sock *sk, unsigned int group);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 13de789f0a5..6b202b17395 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -51,6 +51,9 @@
  * PG_buddy is set to indicate that the page is free and in the buddy system
  * (see mm/page_alloc.c).
  *
+ * PG_hwpoison indicates that a page got corrupted in hardware and contains
+ * data with incorrect ECC bits that triggered a machine check. Accessing is
+ * not safe since it may cause another machine check. Don't touch!
  */
 
 /*
@@ -102,6 +105,9 @@ enum pageflags {
 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
 	PG_uncached,		/* Page has been mapped as uncached */
 #endif
+#ifdef CONFIG_MEMORY_FAILURE
+	PG_hwpoison,		/* hardware poisoned page. Don't touch */
+#endif
 	__NR_PAGEFLAGS,
 
 	/* Filesystems */
@@ -269,6 +275,15 @@ PAGEFLAG(Uncached, uncached)
 PAGEFLAG_FALSE(Uncached)
 #endif
 
+#ifdef CONFIG_MEMORY_FAILURE
+PAGEFLAG(HWPoison, hwpoison)
+TESTSETFLAG(HWPoison, hwpoison)
+#define __PG_HWPOISON (1UL << PG_hwpoison)
+#else
+PAGEFLAG_FALSE(HWPoison)
+#define __PG_HWPOISON 0
+#endif
+
 static inline int PageUptodate(struct page *page)
 {
 	int ret = test_bit(PG_uptodate, &(page)->flags);
@@ -393,7 +408,7 @@ static inline void __ClearPageTail(struct page *page)
 	 1 << PG_private | 1 << PG_private_2 | \
 	 1 << PG_buddy	 | 1 << PG_writeback | 1 << PG_reserved | \
 	 1 << PG_slab	 | 1 << PG_swapcache | 1 << PG_active | \
-	 1 << PG_unevictable | __PG_MLOCKED)
+	 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index ada779f2417..4b938d4f3ac 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -38,6 +38,7 @@ enum {
 	PCG_LOCK,  /* page cgroup is locked */
 	PCG_CACHE, /* charged as cache */
 	PCG_USED, /* this object is in use. */
+	PCG_ACCT_LRU, /* page has been accounted for */
 };
 
 #define TESTPCGFLAG(uname, lname)			\
@@ -52,11 +53,23 @@ static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
 static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
 	{ clear_bit(PCG_##lname, &pc->flags);  }
 
+#define TESTCLEARPCGFLAG(uname, lname)			\
+static inline int TestClearPageCgroup##uname(struct page_cgroup *pc)	\
+	{ return test_and_clear_bit(PCG_##lname, &pc->flags);  }
+
 /* Cache flag is set only once (at allocation) */
 TESTPCGFLAG(Cache, CACHE)
+CLEARPCGFLAG(Cache, CACHE)
+SETPCGFLAG(Cache, CACHE)
 
 TESTPCGFLAG(Used, USED)
 CLEARPCGFLAG(Used, USED)
+SETPCGFLAG(Used, USED)
+
+SETPCGFLAG(AcctLRU, ACCT_LRU)
+CLEARPCGFLAG(AcctLRU, ACCT_LRU)
+TESTPCGFLAG(AcctLRU, ACCT_LRU)
+TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
 
 static inline int page_cgroup_nid(struct page_cgroup *pc)
 {
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 7803565aa87..da1fda8623e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2527,6 +2527,16 @@
 #define PCI_DEVICE_ID_INTEL_E7525_MCH	0x359e
 #define PCI_DEVICE_ID_INTEL_IOAT_CNB	0x360b
 #define PCI_DEVICE_ID_INTEL_FBD_CNB	0x360c
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF0	0x3710
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF1	0x3711
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF2	0x3712
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF3	0x3713
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF4	0x3714
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF5	0x3715
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF6	0x3716
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF7	0x3717
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF8	0x3718
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF9	0x3719
 #define PCI_DEVICE_ID_INTEL_ICH10_0	0x3a14
 #define PCI_DEVICE_ID_INTEL_ICH10_1	0x3a16
 #define PCI_DEVICE_ID_INTEL_ICH10_2	0x3a18
diff --git a/include/linux/phonet.h b/include/linux/phonet.h
index 1ef5a078183..e5126cff9b2 100644
--- a/include/linux/phonet.h
+++ b/include/linux/phonet.h
@@ -38,6 +38,7 @@
 #define PNPIPE_IFINDEX		2
 
 #define PNADDR_ANY		0
+#define PNADDR_BROADCAST	0xFC
 #define PNPORT_RESOURCE_ROUTING	0
 
 /* Values for PNPIPE_ENCAP option */
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 07bff666e65..931150566ad 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -88,4 +88,6 @@
 #define PR_TASK_PERF_EVENTS_DISABLE		31
 #define PR_TASK_PERF_EVENTS_ENABLE		32
 
+#define PR_MCE_KILL	33
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/relay.h b/include/linux/relay.h
index 953fc055e87..14a86bc7102 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -140,7 +140,7 @@ struct rchan_callbacks
 	 * cause relay_open() to create a single global buffer rather
 	 * than the default set of per-cpu buffers.
 	 *
-	 * See Documentation/filesystems/relayfs.txt for more info.
+	 * See Documentation/filesystems/relay.txt for more info.
 	 */
 	struct dentry *(*create_buf_file)(const char *filename,
 					  struct dentry *parent,
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 511f42fc681..731af71cddc 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -35,6 +35,10 @@ struct res_counter {
 	 */
 	unsigned long long limit;
 	/*
+	 * the limit that usage can be exceed
+	 */
+	unsigned long long soft_limit;
+	/*
 	 * the number of unsuccessful attempts to consume the resource
 	 */
 	unsigned long long failcnt;
@@ -87,6 +91,7 @@ enum {
 	RES_MAX_USAGE,
 	RES_LIMIT,
 	RES_FAILCNT,
+	RES_SOFT_LIMIT,
 };
 
 /*
@@ -109,7 +114,8 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
 int __must_check res_counter_charge_locked(struct res_counter *counter,
 		unsigned long val);
 int __must_check res_counter_charge(struct res_counter *counter,
-		unsigned long val, struct res_counter **limit_fail_at);
+		unsigned long val, struct res_counter **limit_fail_at,
+		struct res_counter **soft_limit_at);
 
 /*
  * uncharge - tell that some portion of the resource is released
@@ -122,7 +128,8 @@ int __must_check res_counter_charge(struct res_counter *counter,
  */
 
 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
-void res_counter_uncharge(struct res_counter *counter, unsigned long val);
+void res_counter_uncharge(struct res_counter *counter, unsigned long val,
+				bool *was_soft_limit_excess);
 
 static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
 {
@@ -132,6 +139,36 @@ static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
 	return false;
 }
 
+static inline bool res_counter_soft_limit_check_locked(struct res_counter *cnt)
+{
+	if (cnt->usage < cnt->soft_limit)
+		return true;
+
+	return false;
+}
+
+/**
+ * Get the difference between the usage and the soft limit
+ * @cnt: The counter
+ *
+ * Returns 0 if usage is less than or equal to soft limit
+ * The difference between usage and soft limit, otherwise.
+ */
+static inline unsigned long long
+res_counter_soft_limit_excess(struct res_counter *cnt)
+{
+	unsigned long long excess;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cnt->lock, flags);
+	if (cnt->usage <= cnt->soft_limit)
+		excess = 0;
+	else
+		excess = cnt->usage - cnt->soft_limit;
+	spin_unlock_irqrestore(&cnt->lock, flags);
+	return excess;
+}
+
 /*
  * Helper function to detect if the cgroup is within it's limit or
  * not. It's currently called from cgroup_rss_prepare()
@@ -147,6 +184,17 @@ static inline bool res_counter_check_under_limit(struct res_counter *cnt)
 	return ret;
 }
 
+static inline bool res_counter_check_under_soft_limit(struct res_counter *cnt)
+{
+	bool ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cnt->lock, flags);
+	ret = res_counter_soft_limit_check_locked(cnt);
+	spin_unlock_irqrestore(&cnt->lock, flags);
+	return ret;
+}
+
 static inline void res_counter_reset_max(struct res_counter *cnt)
 {
 	unsigned long flags;
@@ -180,4 +228,16 @@ static inline int res_counter_set_limit(struct res_counter *cnt,
 	return ret;
 }
 
+static inline int
+res_counter_set_soft_limit(struct res_counter *cnt,
+				unsigned long long soft_limit)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cnt->lock, flags);
+	cnt->soft_limit = soft_limit;
+	spin_unlock_irqrestore(&cnt->lock, flags);
+	return 0;
+}
+
 #endif
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 477841d29fc..cb0ba703260 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -81,7 +81,19 @@ static inline void page_dup_rmap(struct page *page)
  */
 int page_referenced(struct page *, int is_locked,
 			struct mem_cgroup *cnt, unsigned long *vm_flags);
-int try_to_unmap(struct page *, int ignore_refs);
+enum ttu_flags {
+	TTU_UNMAP = 0,			/* unmap mode */
+	TTU_MIGRATION = 1,		/* migration mode */
+	TTU_MUNLOCK = 2,		/* munlock mode */
+	TTU_ACTION_MASK = 0xff,
+
+	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
+	TTU_IGNORE_ACCESS = (1 << 9),	/* don't age */
+	TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+};
+#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
+
+int try_to_unmap(struct page *, enum ttu_flags flags);
 
 /*
  * Called from mm/filemap_xip.c to unmap empty zero page
@@ -108,6 +120,13 @@ int page_mkclean(struct page *);
  */
 int try_to_munlock(struct page *);
 
+/*
+ * Called by memory-failure.c to kill processes.
+ */
+struct anon_vma *page_lock_anon_vma(struct page *page);
+void page_unlock_anon_vma(struct anon_vma *anon_vma);
+int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbf2a3b4628..75e6e60bf58 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -309,7 +309,7 @@ extern void softlockup_tick(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
-				    struct file *filp, void __user *buffer,
+				    void __user *buffer,
 				    size_t *lenp, loff_t *ppos);
 extern unsigned int  softlockup_panic;
 extern int softlockup_thresh;
@@ -331,7 +331,7 @@ extern unsigned long sysctl_hung_task_check_count;
 extern unsigned long sysctl_hung_task_timeout_secs;
 extern unsigned long sysctl_hung_task_warnings;
 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
-					 struct file *filp, void __user *buffer,
+					 void __user *buffer,
 					 size_t *lenp, loff_t *ppos);
 #endif
 
@@ -1271,7 +1271,6 @@ struct task_struct {
 	struct mm_struct *mm, *active_mm;
 
 /* task state */
-	struct linux_binfmt *binfmt;
 	int exit_state;
 	int exit_code, exit_signal;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
@@ -1735,6 +1734,7 @@ extern cputime_t task_gtime(struct task_struct *p);
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
+#define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
 #define PF_SIGNALED	0x00000400	/* killed by a signal */
@@ -1754,6 +1754,7 @@ extern cputime_t task_gtime(struct task_struct *p);
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_THREAD_BOUND	0x04000000	/* Thread bound to specific cpu */
+#define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezeable */
@@ -1817,10 +1818,13 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
 	return 0;
 }
 #endif
+
+#ifndef CONFIG_CPUMASK_OFFSTACK
 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 {
 	return set_cpus_allowed_ptr(p, &new_mask);
 }
+#endif
 
 /*
  * Architectures can set this to 1 if they have specified
@@ -1903,7 +1907,7 @@ extern unsigned int sysctl_sched_time_avg;
 extern unsigned int sysctl_timer_migration;
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
-		struct file *file, void __user *buffer, size_t *length,
+		void __user *buffer, size_t *length,
 		loff_t *ppos);
 #endif
 #ifdef CONFIG_SCHED_DEBUG
@@ -1921,7 +1925,7 @@ extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 
 int sched_rt_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
 extern unsigned int sysctl_sched_compat_yield;
@@ -2056,6 +2060,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
 extern int do_notify_parent(struct task_struct *, int);
+extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern void force_sig_specific(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
@@ -2333,7 +2338,10 @@ static inline int signal_pending(struct task_struct *p)
 	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
 }
 
-extern int __fatal_signal_pending(struct task_struct *p);
+static inline int __fatal_signal_pending(struct task_struct *p)
+{
+	return unlikely(sigismember(&p->pending.signal, SIGKILL));
+}
 
 static inline int fatal_signal_pending(struct task_struct *p)
 {
diff --git a/include/linux/security.h b/include/linux/security.h
index d050b66ab9e..239e40d0450 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -133,7 +133,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
 		return PAGE_ALIGN(mmap_min_addr);
 	return hint;
 }
-extern int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp,
+extern int mmap_min_addr_handler(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
 #ifdef CONFIG_SECURITY
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 0c6a86b7959..8366d8f12e5 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -35,6 +35,44 @@ struct seq_operations {
 
 #define SEQ_SKIP 1
 
+/**
+ * seq_get_buf - get buffer to write arbitrary data to
+ * @m: the seq_file handle
+ * @bufp: the beginning of the buffer is stored here
+ *
+ * Return the number of bytes available in the buffer, or zero if
+ * there's no space.
+ */
+static inline size_t seq_get_buf(struct seq_file *m, char **bufp)
+{
+	BUG_ON(m->count > m->size);
+	if (m->count < m->size)
+		*bufp = m->buf + m->count;
+	else
+		*bufp = NULL;
+
+	return m->size - m->count;
+}
+
+/**
+ * seq_commit - commit data to the buffer
+ * @m: the seq_file handle
+ * @num: the number of bytes to commit
+ *
+ * Commit @num bytes of data written to a buffer previously acquired
+ * by seq_buf_get.  To signal an error condition, or that the data
+ * didn't fit in the available space, pass a negative @num value.
+ */
+static inline void seq_commit(struct seq_file *m, int num)
+{
+	if (num < 0) {
+		m->count = m->size;
+	} else {
+		BUG_ON(m->count + num > m->size);
+		m->count += num;
+	}
+}
+
 char *mangle_path(char *s, char *p, char *esc);
 int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
diff --git a/include/linux/signal.h b/include/linux/signal.h
index c7552836bd9..ab9272cc270 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -233,6 +233,8 @@ static inline int valid_signal(unsigned long sig)
 }
 
 extern int next_signal(struct sigpending *pending, sigset_t *mask);
+extern int do_send_sig_info(int sig, struct siginfo *info,
+				struct task_struct *p, bool group);
 extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
 extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
 extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 9e3d8af0920..39c64bae776 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -73,15 +73,6 @@ int smp_call_function(void(*func)(void *info), void *info, int wait);
 void smp_call_function_many(const struct cpumask *mask,
 			    void (*func)(void *info), void *info, bool wait);
 
-/* Deprecated: Use smp_call_function_many which takes a pointer to the mask. */
-static inline int
-smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info,
-		       int wait)
-{
-	smp_call_function_many(&mask, func, info, wait);
-	return 0;
-}
-
 void __smp_call_function_single(int cpuid, struct call_single_data *data,
 				int wait);
 
@@ -144,8 +135,6 @@ static inline int up_smp_call_function(void (*func)(void *), void *info)
 static inline void smp_send_reschedule(int cpu) { }
 #define num_booting_cpus()			1
 #define smp_prepare_boot_cpu()			do {} while (0)
-#define smp_call_function_mask(mask, func, info, wait) \
-			(up_smp_call_function(func, info))
 #define smp_call_function_many(mask, func, info, wait) \
 			(up_smp_call_function(func, info))
 static inline void init_call_single_data(void)
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 7da466ba4b0..f5cc0898bc5 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -11,6 +11,7 @@
 
 #include <linux/uio.h>
 #include <asm/byteorder.h>
+#include <asm/unaligned.h>
 #include <linux/scatterlist.h>
 
 /*
@@ -117,14 +118,14 @@ static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int le
 static inline __be32 *
 xdr_encode_hyper(__be32 *p, __u64 val)
 {
-	*(__be64 *)p = cpu_to_be64(val);
+	put_unaligned_be64(val, p);
 	return p + 2;
 }
 
 static inline __be32 *
 xdr_decode_hyper(__be32 *p, __u64 *valp)
 {
-	*valp = be64_to_cpup((__be64 *)p);
+	*valp = get_unaligned_be64(p);
 	return p + 2;
 }
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6c990e658f4..4ec90019c1a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -34,16 +34,38 @@ static inline int current_is_kswapd(void)
  * the type/offset into the pte as 5/27 as well.
  */
 #define MAX_SWAPFILES_SHIFT	5
-#ifndef CONFIG_MIGRATION
-#define MAX_SWAPFILES		(1 << MAX_SWAPFILES_SHIFT)
+
+/*
+ * Use some of the swap files numbers for other purposes. This
+ * is a convenient way to hook into the VM to trigger special
+ * actions on faults.
+ */
+
+/*
+ * NUMA node memory migration support
+ */
+#ifdef CONFIG_MIGRATION
+#define SWP_MIGRATION_NUM 2
+#define SWP_MIGRATION_READ	(MAX_SWAPFILES + SWP_HWPOISON_NUM)
+#define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
 #else
-/* Use last two entries for page migration swap entries */
-#define MAX_SWAPFILES		((1 << MAX_SWAPFILES_SHIFT)-2)
-#define SWP_MIGRATION_READ	MAX_SWAPFILES
-#define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + 1)
+#define SWP_MIGRATION_NUM 0
 #endif
 
 /*
+ * Handling of hardware poisoned pages with memory corruption.
+ */
+#ifdef CONFIG_MEMORY_FAILURE
+#define SWP_HWPOISON_NUM 1
+#define SWP_HWPOISON		MAX_SWAPFILES
+#else
+#define SWP_HWPOISON_NUM 0
+#endif
+
+#define MAX_SWAPFILES \
+	((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+
+/*
  * Magic header for a swap area. The first part of the union is
  * what the swap magic looks like for the old (limited to 128MB)
  * swap area format, the second part of the union adds - in the
@@ -217,6 +239,11 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 						  gfp_t gfp_mask, bool noswap,
 						  unsigned int swappiness);
+extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+						gfp_t gfp_mask, bool noswap,
+						unsigned int swappiness,
+						struct zone *zone,
+						int nid);
 extern int __isolate_lru_page(struct page *page, int mode, int file);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
@@ -240,7 +267,7 @@ extern int page_evictable(struct page *page, struct vm_area_struct *vma);
 extern void scan_mapping_unevictable_pages(struct address_space *);
 
 extern unsigned long scan_unevictable_pages;
-extern int scan_unevictable_handler(struct ctl_table *, int, struct file *,
+extern int scan_unevictable_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 extern int scan_unevictable_register_node(struct node *node);
 extern void scan_unevictable_unregister_node(struct node *node);
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 6ec39ab27b4..cd42e30b7c6 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -131,3 +131,41 @@ static inline int is_write_migration_entry(swp_entry_t entry)
 
 #endif
 
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Support for hardware poisoned pages
+ */
+static inline swp_entry_t make_hwpoison_entry(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+	return swp_entry(SWP_HWPOISON, page_to_pfn(page));
+}
+
+static inline int is_hwpoison_entry(swp_entry_t entry)
+{
+	return swp_type(entry) == SWP_HWPOISON;
+}
+#else
+
+static inline swp_entry_t make_hwpoison_entry(struct page *page)
+{
+	return swp_entry(0, 0);
+}
+
+static inline int is_hwpoison_entry(swp_entry_t swp)
+{
+	return 0;
+}
+#endif
+
+#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
+static inline int non_swap_entry(swp_entry_t entry)
+{
+	return swp_type(entry) >= MAX_SWAPFILES;
+}
+#else
+static inline int non_swap_entry(swp_entry_t entry)
+{
+	return 0;
+}
+#endif
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index e76d3b22a46..1e4743ee683 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -29,7 +29,6 @@
 #include <linux/types.h>
 #include <linux/compiler.h>
 
-struct file;
 struct completion;
 
 #define CTL_MAXNAME 10		/* how many path components do we allow in a
@@ -977,25 +976,25 @@ typedef int ctl_handler (struct ctl_table *table,
 			 void __user *oldval, size_t __user *oldlenp,
 			 void __user *newval, size_t newlen);
 
-typedef int proc_handler (struct ctl_table *ctl, int write, struct file * filp,
+typedef int proc_handler (struct ctl_table *ctl, int write,
 			  void __user *buffer, size_t *lenp, loff_t *ppos);
 
-extern int proc_dostring(struct ctl_table *, int, struct file *,
+extern int proc_dostring(struct ctl_table *, int,
 			 void __user *, size_t *, loff_t *);
-extern int proc_dointvec(struct ctl_table *, int, struct file *,
+extern int proc_dointvec(struct ctl_table *, int,
 			 void __user *, size_t *, loff_t *);
-extern int proc_dointvec_minmax(struct ctl_table *, int, struct file *,
+extern int proc_dointvec_minmax(struct ctl_table *, int,
 				void __user *, size_t *, loff_t *);
-extern int proc_dointvec_jiffies(struct ctl_table *, int, struct file *,
+extern int proc_dointvec_jiffies(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
-extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, struct file *,
+extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
-extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, struct file *,
+extern int proc_dointvec_ms_jiffies(struct ctl_table *, int,
 				    void __user *, size_t *, loff_t *);
-extern int proc_doulongvec_minmax(struct ctl_table *, int, struct file *,
+extern int proc_doulongvec_minmax(struct ctl_table *, int,
 				  void __user *, size_t *, loff_t *);
 extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
-				      struct file *, void __user *, size_t *, loff_t *);
+				      void __user *, size_t *, loff_t *);
 
 extern int do_sysctl (int __user *name, int nlen,
 		      void __user *oldval, size_t __user *oldlenp,
diff --git a/include/linux/time.h b/include/linux/time.h
index 56787c09334..fe04e5ef6a5 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -155,6 +155,34 @@ extern void timekeeping_leap_insert(int leapsecond);
 struct tms;
 extern void do_sys_times(struct tms *);
 
+/*
+ * Similar to the struct tm in userspace <time.h>, but it needs to be here so
+ * that the kernel source is self contained.
+ */
+struct tm {
+	/*
+	 * the number of seconds after the minute, normally in the range
+	 * 0 to 59, but can be up to 60 to allow for leap seconds
+	 */
+	int tm_sec;
+	/* the number of minutes after the hour, in the range 0 to 59*/
+	int tm_min;
+	/* the number of hours past midnight, in the range 0 to 23 */
+	int tm_hour;
+	/* the day of the month, in the range 1 to 31 */
+	int tm_mday;
+	/* the number of months since January, in the range 0 to 11 */
+	int tm_mon;
+	/* the number of years since 1900 */
+	long tm_year;
+	/* the number of days since Sunday, in the range 0 to 6 */
+	int tm_wday;
+	/* the number of days since January 1, in the range 0 to 365 */
+	int tm_yday;
+};
+
+void time_to_tm(time_t totalsecs, int offset, struct tm *result);
+
 /**
  * timespec_to_ns - Convert timespec to nanoseconds
  * @ts:		pointer to the timespec variable to be converted
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 809b26c0709..fc0bf3edeb6 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -211,12 +211,6 @@ int arch_update_cpu_topology(void);
 #ifndef topology_core_id
 #define topology_core_id(cpu)			((void)(cpu), 0)
 #endif
-#ifndef topology_thread_siblings
-#define topology_thread_siblings(cpu)		cpumask_of_cpu(cpu)
-#endif
-#ifndef topology_core_siblings
-#define topology_core_siblings(cpu)		cpumask_of_cpu(cpu)
-#endif
 #ifndef topology_thread_cpumask
 #define topology_thread_cpumask(cpu)		cpumask_of(cpu)
 #endif
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 17ba82efa48..1eb44a924e5 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -1,7 +1,7 @@
 /*
  * Tracing hooks
  *
- * Copyright (C) 2008 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2008-2009 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -463,22 +463,38 @@ static inline int tracehook_get_signal(struct task_struct *task,
 
 /**
  * tracehook_notify_jctl - report about job control stop/continue
- * @notify:		nonzero if this is the last thread in the group to stop
+ * @notify:		zero, %CLD_STOPPED or %CLD_CONTINUED
  * @why:		%CLD_STOPPED or %CLD_CONTINUED
  *
  * This is called when we might call do_notify_parent_cldstop().
- * It's called when about to stop for job control; we are already in
- * %TASK_STOPPED state, about to call schedule().  It's also called when
- * a delayed %CLD_STOPPED or %CLD_CONTINUED report is ready to be made.
  *
- * Return nonzero to generate a %SIGCHLD with @why, which is
- * normal if @notify is nonzero.
+ * @notify is zero if we would not ordinarily send a %SIGCHLD,
+ * or is the %CLD_STOPPED or %CLD_CONTINUED .si_code for %SIGCHLD.
  *
- * Called with no locks held.
+ * @why is %CLD_STOPPED when about to stop for job control;
+ * we are already in %TASK_STOPPED state, about to call schedule().
+ * It might also be that we have just exited (check %PF_EXITING),
+ * but need to report that a group-wide stop is complete.
+ *
+ * @why is %CLD_CONTINUED when waking up after job control stop and
+ * ready to make a delayed @notify report.
+ *
+ * Return the %CLD_* value for %SIGCHLD, or zero to generate no signal.
+ *
+ * Called with the siglock held.
  */
 static inline int tracehook_notify_jctl(int notify, int why)
 {
-	return notify || (current->ptrace & PT_PTRACED);
+	return notify ?: (current->ptrace & PT_PTRACED) ? why : 0;
+}
+
+/**
+ * tracehook_finish_jctl - report about return from job control stop
+ *
+ * This is called by do_signal_stop() after wakeup.
+ */
+static inline void tracehook_finish_jctl(void)
+{
 }
 
 #define DEATH_REAP			-1
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 63a3f7a8058..660a9de96f8 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -4,7 +4,7 @@
 /*
  * Kernel Tracepoint API.
  *
- * See Documentation/tracepoint.txt.
+ * See Documentation/trace/tracepoints.txt.
  *
  * (C) Copyright 2008 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
  *
diff --git a/include/linux/unaligned/be_byteshift.h b/include/linux/unaligned/be_byteshift.h
index 46dd12c5709..9356b24223a 100644
--- a/include/linux/unaligned/be_byteshift.h
+++ b/include/linux/unaligned/be_byteshift.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_UNALIGNED_BE_BYTESHIFT_H
 #define _LINUX_UNALIGNED_BE_BYTESHIFT_H
 
-#include <linux/kernel.h>
+#include <linux/types.h>
 
 static inline u16 __get_unaligned_be16(const u8 *p)
 {
diff --git a/include/linux/unaligned/le_byteshift.h b/include/linux/unaligned/le_byteshift.h
index 59777e951ba..be376fb79b6 100644
--- a/include/linux/unaligned/le_byteshift.h
+++ b/include/linux/unaligned/le_byteshift.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_UNALIGNED_LE_BYTESHIFT_H
 #define _LINUX_UNALIGNED_LE_BYTESHIFT_H
 
-#include <linux/kernel.h>
+#include <linux/types.h>
 
 static inline u16 __get_unaligned_le16(const u8 *p)
 {
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index bb69e256cd1..f8147305205 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -89,6 +89,7 @@ struct driver_info {
 #define FLAG_FRAMING_AX 0x0040		/* AX88772/178 packets */
 #define FLAG_WLAN	0x0080		/* use "wlan%d" names */
 #define FLAG_AVOID_UNLINK_URBS 0x0100	/* don't unlink urbs at usbnet_stop() */
+#define FLAG_SEND_ZLP	0x0200		/* hw requires ZLPs are sent */
 
 
 	/* init device ... can sleep, or cause probe() failure */
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 3656b300de3..69f39974c04 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -36,7 +36,6 @@ struct new_utsname {
 #include <linux/kref.h>
 #include <linux/nsproxy.h>
 #include <linux/err.h>
-#include <asm/atomic.h>
 
 struct uts_namespace {
 	struct kref kref;
diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h
index 923f9040ea2..2dfaa293ae8 100644
--- a/include/linux/vgaarb.h
+++ b/include/linux/vgaarb.h
@@ -1,5 +1,6 @@
 /*
- * vgaarb.c
+ * The VGA aribiter manages VGA space routing and VGA resource decode to
+ * allow multiple VGA devices to be used in a system in a safe way.
  *
  * (C) Copyright 2005 Benjamin Herrenschmidt <benh@kernel.crashing.org>
  * (C) Copyright 2007 Paulo R. Zanoni <przanoni@gmail.com>
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 75cf58666ff..66ebddcff66 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -110,21 +110,20 @@ extern int laptop_mode;
 extern unsigned long determine_dirtyable_memory(void);
 
 extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 extern int dirty_background_bytes_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 extern int dirty_ratio_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 extern int dirty_bytes_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
 struct ctl_table;
-struct file;
-int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
+int dirty_writeback_centisecs_handler(struct ctl_table *, int,
 				      void __user *, size_t *, loff_t *);
 
 void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
index b77c1478c99..a7fb54808a2 100644
--- a/include/net/9p/9p.h
+++ b/include/net/9p/9p.h
@@ -38,6 +38,8 @@
  * @P9_DEBUG_SLABS: memory management tracing
  * @P9_DEBUG_FCALL: verbose dump of protocol messages
  * @P9_DEBUG_FID: fid allocation/deallocation tracking
+ * @P9_DEBUG_PKT: packet marshalling/unmarshalling
+ * @P9_DEBUG_FSC: FS-cache tracing
  *
  * These flags are passed at mount time to turn on various levels of
  * verbosity and tracing which will be output to the system logs.
@@ -54,6 +56,7 @@ enum p9_debug_flags {
 	P9_DEBUG_FCALL =	(1<<8),
 	P9_DEBUG_FID =		(1<<9),
 	P9_DEBUG_PKT =		(1<<10),
+	P9_DEBUG_FSC =		(1<<11),
 };
 
 #ifdef CONFIG_NET_9P_DEBUG
diff --git a/include/net/ip.h b/include/net/ip.h
index 72c36926c26..5b26a0bd178 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -399,7 +399,7 @@ extern void	ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
  * fed into the routing cache should use these handlers.
  */
 int ipv4_doint_and_flush(ctl_table *ctl, int write,
-			 struct file* filp, void __user *buffer,
+			 void __user *buffer,
 			 size_t *lenp, loff_t *ppos);
 int ipv4_doint_and_flush_strategy(ctl_table *table,
 				  void __user *oldval, size_t __user *oldlenp,
diff --git a/include/net/ipip.h b/include/net/ipip.h
index 5d3036fa151..76e3ea6e2fe 100644
--- a/include/net/ipip.h
+++ b/include/net/ipip.h
@@ -12,7 +12,6 @@ struct ip_tunnel
 	struct ip_tunnel	*next;
 	struct net_device	*dev;
 
-	int			recursion;	/* Depth of hard_start_xmit recursion */
 	int			err_count;	/* Number of arrived ICMP errors */
 	unsigned long		err_time;	/* Time when the last ICMP error arrived */
 
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 1459ed3e269..f76f22d0572 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -55,7 +55,6 @@ enum {
 #include <net/neighbour.h>
 
 struct ctl_table;
-struct file;
 struct inet6_dev;
 struct net_device;
 struct net_proto_family;
@@ -139,7 +138,6 @@ extern int			igmp6_event_report(struct sk_buff *skb);
 #ifdef CONFIG_SYSCTL
 extern int 			ndisc_ifinfo_sysctl_change(struct ctl_table *ctl,
 							   int write,
-							   struct file * filp,
 							   void __user *buffer,
 							   size_t *lenp,
 							   loff_t *ppos);
diff --git a/init/Kconfig b/init/Kconfig
index 0aa6579504c..c7bac39d6c6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1006,14 +1006,6 @@ config SLUB_DEBUG
 	  SLUB sysfs support. /sys/slab will not exist and there will be
 	  no support for cache validation etc.
 
-config STRIP_ASM_SYMS
-	bool "Strip assembler-generated symbols during link"
-	default n
-	help
-	  Strip internal assembler-generated symbols during a link (symbols
-	  that look like '.Lxxx') so they don't pollute the output of
-	  get_wchan() and suchlike.
-
 config COMPAT_BRK
 	bool "Disable heap randomization"
 	default y
diff --git a/init/main.c b/init/main.c
index 6107223124e..7449819a480 100644
--- a/init/main.c
+++ b/init/main.c
@@ -18,7 +18,6 @@
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
-#include <linux/utsname.h>
 #include <linux/ioport.h>
 #include <linux/init.h>
 #include <linux/smp_lock.h>
@@ -360,11 +359,6 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { }
 
 #else
 
-#if NR_CPUS > BITS_PER_LONG
-cpumask_t cpu_mask_all __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_mask_all);
-#endif
-
 /* Setup number of possible processor ids */
 int nr_cpu_ids __read_mostly = NR_CPUS;
 EXPORT_SYMBOL(nr_cpu_ids);
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 40eab7314ae..7d3704750ef 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -27,18 +27,18 @@ static void *get_ipc(ctl_table *table)
 }
 
 #ifdef CONFIG_PROC_SYSCTL
-static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
+static int proc_ipc_dointvec(ctl_table *table, int write,
 	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table ipc_table;
 	memcpy(&ipc_table, table, sizeof(ipc_table));
 	ipc_table.data = get_ipc(table);
 
-	return proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos);
+	return proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
 }
 
 static int proc_ipc_callback_dointvec(ctl_table *table, int write,
-	struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table ipc_table;
 	size_t lenp_bef = *lenp;
@@ -47,7 +47,7 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write,
 	memcpy(&ipc_table, table, sizeof(ipc_table));
 	ipc_table.data = get_ipc(table);
 
-	rc = proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos);
+	rc = proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
 
 	if (write && !rc && lenp_bef == *lenp)
 		/*
@@ -61,13 +61,13 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write,
 }
 
 static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
-	struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table ipc_table;
 	memcpy(&ipc_table, table, sizeof(ipc_table));
 	ipc_table.data = get_ipc(table);
 
-	return proc_doulongvec_minmax(&ipc_table, write, filp, buffer,
+	return proc_doulongvec_minmax(&ipc_table, write, buffer,
 					lenp, ppos);
 }
 
@@ -95,7 +95,7 @@ static void ipc_auto_callback(int val)
 }
 
 static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
-	struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table ipc_table;
 	size_t lenp_bef = *lenp;
@@ -106,7 +106,7 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
 	ipc_table.data = get_ipc(table);
 	oldval = *((int *)(ipc_table.data));
 
-	rc = proc_dointvec_minmax(&ipc_table, write, filp, buffer, lenp, ppos);
+	rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
 
 	if (write && !rc && lenp_bef == *lenp) {
 		int newval = *((int *)(ipc_table.data));
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 24ae46dfe45..8a058711fc1 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -31,24 +31,24 @@ static void *get_mq(ctl_table *table)
 	return which;
 }
 
-static int proc_mq_dointvec(ctl_table *table, int write, struct file *filp,
+static int proc_mq_dointvec(ctl_table *table, int write,
 	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table mq_table;
 	memcpy(&mq_table, table, sizeof(mq_table));
 	mq_table.data = get_mq(table);
 
-	return proc_dointvec(&mq_table, write, filp, buffer, lenp, ppos);
+	return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
 }
 
 static int proc_mq_dointvec_minmax(ctl_table *table, int write,
-	struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table mq_table;
 	memcpy(&mq_table, table, sizeof(mq_table));
 	mq_table.data = get_mq(table);
 
-	return proc_dointvec_minmax(&mq_table, write, filp, buffer,
+	return proc_dointvec_minmax(&mq_table, write, buffer,
 					lenp, ppos);
 }
 #else
diff --git a/kernel/Makefile b/kernel/Makefile
index 187c89b4783..b8d4cd8ac0b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
diff --git a/kernel/audit.c b/kernel/audit.c
index defc2e6f1e3..5feed232be9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		break;
 	}
 	case AUDIT_SIGNAL_INFO:
-		err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
-		if (err)
-			return err;
+		len = 0;
+		if (audit_sig_sid) {
+			err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
+			if (err)
+				return err;
+		}
 		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
 		if (!sig_data) {
-			security_release_secctx(ctx, len);
+			if (audit_sig_sid)
+				security_release_secctx(ctx, len);
 			return -ENOMEM;
 		}
 		sig_data->uid = audit_sig_uid;
 		sig_data->pid = audit_sig_pid;
-		memcpy(sig_data->ctx, ctx, len);
-		security_release_secctx(ctx, len);
+		if (audit_sig_sid) {
+			memcpy(sig_data->ctx, ctx, len);
+			security_release_secctx(ctx, len);
+		}
 		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
 				0, 0, sig_data, sizeof(*sig_data) + len);
 		kfree(sig_data);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0e96dbc60ea..cc7e87936cb 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -45,8 +45,8 @@
 
 struct audit_watch {
 	atomic_t		count;	/* reference count */
-	char			*path;	/* insertion path */
 	dev_t			dev;	/* associated superblock device */
+	char			*path;	/* insertion path */
 	unsigned long		ino;	/* associated inode number */
 	struct audit_parent	*parent; /* associated parent */
 	struct list_head	wlist;	/* entry in parent->watches list */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 68d3c6a0ecd..267e484f019 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -168,12 +168,12 @@ struct audit_context {
 	int		    in_syscall;	/* 1 if task is in a syscall */
 	enum audit_state    state, current_state;
 	unsigned int	    serial;     /* serial number for record */
-	struct timespec	    ctime;      /* time of syscall entry */
 	int		    major;      /* syscall number */
+	struct timespec	    ctime;      /* time of syscall entry */
 	unsigned long	    argv[4];    /* syscall arguments */
-	int		    return_valid; /* return code is valid */
 	long		    return_code;/* syscall return code */
 	u64		    prio;
+	int		    return_valid; /* return code is valid */
 	int		    name_count;
 	struct audit_names  names[AUDIT_NAMES];
 	char *		    filterkey;	/* key for rule that triggered record */
@@ -198,8 +198,8 @@ struct audit_context {
 	char		    target_comm[TASK_COMM_LEN];
 
 	struct audit_tree_refs *trees, *first_trees;
-	int tree_count;
 	struct list_head killed_trees;
+	int tree_count;
 
 	int type;
 	union {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cd83d9933b6..7ccba4bc5e3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
  */
 
 #include <linux/cgroup.h>
+#include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/kernel.h>
@@ -48,6 +49,8 @@
 #include <linux/namei.h>
 #include <linux/smp_lock.h>
 #include <linux/pid_namespace.h>
+#include <linux/idr.h>
+#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 
 #include <asm/atomic.h>
 
@@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = {
 #include <linux/cgroup_subsys.h>
 };
 
+#define MAX_CGROUP_ROOT_NAMELEN 64
+
 /*
  * A cgroupfs_root represents the root of a cgroup hierarchy,
  * and may be associated with a superblock to form an active
@@ -74,6 +79,9 @@ struct cgroupfs_root {
 	 */
 	unsigned long subsys_bits;
 
+	/* Unique id for this hierarchy. */
+	int hierarchy_id;
+
 	/* The bitmask of subsystems currently attached to this hierarchy */
 	unsigned long actual_subsys_bits;
 
@@ -94,6 +102,9 @@ struct cgroupfs_root {
 
 	/* The path to use for release notifications. */
 	char release_agent_path[PATH_MAX];
+
+	/* The name for this hierarchy - may be empty */
+	char name[MAX_CGROUP_ROOT_NAMELEN];
 };
 
 /*
@@ -141,6 +152,10 @@ struct css_id {
 static LIST_HEAD(roots);
 static int root_count;
 
+static DEFINE_IDA(hierarchy_ida);
+static int next_hierarchy_id;
+static DEFINE_SPINLOCK(hierarchy_id_lock);
+
 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
 #define dummytop (&rootnode.top_cgroup)
 
@@ -201,6 +216,7 @@ struct cg_cgroup_link {
 	 * cgroup, anchored on cgroup->css_sets
 	 */
 	struct list_head cgrp_link_list;
+	struct cgroup *cgrp;
 	/*
 	 * List running through cg_cgroup_links pointing at a
 	 * single css_set object, anchored on css_set->cg_links
@@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
 static DEFINE_RWLOCK(css_set_lock);
 static int css_set_count;
 
-/* hash table for cgroup groups. This improves the performance to
- * find an existing css_set */
+/*
+ * hash table for cgroup groups. This improves the performance to find
+ * an existing css_set. This hash doesn't (currently) take into
+ * account cgroups in empty hierarchies.
+ */
 #define CSS_SET_HASH_BITS	7
 #define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
 static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
@@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
 	return &css_set_table[index];
 }
 
+static void free_css_set_rcu(struct rcu_head *obj)
+{
+	struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+	kfree(cg);
+}
+
 /* We don't maintain the lists running through each css_set to its
  * task until after the first call to cgroup_iter_start(). This
  * reduces the fork()/exit() overhead for people who have cgroups
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
-/* When we create or destroy a css_set, the operation simply
- * takes/releases a reference count on all the cgroups referenced
- * by subsystems in this css_set. This can end up multiple-counting
- * some cgroups, but that's OK - the ref-count is just a
- * busy/not-busy indicator; ensuring that we only count each cgroup
- * once would require taking a global lock to ensure that no
- * subsystems moved between hierarchies while we were doing so.
- *
- * Possible TODO: decide at boot time based on the number of
- * registered subsystems and the number of CPUs or NUMA nodes whether
- * it's better for performance to ref-count every subsystem, or to
- * take a global lock and only add one ref count to each hierarchy.
- */
-
-/*
- * unlink a css_set from the list and free it
- */
-static void unlink_css_set(struct css_set *cg)
+static void __put_css_set(struct css_set *cg, int taskexit)
 {
 	struct cg_cgroup_link *link;
 	struct cg_cgroup_link *saved_link;
-
-	hlist_del(&cg->hlist);
-	css_set_count--;
-
-	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
-				 cg_link_list) {
-		list_del(&link->cg_link_list);
-		list_del(&link->cgrp_link_list);
-		kfree(link);
-	}
-}
-
-static void __put_css_set(struct css_set *cg, int taskexit)
-{
-	int i;
 	/*
 	 * Ensure that the refcount doesn't hit zero while any readers
 	 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 		write_unlock(&css_set_lock);
 		return;
 	}
-	unlink_css_set(cg);
-	write_unlock(&css_set_lock);
 
-	rcu_read_lock();
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
+	/* This css_set is dead. unlink it and release cgroup refcounts */
+	hlist_del(&cg->hlist);
+	css_set_count--;
+
+	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+				 cg_link_list) {
+		struct cgroup *cgrp = link->cgrp;
+		list_del(&link->cg_link_list);
+		list_del(&link->cgrp_link_list);
 		if (atomic_dec_and_test(&cgrp->count) &&
 		    notify_on_release(cgrp)) {
 			if (taskexit)
 				set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
+
+		kfree(link);
 	}
-	rcu_read_unlock();
-	kfree(cg);
+
+	write_unlock(&css_set_lock);
+	call_rcu(&cg->rcu_head, free_css_set_rcu);
 }
 
 /*
@@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg)
 }
 
 /*
+ * compare_css_sets - helper function for find_existing_css_set().
+ * @cg: candidate css_set being tested
+ * @old_cg: existing css_set for a task
+ * @new_cgrp: cgroup that's being entered by the task
+ * @template: desired set of css pointers in css_set (pre-calculated)
+ *
+ * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * which "new_cgrp" belongs to, for which it should match "new_cgrp".
+ */
+static bool compare_css_sets(struct css_set *cg,
+			     struct css_set *old_cg,
+			     struct cgroup *new_cgrp,
+			     struct cgroup_subsys_state *template[])
+{
+	struct list_head *l1, *l2;
+
+	if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
+		/* Not all subsystems matched */
+		return false;
+	}
+
+	/*
+	 * Compare cgroup pointers in order to distinguish between
+	 * different cgroups in heirarchies with no subsystems. We
+	 * could get by with just this check alone (and skip the
+	 * memcmp above) but on most setups the memcmp check will
+	 * avoid the need for this more expensive check on almost all
+	 * candidates.
+	 */
+
+	l1 = &cg->cg_links;
+	l2 = &old_cg->cg_links;
+	while (1) {
+		struct cg_cgroup_link *cgl1, *cgl2;
+		struct cgroup *cg1, *cg2;
+
+		l1 = l1->next;
+		l2 = l2->next;
+		/* See if we reached the end - both lists are equal length. */
+		if (l1 == &cg->cg_links) {
+			BUG_ON(l2 != &old_cg->cg_links);
+			break;
+		} else {
+			BUG_ON(l2 == &old_cg->cg_links);
+		}
+		/* Locate the cgroups associated with these links. */
+		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
+		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
+		cg1 = cgl1->cgrp;
+		cg2 = cgl2->cgrp;
+		/* Hierarchies should be linked in the same order. */
+		BUG_ON(cg1->root != cg2->root);
+
+		/*
+		 * If this hierarchy is the hierarchy of the cgroup
+		 * that's changing, then we need to check that this
+		 * css_set points to the new cgroup; if it's any other
+		 * hierarchy, then this css_set should point to the
+		 * same cgroup as the old css_set.
+		 */
+		if (cg1->root == new_cgrp->root) {
+			if (cg1 != new_cgrp)
+				return false;
+		} else {
+			if (cg1 != cg2)
+				return false;
+		}
+	}
+	return true;
+}
+
+/*
  * find_existing_css_set() is a helper for
  * find_css_set(), and checks to see whether an existing
  * css_set is suitable.
@@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set(
 
 	hhead = css_set_hash(template);
 	hlist_for_each_entry(cg, node, hhead, hlist) {
-		if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
-			/* All subsystems matched */
-			return cg;
-		}
+		if (!compare_css_sets(cg, oldcg, cgrp, template))
+			continue;
+
+		/* This css_set matches what we need */
+		return cg;
 	}
 
 	/* No existing cgroup group matched */
@@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links,
 	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
 				cgrp_link_list);
 	link->cg = cg;
+	link->cgrp = cgrp;
+	atomic_inc(&cgrp->count);
 	list_move(&link->cgrp_link_list, &cgrp->css_sets);
-	list_add(&link->cg_link_list, &cg->cg_links);
+	/*
+	 * Always add links to the tail of the list so that the list
+	 * is sorted by order of hierarchy creation
+	 */
+	list_add_tail(&link->cg_link_list, &cg->cg_links);
 }
 
 /*
@@ -451,11 +530,11 @@ static struct css_set *find_css_set(
 {
 	struct css_set *res;
 	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-	int i;
 
 	struct list_head tmp_cg_links;
 
 	struct hlist_head *hhead;
+	struct cg_cgroup_link *link;
 
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
@@ -489,20 +568,12 @@ static struct css_set *find_css_set(
 
 	write_lock(&css_set_lock);
 	/* Add reference counts and links from the new css_set. */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup *cgrp = res->subsys[i]->cgroup;
-		struct cgroup_subsys *ss = subsys[i];
-		atomic_inc(&cgrp->count);
-		/*
-		 * We want to add a link once per cgroup, so we
-		 * only do it for the first subsystem in each
-		 * hierarchy
-		 */
-		if (ss->root->subsys_list.next == &ss->sibling)
-			link_css_set(&tmp_cg_links, res, cgrp);
+	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+		struct cgroup *c = link->cgrp;
+		if (c->root == cgrp->root)
+			c = cgrp;
+		link_css_set(&tmp_cg_links, res, c);
 	}
-	if (list_empty(&rootnode.subsys_list))
-		link_css_set(&tmp_cg_links, res, dummytop);
 
 	BUG_ON(!list_empty(&tmp_cg_links));
 
@@ -518,6 +589,41 @@ static struct css_set *find_css_set(
 }
 
 /*
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with cgroup_mutex held.
+ */
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+					    struct cgroupfs_root *root)
+{
+	struct css_set *css;
+	struct cgroup *res = NULL;
+
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+	read_lock(&css_set_lock);
+	/*
+	 * No need to lock the task - since we hold cgroup_mutex the
+	 * task can't change groups, so the only thing that can happen
+	 * is that it exits and its css is set back to init_css_set.
+	 */
+	css = task->cgroups;
+	if (css == &init_css_set) {
+		res = &root->top_cgroup;
+	} else {
+		struct cg_cgroup_link *link;
+		list_for_each_entry(link, &css->cg_links, cg_link_list) {
+			struct cgroup *c = link->cgrp;
+			if (c->root == root) {
+				res = c;
+				break;
+			}
+		}
+	}
+	read_unlock(&css_set_lock);
+	BUG_ON(!res);
+	return res;
+}
+
+/*
  * There is one global cgroup mutex. We also require taking
  * task_lock() when dereferencing a task's cgroup subsys pointers.
  * See "The task_lock() exception", at the end of this comment.
@@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 */
 		deactivate_super(cgrp->root->sb);
 
+		/*
+		 * if we're getting rid of the cgroup, refcount should ensure
+		 * that there are no pidlists left.
+		 */
+		BUG_ON(!list_empty(&cgrp->pidlists));
+
 		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
 	}
 	iput(inode);
@@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",noprefix");
 	if (strlen(root->release_agent_path))
 		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+	if (strlen(root->name))
+		seq_printf(seq, ",name=%s", root->name);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
@@ -849,6 +963,12 @@ struct cgroup_sb_opts {
 	unsigned long subsys_bits;
 	unsigned long flags;
 	char *release_agent;
+	char *name;
+	/* User explicitly requested empty subsystem */
+	bool none;
+
+	struct cgroupfs_root *new_root;
+
 };
 
 /* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data,
 	mask = ~(1UL << cpuset_subsys_id);
 #endif
 
-	opts->subsys_bits = 0;
-	opts->flags = 0;
-	opts->release_agent = NULL;
+	memset(opts, 0, sizeof(*opts));
 
 	while ((token = strsep(&o, ",")) != NULL) {
 		if (!*token)
@@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data,
 				if (!ss->disabled)
 					opts->subsys_bits |= 1ul << i;
 			}
+		} else if (!strcmp(token, "none")) {
+			/* Explicitly have no subsystems */
+			opts->none = true;
 		} else if (!strcmp(token, "noprefix")) {
 			set_bit(ROOT_NOPREFIX, &opts->flags);
 		} else if (!strncmp(token, "release_agent=", 14)) {
 			/* Specifying two release agents is forbidden */
 			if (opts->release_agent)
 				return -EINVAL;
-			opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
+			opts->release_agent =
+				kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
 			if (!opts->release_agent)
 				return -ENOMEM;
-			strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
-			opts->release_agent[PATH_MAX - 1] = 0;
+		} else if (!strncmp(token, "name=", 5)) {
+			int i;
+			const char *name = token + 5;
+			/* Can't specify an empty name */
+			if (!strlen(name))
+				return -EINVAL;
+			/* Must match [\w.-]+ */
+			for (i = 0; i < strlen(name); i++) {
+				char c = name[i];
+				if (isalnum(c))
+					continue;
+				if ((c == '.') || (c == '-') || (c == '_'))
+					continue;
+				return -EINVAL;
+			}
+			/* Specifying two names is forbidden */
+			if (opts->name)
+				return -EINVAL;
+			opts->name = kstrndup(name,
+					      MAX_CGROUP_ROOT_NAMELEN,
+					      GFP_KERNEL);
+			if (!opts->name)
+				return -ENOMEM;
 		} else {
 			struct cgroup_subsys *ss;
 			int i;
@@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data,
 		}
 	}
 
+	/* Consistency checks */
+
 	/*
 	 * Option noprefix was introduced just for backward compatibility
 	 * with the old cpuset, so we allow noprefix only if mounting just
@@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data,
 	    (opts->subsys_bits & mask))
 		return -EINVAL;
 
-	/* We can't have an empty hierarchy */
-	if (!opts->subsys_bits)
+
+	/* Can't specify "none" and some subsystems */
+	if (opts->subsys_bits && opts->none)
+		return -EINVAL;
+
+	/*
+	 * We either have to specify by name or by subsystems. (So all
+	 * empty hierarchies must have a name).
+	 */
+	if (!opts->subsys_bits && !opts->name)
 		return -EINVAL;
 
 	return 0;
@@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
+	/* Don't allow name to change at remount */
+	if (opts.name && strcmp(opts.name, root->name)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
 	ret = rebind_subsystems(root, opts.subsys_bits);
 	if (ret)
 		goto out_unlock;
@@ -955,6 +1114,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		strcpy(root->release_agent_path, opts.release_agent);
  out_unlock:
 	kfree(opts.release_agent);
+	kfree(opts.name);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 	unlock_kernel();
@@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->css_sets);
 	INIT_LIST_HEAD(&cgrp->release_list);
-	INIT_LIST_HEAD(&cgrp->pids_list);
-	init_rwsem(&cgrp->pids_mutex);
+	INIT_LIST_HEAD(&cgrp->pidlists);
+	mutex_init(&cgrp->pidlist_mutex);
 }
+
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
 	struct cgroup *cgrp = &root->top_cgroup;
@@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	init_cgroup_housekeeping(cgrp);
 }
 
+static bool init_root_id(struct cgroupfs_root *root)
+{
+	int ret = 0;
+
+	do {
+		if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
+			return false;
+		spin_lock(&hierarchy_id_lock);
+		/* Try to allocate the next unused ID */
+		ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
+					&root->hierarchy_id);
+		if (ret == -ENOSPC)
+			/* Try again starting from 0 */
+			ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
+		if (!ret) {
+			next_hierarchy_id = root->hierarchy_id + 1;
+		} else if (ret != -EAGAIN) {
+			/* Can only get here if the 31-bit IDR is full ... */
+			BUG_ON(ret);
+		}
+		spin_unlock(&hierarchy_id_lock);
+	} while (ret);
+	return true;
+}
+
 static int cgroup_test_super(struct super_block *sb, void *data)
 {
-	struct cgroupfs_root *new = data;
+	struct cgroup_sb_opts *opts = data;
 	struct cgroupfs_root *root = sb->s_fs_info;
 
-	/* First check subsystems */
-	if (new->subsys_bits != root->subsys_bits)
-	    return 0;
+	/* If we asked for a name then it must match */
+	if (opts->name && strcmp(opts->name, root->name))
+		return 0;
 
-	/* Next check flags */
-	if (new->flags != root->flags)
+	/*
+	 * If we asked for subsystems (or explicitly for no
+	 * subsystems) then they must match
+	 */
+	if ((opts->subsys_bits || opts->none)
+	    && (opts->subsys_bits != root->subsys_bits))
 		return 0;
 
 	return 1;
 }
 
+static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
+{
+	struct cgroupfs_root *root;
+
+	if (!opts->subsys_bits && !opts->none)
+		return NULL;
+
+	root = kzalloc(sizeof(*root), GFP_KERNEL);
+	if (!root)
+		return ERR_PTR(-ENOMEM);
+
+	if (!init_root_id(root)) {
+		kfree(root);
+		return ERR_PTR(-ENOMEM);
+	}
+	init_cgroup_root(root);
+
+	root->subsys_bits = opts->subsys_bits;
+	root->flags = opts->flags;
+	if (opts->release_agent)
+		strcpy(root->release_agent_path, opts->release_agent);
+	if (opts->name)
+		strcpy(root->name, opts->name);
+	return root;
+}
+
+static void cgroup_drop_root(struct cgroupfs_root *root)
+{
+	if (!root)
+		return;
+
+	BUG_ON(!root->hierarchy_id);
+	spin_lock(&hierarchy_id_lock);
+	ida_remove(&hierarchy_ida, root->hierarchy_id);
+	spin_unlock(&hierarchy_id_lock);
+	kfree(root);
+}
+
 static int cgroup_set_super(struct super_block *sb, void *data)
 {
 	int ret;
-	struct cgroupfs_root *root = data;
+	struct cgroup_sb_opts *opts = data;
+
+	/* If we don't have a new root, we can't set up a new sb */
+	if (!opts->new_root)
+		return -EINVAL;
+
+	BUG_ON(!opts->subsys_bits && !opts->none);
 
 	ret = set_anon_super(sb, NULL);
 	if (ret)
 		return ret;
 
-	sb->s_fs_info = root;
-	root->sb = sb;
+	sb->s_fs_info = opts->new_root;
+	opts->new_root->sb = sb;
 
 	sb->s_blocksize = PAGE_CACHE_SIZE;
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 			 void *data, struct vfsmount *mnt)
 {
 	struct cgroup_sb_opts opts;
+	struct cgroupfs_root *root;
 	int ret = 0;
 	struct super_block *sb;
-	struct cgroupfs_root *root;
-	struct list_head tmp_cg_links;
+	struct cgroupfs_root *new_root;
 
 	/* First find the desired set of subsystems */
 	ret = parse_cgroupfs_options(data, &opts);
-	if (ret) {
-		kfree(opts.release_agent);
-		return ret;
-	}
-
-	root = kzalloc(sizeof(*root), GFP_KERNEL);
-	if (!root) {
-		kfree(opts.release_agent);
-		return -ENOMEM;
-	}
+	if (ret)
+		goto out_err;
 
-	init_cgroup_root(root);
-	root->subsys_bits = opts.subsys_bits;
-	root->flags = opts.flags;
-	if (opts.release_agent) {
-		strcpy(root->release_agent_path, opts.release_agent);
-		kfree(opts.release_agent);
+	/*
+	 * Allocate a new cgroup root. We may not need it if we're
+	 * reusing an existing hierarchy.
+	 */
+	new_root = cgroup_root_from_opts(&opts);
+	if (IS_ERR(new_root)) {
+		ret = PTR_ERR(new_root);
+		goto out_err;
 	}
+	opts.new_root = new_root;
 
-	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
-
+	/* Locate an existing or new sb for this hierarchy */
+	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
 	if (IS_ERR(sb)) {
-		kfree(root);
-		return PTR_ERR(sb);
+		ret = PTR_ERR(sb);
+		cgroup_drop_root(opts.new_root);
+		goto out_err;
 	}
 
-	if (sb->s_fs_info != root) {
-		/* Reusing an existing superblock */
-		BUG_ON(sb->s_root == NULL);
-		kfree(root);
-		root = NULL;
-	} else {
-		/* New superblock */
+	root = sb->s_fs_info;
+	BUG_ON(!root);
+	if (root == opts.new_root) {
+		/* We used the new root structure, so this is a new hierarchy */
+		struct list_head tmp_cg_links;
 		struct cgroup *root_cgrp = &root->top_cgroup;
 		struct inode *inode;
+		struct cgroupfs_root *existing_root;
 		int i;
 
 		BUG_ON(sb->s_root != NULL);
@@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 
+		if (strlen(root->name)) {
+			/* Check for name clashes with existing mounts */
+			for_each_active_root(existing_root) {
+				if (!strcmp(existing_root->name, root->name)) {
+					ret = -EBUSY;
+					mutex_unlock(&cgroup_mutex);
+					mutex_unlock(&inode->i_mutex);
+					goto drop_new_super;
+				}
+			}
+		}
+
 		/*
 		 * We're accessing css_set_count without locking
 		 * css_set_lock here, but that's OK - it can only be
@@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		if (ret == -EBUSY) {
 			mutex_unlock(&cgroup_mutex);
 			mutex_unlock(&inode->i_mutex);
-			goto free_cg_links;
+			free_cg_links(&tmp_cg_links);
+			goto drop_new_super;
 		}
 
 		/* EBUSY should be the only error here */
@@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		BUG_ON(root->number_of_cgroups != 1);
 
 		cgroup_populate_dir(root_cgrp);
-		mutex_unlock(&inode->i_mutex);
 		mutex_unlock(&cgroup_mutex);
+		mutex_unlock(&inode->i_mutex);
+	} else {
+		/*
+		 * We re-used an existing hierarchy - the new root (if
+		 * any) is not needed
+		 */
+		cgroup_drop_root(opts.new_root);
 	}
 
 	simple_set_mnt(mnt, sb);
+	kfree(opts.release_agent);
+	kfree(opts.name);
 	return 0;
 
- free_cg_links:
-	free_cg_links(&tmp_cg_links);
  drop_new_super:
 	deactivate_locked_super(sb);
+ out_err:
+	kfree(opts.release_agent);
+	kfree(opts.name);
+
 	return ret;
 }
 
@@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	mutex_unlock(&cgroup_mutex);
 
 	kill_litter_super(sb);
-	kfree(root);
+	cgroup_drop_root(root);
 }
 
 static struct file_system_type cgroup_fs_type = {
@@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	return 0;
 }
 
-/*
- * Return the first subsystem attached to a cgroup's hierarchy, and
- * its subsystem id.
- */
-
-static void get_first_subsys(const struct cgroup *cgrp,
-			struct cgroup_subsys_state **css, int *subsys_id)
-{
-	const struct cgroupfs_root *root = cgrp->root;
-	const struct cgroup_subsys *test_ss;
-	BUG_ON(list_empty(&root->subsys_list));
-	test_ss = list_entry(root->subsys_list.next,
-			     struct cgroup_subsys, sibling);
-	if (css) {
-		*css = cgrp->subsys[test_ss->subsys_id];
-		BUG_ON(!*css);
-	}
-	if (subsys_id)
-		*subsys_id = test_ss->subsys_id;
-}
-
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
  * @cgrp: the cgroup the task is attaching to
@@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	struct css_set *cg;
 	struct css_set *newcg;
 	struct cgroupfs_root *root = cgrp->root;
-	int subsys_id;
-
-	get_first_subsys(cgrp, NULL, &subsys_id);
 
 	/* Nothing to do if the task is already in that cgroup */
-	oldcgrp = task_cgroup(tsk, subsys_id);
+	oldcgrp = task_cgroup_from_root(tsk, root);
 	if (cgrp == oldcgrp)
 		return 0;
 
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, tsk);
+			retval = ss->can_attach(ss, cgrp, tsk, false);
 			if (retval)
 				return retval;
 		}
@@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, tsk);
+			ss->attach(ss, cgrp, oldcgrp, tsk, false);
 	}
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
 	synchronize_rcu();
@@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 	return ret;
 }
 
-/* The various types of files and directories in a cgroup file system */
-enum cgroup_filetype {
-	FILE_ROOT,
-	FILE_DIR,
-	FILE_TASKLIST,
-	FILE_NOTIFY_ON_RELEASE,
-	FILE_RELEASE_AGENT,
-};
-
 /**
  * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
  * @cgrp: the cgroup to be checked for liveness
@@ -1876,7 +2095,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
  * the start of a css_set
  */
 static void cgroup_advance_iter(struct cgroup *cgrp,
-					  struct cgroup_iter *it)
+				struct cgroup_iter *it)
 {
 	struct list_head *l = it->cg_link;
 	struct cg_cgroup_link *link;
@@ -2129,7 +2348,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
 }
 
 /*
- * Stuff for reading the 'tasks' file.
+ * Stuff for reading the 'tasks'/'procs' files.
  *
  * Reading this file can return large amounts of data if a cgroup has
  * *lots* of attached tasks. So it may need several calls to read(),
@@ -2139,27 +2358,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  */
 
 /*
- * Load into 'pidarray' up to 'npids' of the tasks using cgroup
- * 'cgrp'.  Return actual number of pids loaded.  No need to
- * task_lock(p) when reading out p->cgroup, since we're in an RCU
- * read section, so the css_set can't go away, and is
- * immutable after creation.
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+	if (PIDLIST_TOO_LARGE(count))
+		return vmalloc(count * sizeof(pid_t));
+	else
+		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+static void pidlist_free(void *p)
+{
+	if (is_vmalloc_addr(p))
+		vfree(p);
+	else
+		kfree(p);
+}
+static void *pidlist_resize(void *p, int newcount)
+{
+	void *newlist;
+	/* note: if new alloc fails, old p will still be valid either way */
+	if (is_vmalloc_addr(p)) {
+		newlist = vmalloc(newcount * sizeof(pid_t));
+		if (!newlist)
+			return NULL;
+		memcpy(newlist, p, newcount * sizeof(pid_t));
+		vfree(p);
+	} else {
+		newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
+	}
+	return newlist;
+}
+
+/*
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * If the new stripped list is sufficiently smaller and there's enough memory
+ * to allocate a new buffer, will let go of the unneeded memory. Returns the
+ * number of unique elements.
+ */
+/* is the size difference enough that we should re-allocate the array? */
+#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
+static int pidlist_uniq(pid_t **p, int length)
+{
+	int src, dest = 1;
+	pid_t *list = *p;
+	pid_t *newlist;
+
+	/*
+	 * we presume the 0th element is unique, so i starts at 1. trivial
+	 * edge cases first; no work needs to be done for either
+	 */
+	if (length == 0 || length == 1)
+		return length;
+	/* src and dest walk down the list; dest counts unique elements */
+	for (src = 1; src < length; src++) {
+		/* find next unique element */
+		while (list[src] == list[src-1]) {
+			src++;
+			if (src == length)
+				goto after;
+		}
+		/* dest always points to where the next unique element goes */
+		list[dest] = list[src];
+		dest++;
+	}
+after:
+	/*
+	 * if the length difference is large enough, we want to allocate a
+	 * smaller buffer to save memory. if this fails due to out of memory,
+	 * we'll just stay with what we've got.
+	 */
+	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
+		newlist = pidlist_resize(list, dest);
+		if (newlist)
+			*p = newlist;
+	}
+	return dest;
+}
+
+static int cmppid(const void *a, const void *b)
+{
+	return *(pid_t *)a - *(pid_t *)b;
+}
+
+/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
  */
-static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+						  enum cgroup_filetype type)
 {
-	int n = 0, pid;
+	struct cgroup_pidlist *l;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+	/*
+	 * We can't drop the pidlist_mutex before taking the l->mutex in case
+	 * the last ref-holder is trying to remove l from the list at the same
+	 * time. Holding the pidlist_mutex precludes somebody taking whichever
+	 * list we find out from under us - compare release_pid_array().
+	 */
+	mutex_lock(&cgrp->pidlist_mutex);
+	list_for_each_entry(l, &cgrp->pidlists, links) {
+		if (l->key.type == type && l->key.ns == ns) {
+			/* found a matching list - drop the extra refcount */
+			put_pid_ns(ns);
+			/* make sure l doesn't vanish out from under us */
+			down_write(&l->mutex);
+			mutex_unlock(&cgrp->pidlist_mutex);
+			l->use_count++;
+			return l;
+		}
+	}
+	/* entry not found; create a new one */
+	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+	if (!l) {
+		mutex_unlock(&cgrp->pidlist_mutex);
+		put_pid_ns(ns);
+		return l;
+	}
+	init_rwsem(&l->mutex);
+	down_write(&l->mutex);
+	l->key.type = type;
+	l->key.ns = ns;
+	l->use_count = 0; /* don't increment here */
+	l->list = NULL;
+	l->owner = cgrp;
+	list_add(&l->links, &cgrp->pidlists);
+	mutex_unlock(&cgrp->pidlist_mutex);
+	return l;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+			      struct cgroup_pidlist **lp)
+{
+	pid_t *array;
+	int length;
+	int pid, n = 0; /* used for populating the array */
 	struct cgroup_iter it;
 	struct task_struct *tsk;
+	struct cgroup_pidlist *l;
+
+	/*
+	 * If cgroup gets more users after we read count, we won't have
+	 * enough space - tough.  This race is indistinguishable to the
+	 * caller from the case that the additional cgroup users didn't
+	 * show up until sometime later on.
+	 */
+	length = cgroup_task_count(cgrp);
+	array = pidlist_allocate(length);
+	if (!array)
+		return -ENOMEM;
+	/* now, populate the array */
 	cgroup_iter_start(cgrp, &it);
 	while ((tsk = cgroup_iter_next(cgrp, &it))) {
-		if (unlikely(n == npids))
+		if (unlikely(n == length))
 			break;
-		pid = task_pid_vnr(tsk);
-		if (pid > 0)
-			pidarray[n++] = pid;
+		/* get tgid or pid for procs or tasks file respectively */
+		if (type == CGROUP_FILE_PROCS)
+			pid = task_tgid_vnr(tsk);
+		else
+			pid = task_pid_vnr(tsk);
+		if (pid > 0) /* make sure to only use valid results */
+			array[n++] = pid;
 	}
 	cgroup_iter_end(cgrp, &it);
-	return n;
+	length = n;
+	/* now sort & (if procs) strip out duplicates */
+	sort(array, length, sizeof(pid_t), cmppid, NULL);
+	if (type == CGROUP_FILE_PROCS)
+		length = pidlist_uniq(&array, length);
+	l = cgroup_pidlist_find(cgrp, type);
+	if (!l) {
+		pidlist_free(array);
+		return -ENOMEM;
+	}
+	/* store array, freeing old if necessary - lock already held */
+	pidlist_free(l->list);
+	l->list = array;
+	l->length = length;
+	l->use_count++;
+	up_write(&l->mutex);
+	*lp = l;
+	return 0;
 }
 
 /**
@@ -2216,37 +2604,14 @@ err:
 	return ret;
 }
 
-/*
- * Cache pids for all threads in the same pid namespace that are
- * opening the same "tasks" file.
- */
-struct cgroup_pids {
-	/* The node in cgrp->pids_list */
-	struct list_head list;
-	/* The cgroup those pids belong to */
-	struct cgroup *cgrp;
-	/* The namepsace those pids belong to */
-	struct pid_namespace *ns;
-	/* Array of process ids in the cgroup */
-	pid_t *tasks_pids;
-	/* How many files are using the this tasks_pids array */
-	int use_count;
-	/* Length of the current tasks_pids array */
-	int length;
-};
-
-static int cmppid(const void *a, const void *b)
-{
-	return *(pid_t *)a - *(pid_t *)b;
-}
 
 /*
- * seq_file methods for the "tasks" file. The seq_file position is the
+ * seq_file methods for the tasks/procs files. The seq_file position is the
  * next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->tasks_pids array.
+ * in the cgroup->l->list array.
  */
 
-static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 {
 	/*
 	 * Initially we receive a position value that corresponds to
@@ -2254,48 +2619,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 	 * after a seek to the start). Use a binary-search to find the
 	 * next pid to display, if any
 	 */
-	struct cgroup_pids *cp = s->private;
-	struct cgroup *cgrp = cp->cgrp;
+	struct cgroup_pidlist *l = s->private;
 	int index = 0, pid = *pos;
 	int *iter;
 
-	down_read(&cgrp->pids_mutex);
+	down_read(&l->mutex);
 	if (pid) {
-		int end = cp->length;
+		int end = l->length;
 
 		while (index < end) {
 			int mid = (index + end) / 2;
-			if (cp->tasks_pids[mid] == pid) {
+			if (l->list[mid] == pid) {
 				index = mid;
 				break;
-			} else if (cp->tasks_pids[mid] <= pid)
+			} else if (l->list[mid] <= pid)
 				index = mid + 1;
 			else
 				end = mid;
 		}
 	}
 	/* If we're off the end of the array, we're done */
-	if (index >= cp->length)
+	if (index >= l->length)
 		return NULL;
 	/* Update the abstract position to be the actual pid that we found */
-	iter = cp->tasks_pids + index;
+	iter = l->list + index;
 	*pos = *iter;
 	return iter;
 }
 
-static void cgroup_tasks_stop(struct seq_file *s, void *v)
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 {
-	struct cgroup_pids *cp = s->private;
-	struct cgroup *cgrp = cp->cgrp;
-	up_read(&cgrp->pids_mutex);
+	struct cgroup_pidlist *l = s->private;
+	up_read(&l->mutex);
 }
 
-static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct cgroup_pids *cp = s->private;
-	int *p = v;
-	int *end = cp->tasks_pids + cp->length;
-
+	struct cgroup_pidlist *l = s->private;
+	pid_t *p = v;
+	pid_t *end = l->list + l->length;
 	/*
 	 * Advance to the next pid in the array. If this goes off the
 	 * end, we're done
@@ -2309,124 +2671,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
 	}
 }
 
-static int cgroup_tasks_show(struct seq_file *s, void *v)
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
 {
 	return seq_printf(s, "%d\n", *(int *)v);
 }
 
-static const struct seq_operations cgroup_tasks_seq_operations = {
-	.start = cgroup_tasks_start,
-	.stop = cgroup_tasks_stop,
-	.next = cgroup_tasks_next,
-	.show = cgroup_tasks_show,
+/*
+ * seq_operations functions for iterating on pidlists through seq_file -
+ * independent of whether it's tasks or procs
+ */
+static const struct seq_operations cgroup_pidlist_seq_operations = {
+	.start = cgroup_pidlist_start,
+	.stop = cgroup_pidlist_stop,
+	.next = cgroup_pidlist_next,
+	.show = cgroup_pidlist_show,
 };
 
-static void release_cgroup_pid_array(struct cgroup_pids *cp)
+static void cgroup_release_pid_array(struct cgroup_pidlist *l)
 {
-	struct cgroup *cgrp = cp->cgrp;
-
-	down_write(&cgrp->pids_mutex);
-	BUG_ON(!cp->use_count);
-	if (!--cp->use_count) {
-		list_del(&cp->list);
-		put_pid_ns(cp->ns);
-		kfree(cp->tasks_pids);
-		kfree(cp);
+	/*
+	 * the case where we're the last user of this particular pidlist will
+	 * have us remove it from the cgroup's list, which entails taking the
+	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
+	 * pidlist_mutex, we have to take pidlist_mutex first.
+	 */
+	mutex_lock(&l->owner->pidlist_mutex);
+	down_write(&l->mutex);
+	BUG_ON(!l->use_count);
+	if (!--l->use_count) {
+		/* we're the last user if refcount is 0; remove and free */
+		list_del(&l->links);
+		mutex_unlock(&l->owner->pidlist_mutex);
+		pidlist_free(l->list);
+		put_pid_ns(l->key.ns);
+		up_write(&l->mutex);
+		kfree(l);
+		return;
 	}
-	up_write(&cgrp->pids_mutex);
+	mutex_unlock(&l->owner->pidlist_mutex);
+	up_write(&l->mutex);
 }
 
-static int cgroup_tasks_release(struct inode *inode, struct file *file)
+static int cgroup_pidlist_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *seq;
-	struct cgroup_pids *cp;
-
+	struct cgroup_pidlist *l;
 	if (!(file->f_mode & FMODE_READ))
 		return 0;
-
-	seq = file->private_data;
-	cp = seq->private;
-
-	release_cgroup_pid_array(cp);
+	/*
+	 * the seq_file will only be initialized if the file was opened for
+	 * reading; hence we check if it's not null only in that case.
+	 */
+	l = ((struct seq_file *)file->private_data)->private;
+	cgroup_release_pid_array(l);
 	return seq_release(inode, file);
 }
 
-static struct file_operations cgroup_tasks_operations = {
+static const struct file_operations cgroup_pidlist_operations = {
 	.read = seq_read,
 	.llseek = seq_lseek,
 	.write = cgroup_file_write,
-	.release = cgroup_tasks_release,
+	.release = cgroup_pidlist_release,
 };
 
 /*
- * Handle an open on 'tasks' file.  Prepare an array containing the
- * process id's of tasks currently attached to the cgroup being opened.
+ * The following functions handle opens on a file that displays a pidlist
+ * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
+ * in the cgroup.
  */
-
-static int cgroup_tasks_open(struct inode *unused, struct file *file)
+/* helper function for the two below it */
+static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	struct pid_namespace *ns = current->nsproxy->pid_ns;
-	struct cgroup_pids *cp;
-	pid_t *pidarray;
-	int npids;
+	struct cgroup_pidlist *l;
 	int retval;
 
 	/* Nothing to do for write-only files */
 	if (!(file->f_mode & FMODE_READ))
 		return 0;
 
-	/*
-	 * If cgroup gets more users after we read count, we won't have
-	 * enough space - tough.  This race is indistinguishable to the
-	 * caller from the case that the additional cgroup users didn't
-	 * show up until sometime later on.
-	 */
-	npids = cgroup_task_count(cgrp);
-	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
-	if (!pidarray)
-		return -ENOMEM;
-	npids = pid_array_load(pidarray, npids, cgrp);
-	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
-	/*
-	 * Store the array in the cgroup, freeing the old
-	 * array if necessary
-	 */
-	down_write(&cgrp->pids_mutex);
-
-	list_for_each_entry(cp, &cgrp->pids_list, list) {
-		if (ns == cp->ns)
-			goto found;
-	}
-
-	cp = kzalloc(sizeof(*cp), GFP_KERNEL);
-	if (!cp) {
-		up_write(&cgrp->pids_mutex);
-		kfree(pidarray);
-		return -ENOMEM;
-	}
-	cp->cgrp = cgrp;
-	cp->ns = ns;
-	get_pid_ns(ns);
-	list_add(&cp->list, &cgrp->pids_list);
-found:
-	kfree(cp->tasks_pids);
-	cp->tasks_pids = pidarray;
-	cp->length = npids;
-	cp->use_count++;
-	up_write(&cgrp->pids_mutex);
-
-	file->f_op = &cgroup_tasks_operations;
+	/* have the array populated */
+	retval = pidlist_array_load(cgrp, type, &l);
+	if (retval)
+		return retval;
+	/* configure file information */
+	file->f_op = &cgroup_pidlist_operations;
 
-	retval = seq_open(file, &cgroup_tasks_seq_operations);
+	retval = seq_open(file, &cgroup_pidlist_seq_operations);
 	if (retval) {
-		release_cgroup_pid_array(cp);
+		cgroup_release_pid_array(l);
 		return retval;
 	}
-	((struct seq_file *)file->private_data)->private = cp;
+	((struct seq_file *)file->private_data)->private = l;
 	return 0;
 }
+static int cgroup_tasks_open(struct inode *unused, struct file *file)
+{
+	return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
+}
+static int cgroup_procs_open(struct inode *unused, struct file *file)
+{
+	return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
+}
 
 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
 					    struct cftype *cft)
@@ -2449,21 +2794,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
 /*
  * for the common functions, 'private' gives the type of file
  */
+/* for hysterical raisins, we can't put this on the older files */
+#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
 static struct cftype files[] = {
 	{
 		.name = "tasks",
 		.open = cgroup_tasks_open,
 		.write_u64 = cgroup_tasks_write,
-		.release = cgroup_tasks_release,
-		.private = FILE_TASKLIST,
+		.release = cgroup_pidlist_release,
 		.mode = S_IRUGO | S_IWUSR,
 	},
-
+	{
+		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
+		.open = cgroup_procs_open,
+		/* .write_u64 = cgroup_procs_write, TODO */
+		.release = cgroup_pidlist_release,
+		.mode = S_IRUGO,
+	},
 	{
 		.name = "notify_on_release",
 		.read_u64 = cgroup_read_notify_on_release,
 		.write_u64 = cgroup_write_notify_on_release,
-		.private = FILE_NOTIFY_ON_RELEASE,
 	},
 };
 
@@ -2472,7 +2823,6 @@ static struct cftype cft_release_agent = {
 	.read_seq_string = cgroup_release_agent_show,
 	.write_string = cgroup_release_agent_write,
 	.max_write_len = PATH_MAX,
-	.private = FILE_RELEASE_AGENT,
 };
 
 static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -2879,6 +3229,7 @@ int __init cgroup_init_early(void)
 	init_task.cgroups = &init_css_set;
 
 	init_css_set_link.cg = &init_css_set;
+	init_css_set_link.cgrp = dummytop;
 	list_add(&init_css_set_link.cgrp_link_list,
 		 &rootnode.top_cgroup.css_sets);
 	list_add(&init_css_set_link.cg_link_list,
@@ -2933,7 +3284,7 @@ int __init cgroup_init(void)
 	/* Add init_css_set to the hash table */
 	hhead = css_set_hash(init_css_set.subsys);
 	hlist_add_head(&init_css_set.hlist, hhead);
-
+	BUG_ON(!init_root_id(&rootnode));
 	err = register_filesystem(&cgroup_fs_type);
 	if (err < 0)
 		goto out;
@@ -2986,15 +3337,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
 	for_each_active_root(root) {
 		struct cgroup_subsys *ss;
 		struct cgroup *cgrp;
-		int subsys_id;
 		int count = 0;
 
-		seq_printf(m, "%lu:", root->subsys_bits);
+		seq_printf(m, "%d:", root->hierarchy_id);
 		for_each_subsys(root, ss)
 			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+		if (strlen(root->name))
+			seq_printf(m, "%sname=%s", count ? "," : "",
+				   root->name);
 		seq_putc(m, ':');
-		get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
-		cgrp = task_cgroup(tsk, subsys_id);
+		cgrp = task_cgroup_from_root(tsk, root);
 		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
 		if (retval < 0)
 			goto out_unlock;
@@ -3033,8 +3385,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	mutex_lock(&cgroup_mutex);
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
-		seq_printf(m, "%s\t%lu\t%d\t%d\n",
-			   ss->name, ss->root->subsys_bits,
+		seq_printf(m, "%s\t%d\t%d\t%d\n",
+			   ss->name, ss->root->hierarchy_id,
 			   ss->root->number_of_cgroups, !ss->disabled);
 	}
 	mutex_unlock(&cgroup_mutex);
@@ -3320,13 +3672,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
 {
 	int ret;
 	struct cgroup *target;
-	int subsys_id;
 
 	if (cgrp == dummytop)
 		return 1;
 
-	get_first_subsys(cgrp, NULL, &subsys_id);
-	target = task_cgroup(task, subsys_id);
+	target = task_cgroup_from_root(task, cgrp->root);
 	while (cgrp != target && cgrp!= cgrp->top_cgroup)
 		cgrp = cgrp->parent;
 	ret = (cgrp == target);
@@ -3693,3 +4043,154 @@ css_get_next(struct cgroup_subsys *ss, int id,
 	return ret;
 }
 
+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
+						   struct cgroup *cont)
+{
+	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+	if (!css)
+		return ERR_PTR(-ENOMEM);
+
+	return css;
+}
+
+static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	kfree(cont->subsys[debug_subsys_id]);
+}
+
+static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
+{
+	return atomic_read(&cont->count);
+}
+
+static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
+{
+	return cgroup_task_count(cont);
+}
+
+static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
+{
+	return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup *cont,
+					   struct cftype *cft)
+{
+	u64 count;
+
+	rcu_read_lock();
+	count = atomic_read(&current->cgroups->refcount);
+	rcu_read_unlock();
+	return count;
+}
+
+static int current_css_set_cg_links_read(struct cgroup *cont,
+					 struct cftype *cft,
+					 struct seq_file *seq)
+{
+	struct cg_cgroup_link *link;
+	struct css_set *cg;
+
+	read_lock(&css_set_lock);
+	rcu_read_lock();
+	cg = rcu_dereference(current->cgroups);
+	list_for_each_entry(link, &cg->cg_links, cg_link_list) {
+		struct cgroup *c = link->cgrp;
+		const char *name;
+
+		if (c->dentry)
+			name = c->dentry->d_name.name;
+		else
+			name = "?";
+		seq_printf(seq, "Root %d group %s\n",
+			   c->root->hierarchy_id, name);
+	}
+	rcu_read_unlock();
+	read_unlock(&css_set_lock);
+	return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct cgroup *cont,
+				 struct cftype *cft,
+				 struct seq_file *seq)
+{
+	struct cg_cgroup_link *link;
+
+	read_lock(&css_set_lock);
+	list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
+		struct css_set *cg = link->cg;
+		struct task_struct *task;
+		int count = 0;
+		seq_printf(seq, "css_set %p\n", cg);
+		list_for_each_entry(task, &cg->tasks, cg_list) {
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
+				seq_puts(seq, "  ...\n");
+				break;
+			} else {
+				seq_printf(seq, "  task %d\n",
+					   task_pid_vnr(task));
+			}
+		}
+	}
+	read_unlock(&css_set_lock);
+	return 0;
+}
+
+static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+}
+
+static struct cftype debug_files[] =  {
+	{
+		.name = "cgroup_refcount",
+		.read_u64 = cgroup_refcount_read,
+	},
+	{
+		.name = "taskcount",
+		.read_u64 = debug_taskcount_read,
+	},
+
+	{
+		.name = "current_css_set",
+		.read_u64 = current_css_set_read,
+	},
+
+	{
+		.name = "current_css_set_refcount",
+		.read_u64 = current_css_set_refcount_read,
+	},
+
+	{
+		.name = "current_css_set_cg_links",
+		.read_seq_string = current_css_set_cg_links_read,
+	},
+
+	{
+		.name = "cgroup_css_links",
+		.read_seq_string = cgroup_css_links_read,
+	},
+
+	{
+		.name = "releasable",
+		.read_u64 = releasable_read,
+	},
+};
+
+static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	return cgroup_add_files(cont, ss, debug_files,
+				ARRAY_SIZE(debug_files));
+}
+
+struct cgroup_subsys debug_subsys = {
+	.name = "debug",
+	.create = debug_create,
+	.destroy = debug_destroy,
+	.populate = debug_populate,
+	.subsys_id = debug_subsys_id,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
deleted file mode 100644
index 0c92d797baa..00000000000
--- a/kernel/cgroup_debug.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * kernel/cgroup_debug.c - Example cgroup subsystem that
- * exposes debug info
- *
- * Copyright (C) Google Inc, 2007
- *
- * Developed by Paul Menage (menage@google.com)
- *
- */
-
-#include <linux/cgroup.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/rcupdate.h>
-
-#include <asm/atomic.h>
-
-static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
-						   struct cgroup *cont)
-{
-	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
-	if (!css)
-		return ERR_PTR(-ENOMEM);
-
-	return css;
-}
-
-static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-	kfree(cont->subsys[debug_subsys_id]);
-}
-
-static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
-{
-	return atomic_read(&cont->count);
-}
-
-static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
-{
-	u64 count;
-
-	count = cgroup_task_count(cont);
-	return count;
-}
-
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
-{
-	return (u64)(long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup *cont,
-					   struct cftype *cft)
-{
-	u64 count;
-
-	rcu_read_lock();
-	count = atomic_read(&current->cgroups->refcount);
-	rcu_read_unlock();
-	return count;
-}
-
-static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
-{
-	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
-}
-
-static struct cftype files[] =  {
-	{
-		.name = "cgroup_refcount",
-		.read_u64 = cgroup_refcount_read,
-	},
-	{
-		.name = "taskcount",
-		.read_u64 = taskcount_read,
-	},
-
-	{
-		.name = "current_css_set",
-		.read_u64 = current_css_set_read,
-	},
-
-	{
-		.name = "current_css_set_refcount",
-		.read_u64 = current_css_set_refcount_read,
-	},
-
-	{
-		.name = "releasable",
-		.read_u64 = releasable_read,
-	},
-};
-
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-	return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
-}
-
-struct cgroup_subsys debug_subsys = {
-	.name = "debug",
-	.create = debug_create,
-	.destroy = debug_destroy,
-	.populate = debug_populate,
-	.subsys_id = debug_subsys_id,
-};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fb249e2bcad..59e9ef6aab4 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
  */
 static int freezer_can_attach(struct cgroup_subsys *ss,
 			      struct cgroup *new_cgroup,
-			      struct task_struct *task)
+			      struct task_struct *task, bool threadgroup)
 {
 	struct freezer *freezer;
 
@@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
 	if (freezer->state == CGROUP_FROZEN)
 		return -EBUSY;
 
+	if (threadgroup) {
+		struct task_struct *c;
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+			if (is_task_frozen_enough(c)) {
+				rcu_read_unlock();
+				return -EBUSY;
+			}
+		}
+		rcu_read_unlock();
+	}
+
 	return 0;
 }
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41bd50..b5cb469d254 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp)
 static cpumask_var_t cpus_attach;
 
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss,
-			     struct cgroup *cont, struct task_struct *tsk)
+static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+			     struct task_struct *tsk, bool threadgroup)
 {
+	int ret;
 	struct cpuset *cs = cgroup_cs(cont);
 
 	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
 	if (tsk->flags & PF_THREAD_BOUND)
 		return -EINVAL;
 
-	return security_task_setscheduler(tsk, 0, NULL);
+	ret = security_task_setscheduler(tsk, 0, NULL);
+	if (ret)
+		return ret;
+	if (threadgroup) {
+		struct task_struct *c;
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+			ret = security_task_setscheduler(c, 0, NULL);
+			if (ret) {
+				rcu_read_unlock();
+				return ret;
+			}
+		}
+		rcu_read_unlock();
+	}
+	return 0;
+}
+
+static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
+			       struct cpuset *cs)
+{
+	int err;
+	/*
+	 * can_attach beforehand should guarantee that this doesn't fail.
+	 * TODO: have a better way to handle failure here
+	 */
+	err = set_cpus_allowed_ptr(tsk, cpus_attach);
+	WARN_ON_ONCE(err);
+
+	task_lock(tsk);
+	cpuset_change_task_nodemask(tsk, to);
+	task_unlock(tsk);
+	cpuset_update_task_spread_flag(cs, tsk);
+
 }
 
-static void cpuset_attach(struct cgroup_subsys *ss,
-			  struct cgroup *cont, struct cgroup *oldcont,
-			  struct task_struct *tsk)
+static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+			  struct cgroup *oldcont, struct task_struct *tsk,
+			  bool threadgroup)
 {
 	nodemask_t from, to;
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
-	int err;
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 		guarantee_online_cpus(cs, cpus_attach);
 		guarantee_online_mems(cs, &to);
 	}
-	err = set_cpus_allowed_ptr(tsk, cpus_attach);
-	if (err)
-		return;
 
-	task_lock(tsk);
-	cpuset_change_task_nodemask(tsk, &to);
-	task_unlock(tsk);
-	cpuset_update_task_spread_flag(cs, tsk);
+	/* do per-task migration stuff possibly for each in the threadgroup */
+	cpuset_attach_task(tsk, &to, cs);
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+			cpuset_attach_task(c, &to, cs);
+		}
+		rcu_read_unlock();
+	}
 
+	/* change mm; only needs to be done once even if threadgroup */
 	from = oldcs->mems_allowed;
 	to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
diff --git a/kernel/cred.c b/kernel/cred.c
index d7f7a01082e..dd76cfe5f5b 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -782,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as);
 
 #ifdef CONFIG_DEBUG_CREDENTIALS
 
+bool creds_are_invalid(const struct cred *cred)
+{
+	if (cred->magic != CRED_MAGIC)
+		return true;
+	if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
+		return true;
+#ifdef CONFIG_SECURITY_SELINUX
+	if (selinux_is_enabled()) {
+		if ((unsigned long) cred->security < PAGE_SIZE)
+			return true;
+		if ((*(u32 *)cred->security & 0xffffff00) ==
+		    (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
+			return true;
+	}
+#endif
+	return false;
+}
+EXPORT_SYMBOL(creds_are_invalid);
+
 /*
  * dump invalid credentials
  */
diff --git a/kernel/exit.c b/kernel/exit.c
index 60d6fdcc926..5859f598c95 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -976,8 +976,6 @@ NORET_TYPE void do_exit(long code)
 		disassociate_ctty(1);
 
 	module_put(task_thread_info(tsk)->exec_domain->module);
-	if (tsk->binfmt)
-		module_put(tsk->binfmt->module);
 
 	proc_exit_connector(tsk);
 
@@ -1097,28 +1095,28 @@ struct wait_opts {
 	int __user		*wo_stat;
 	struct rusage __user	*wo_rusage;
 
+	wait_queue_t		child_wait;
 	int			notask_error;
 };
 
-static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
+static inline
+struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
 {
-	struct pid *pid = NULL;
-	if (type == PIDTYPE_PID)
-		pid = task->pids[type].pid;
-	else if (type < PIDTYPE_MAX)
-		pid = task->group_leader->pids[type].pid;
-	return pid;
+	if (type != PIDTYPE_PID)
+		task = task->group_leader;
+	return task->pids[type].pid;
 }
 
-static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 {
-	int err;
-
-	if (wo->wo_type < PIDTYPE_MAX) {
-		if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
-			return 0;
-	}
+	return	wo->wo_type == PIDTYPE_MAX ||
+		task_pid_type(p, wo->wo_type) == wo->wo_pid;
+}
 
+static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+{
+	if (!eligible_pid(wo, p))
+		return 0;
 	/* Wait for all children (clone and not) if __WALL is set;
 	 * otherwise, wait for clone children *only* if __WCLONE is
 	 * set; otherwise, wait for non-clone children *only*.  (Note:
@@ -1128,10 +1126,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p)
 	    && !(wo->wo_flags & __WALL))
 		return 0;
 
-	err = security_task_wait(p);
-	if (err)
-		return err;
-
 	return 1;
 }
 
@@ -1144,18 +1138,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
 
 	put_task_struct(p);
 	infop = wo->wo_info;
-	if (!retval)
-		retval = put_user(SIGCHLD, &infop->si_signo);
-	if (!retval)
-		retval = put_user(0, &infop->si_errno);
-	if (!retval)
-		retval = put_user((short)why, &infop->si_code);
-	if (!retval)
-		retval = put_user(pid, &infop->si_pid);
-	if (!retval)
-		retval = put_user(uid, &infop->si_uid);
-	if (!retval)
-		retval = put_user(status, &infop->si_status);
+	if (infop) {
+		if (!retval)
+			retval = put_user(SIGCHLD, &infop->si_signo);
+		if (!retval)
+			retval = put_user(0, &infop->si_errno);
+		if (!retval)
+			retval = put_user((short)why, &infop->si_code);
+		if (!retval)
+			retval = put_user(pid, &infop->si_pid);
+		if (!retval)
+			retval = put_user(uid, &infop->si_uid);
+		if (!retval)
+			retval = put_user(status, &infop->si_status);
+	}
 	if (!retval)
 		retval = pid;
 	return retval;
@@ -1485,13 +1481,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
  * then ->notask_error is 0 if @p is an eligible child,
  * or another error from security_task_wait(), or still -ECHILD.
  */
-static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
-				int ptrace, struct task_struct *p)
+static int wait_consider_task(struct wait_opts *wo, int ptrace,
+				struct task_struct *p)
 {
 	int ret = eligible_child(wo, p);
 	if (!ret)
 		return ret;
 
+	ret = security_task_wait(p);
 	if (unlikely(ret < 0)) {
 		/*
 		 * If we have not yet seen any eligible child,
@@ -1553,7 +1550,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 		 * Do not consider detached threads.
 		 */
 		if (!task_detached(p)) {
-			int ret = wait_consider_task(wo, tsk, 0, p);
+			int ret = wait_consider_task(wo, 0, p);
 			if (ret)
 				return ret;
 		}
@@ -1567,7 +1564,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
-		int ret = wait_consider_task(wo, tsk, 1, p);
+		int ret = wait_consider_task(wo, 1, p);
 		if (ret)
 			return ret;
 	}
@@ -1575,15 +1572,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 	return 0;
 }
 
+static int child_wait_callback(wait_queue_t *wait, unsigned mode,
+				int sync, void *key)
+{
+	struct wait_opts *wo = container_of(wait, struct wait_opts,
+						child_wait);
+	struct task_struct *p = key;
+
+	if (!eligible_pid(wo, p))
+		return 0;
+
+	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
+		return 0;
+
+	return default_wake_function(wait, mode, sync, key);
+}
+
+void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
+{
+	__wake_up_sync_key(&parent->signal->wait_chldexit,
+				TASK_INTERRUPTIBLE, 1, p);
+}
+
 static long do_wait(struct wait_opts *wo)
 {
-	DECLARE_WAITQUEUE(wait, current);
 	struct task_struct *tsk;
 	int retval;
 
 	trace_sched_process_wait(wo->wo_pid);
 
-	add_wait_queue(&current->signal->wait_chldexit,&wait);
+	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+	wo->child_wait.private = current;
+	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
 repeat:
 	/*
 	 * If there is nothing that can match our critiera just get out.
@@ -1624,32 +1644,7 @@ notask:
 	}
 end:
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&current->signal->wait_chldexit,&wait);
-	if (wo->wo_info) {
-		struct siginfo __user *infop = wo->wo_info;
-
-		if (retval > 0)
-			retval = 0;
-		else {
-			/*
-			 * For a WNOHANG return, clear out all the fields
-			 * we would set so the user can easily tell the
-			 * difference.
-			 */
-			if (!retval)
-				retval = put_user(0, &infop->si_signo);
-			if (!retval)
-				retval = put_user(0, &infop->si_errno);
-			if (!retval)
-				retval = put_user(0, &infop->si_code);
-			if (!retval)
-				retval = put_user(0, &infop->si_pid);
-			if (!retval)
-				retval = put_user(0, &infop->si_uid);
-			if (!retval)
-				retval = put_user(0, &infop->si_status);
-		}
-	}
+	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
 	return retval;
 }
 
@@ -1694,6 +1689,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	wo.wo_stat	= NULL;
 	wo.wo_rusage	= ru;
 	ret = do_wait(&wo);
+
+	if (ret > 0) {
+		ret = 0;
+	} else if (infop) {
+		/*
+		 * For a WNOHANG return, clear out all the fields
+		 * we would set so the user can easily tell the
+		 * difference.
+		 */
+		if (!ret)
+			ret = put_user(0, &infop->si_signo);
+		if (!ret)
+			ret = put_user(0, &infop->si_errno);
+		if (!ret)
+			ret = put_user(0, &infop->si_code);
+		if (!ret)
+			ret = put_user(0, &infop->si_pid);
+		if (!ret)
+			ret = put_user(0, &infop->si_uid);
+		if (!ret)
+			ret = put_user(0, &infop->si_status);
+	}
+
 	put_pid(pid);
 
 	/* avoid REGPARM breakage on x86: */
diff --git a/kernel/fork.c b/kernel/fork.c
index 51ad0b0b726..266c6af6ef1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -434,6 +434,14 @@ __setup("coredump_filter=", coredump_filter_setup);
 
 #include <linux/init_task.h>
 
+static void mm_init_aio(struct mm_struct *mm)
+{
+#ifdef CONFIG_AIO
+	spin_lock_init(&mm->ioctx_lock);
+	INIT_HLIST_HEAD(&mm->ioctx_list);
+#endif
+}
+
 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
 	atomic_set(&mm->mm_users, 1);
@@ -447,10 +455,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
-	spin_lock_init(&mm->ioctx_lock);
-	INIT_HLIST_HEAD(&mm->ioctx_list);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
+	mm_init_aio(mm);
 	mm_init_owner(mm, p);
 
 	if (likely(!mm_alloc_pgd(mm))) {
@@ -511,6 +518,8 @@ void mmput(struct mm_struct *mm)
 			spin_unlock(&mmlist_lock);
 		}
 		put_swap_token(mm);
+		if (mm->binfmt)
+			module_put(mm->binfmt->module);
 		mmdrop(mm);
 	}
 }
@@ -636,9 +645,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	mm->hiwater_rss = get_mm_rss(mm);
 	mm->hiwater_vm = mm->total_vm;
 
+	if (mm->binfmt && !try_module_get(mm->binfmt->module))
+		goto free_pt;
+
 	return mm;
 
 free_pt:
+	/* don't put binfmt in mmput, we haven't got module yet */
+	mm->binfmt = NULL;
 	mmput(mm);
 
 fail_nomem:
@@ -979,6 +993,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
 		return ERR_PTR(-EINVAL);
 
+	/*
+	 * Siblings of global init remain as zombies on exit since they are
+	 * not reaped by their parent (swapper). To solve this and to avoid
+	 * multi-rooted process trees, prevent global and container-inits
+	 * from creating siblings.
+	 */
+	if ((clone_flags & CLONE_PARENT) &&
+				current->signal->flags & SIGNAL_UNKILLABLE)
+		return ERR_PTR(-EINVAL);
+
 	retval = security_task_create(clone_flags);
 	if (retval)
 		goto fork_out;
@@ -1020,9 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (!try_module_get(task_thread_info(p)->exec_domain->module))
 		goto bad_fork_cleanup_count;
 
-	if (p->binfmt && !try_module_get(p->binfmt->module))
-		goto bad_fork_cleanup_put_domain;
-
 	p->did_exec = 0;
 	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
 	copy_flags(clone_flags, p);
@@ -1310,9 +1331,6 @@ bad_fork_cleanup_cgroup:
 #endif
 	cgroup_exit(p, cgroup_callbacks_done);
 	delayacct_tsk_free(p);
-	if (p->binfmt)
-		module_put(p->binfmt->module);
-bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 654efd09f6a..70a298d6da7 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
 config GCOV_PROFILE_ALL
 	bool "Profile entire Kernel"
 	depends on GCOV_KERNEL
-	depends on S390 || X86 || (PPC && EXPERIMENTAL)
+	depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
 	default n
 	---help---
 	This options activates profiling for the entire kernel.
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 022a4927b78..d4e84174740 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout)
  * Process updating of timeout sysctl
  */
 int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
-				  struct file *filp, void __user *buffer,
+				  void __user *buffer,
 				  size_t *lenp, loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (ret || !write)
 		goto out;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 689d20f3930..9fcb53a11f8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -143,7 +143,6 @@ struct subprocess_info {
 static int ____call_usermodehelper(void *data)
 {
 	struct subprocess_info *sub_info = data;
-	enum umh_wait wait = sub_info->wait;
 	int retval;
 
 	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
@@ -185,14 +184,10 @@ static int ____call_usermodehelper(void *data)
 	 */
 	set_user_nice(current, 0);
 
-	if (wait == UMH_WAIT_EXEC)
-		complete(sub_info->complete);
-
 	retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
 
 	/* Exec failed? */
-	if (wait != UMH_WAIT_EXEC)
-		sub_info->retval = retval;
+	sub_info->retval = retval;
 	do_exit(0);
 }
 
@@ -271,14 +266,16 @@ static void __call_usermodehelper(struct work_struct *work)
 
 	switch (wait) {
 	case UMH_NO_WAIT:
-	case UMH_WAIT_EXEC:
 		break;
 
 	case UMH_WAIT_PROC:
 		if (pid > 0)
 			break;
 		sub_info->retval = pid;
-		break;
+		/* FALLTHROUGH */
+
+	case UMH_WAIT_EXEC:
+		complete(sub_info->complete);
 	}
 }
 
diff --git a/kernel/module.c b/kernel/module.c
index e6bc4b28aa6..5a29397ca4b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1797,6 +1797,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
 	}
 }
 
+static void free_modinfo(struct module *mod)
+{
+	struct module_attribute *attr;
+	int i;
+
+	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+		if (attr->free)
+			attr->free(mod);
+	}
+}
+
 #ifdef CONFIG_KALLSYMS
 
 /* lookup symbol in given range of kernel_symbols */
@@ -1862,13 +1873,93 @@ static char elf_type(const Elf_Sym *sym,
 	return '?';
 }
 
+static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
+                           unsigned int shnum)
+{
+	const Elf_Shdr *sec;
+
+	if (src->st_shndx == SHN_UNDEF
+	    || src->st_shndx >= shnum
+	    || !src->st_name)
+		return false;
+
+	sec = sechdrs + src->st_shndx;
+	if (!(sec->sh_flags & SHF_ALLOC)
+#ifndef CONFIG_KALLSYMS_ALL
+	    || !(sec->sh_flags & SHF_EXECINSTR)
+#endif
+	    || (sec->sh_entsize & INIT_OFFSET_MASK))
+		return false;
+
+	return true;
+}
+
+static unsigned long layout_symtab(struct module *mod,
+				   Elf_Shdr *sechdrs,
+				   unsigned int symindex,
+				   unsigned int strindex,
+				   const Elf_Ehdr *hdr,
+				   const char *secstrings,
+				   unsigned long *pstroffs,
+				   unsigned long *strmap)
+{
+	unsigned long symoffs;
+	Elf_Shdr *symsect = sechdrs + symindex;
+	Elf_Shdr *strsect = sechdrs + strindex;
+	const Elf_Sym *src;
+	const char *strtab;
+	unsigned int i, nsrc, ndst;
+
+	/* Put symbol section at end of init part of module. */
+	symsect->sh_flags |= SHF_ALLOC;
+	symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
+					 symindex) | INIT_OFFSET_MASK;
+	DEBUGP("\t%s\n", secstrings + symsect->sh_name);
+
+	src = (void *)hdr + symsect->sh_offset;
+	nsrc = symsect->sh_size / sizeof(*src);
+	strtab = (void *)hdr + strsect->sh_offset;
+	for (ndst = i = 1; i < nsrc; ++i, ++src)
+		if (is_core_symbol(src, sechdrs, hdr->e_shnum)) {
+			unsigned int j = src->st_name;
+
+			while(!__test_and_set_bit(j, strmap) && strtab[j])
+				++j;
+			++ndst;
+		}
+
+	/* Append room for core symbols at end of core part. */
+	symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
+	mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
+
+	/* Put string table section at end of init part of module. */
+	strsect->sh_flags |= SHF_ALLOC;
+	strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
+					 strindex) | INIT_OFFSET_MASK;
+	DEBUGP("\t%s\n", secstrings + strsect->sh_name);
+
+	/* Append room for core symbols' strings at end of core part. */
+	*pstroffs = mod->core_size;
+	__set_bit(0, strmap);
+	mod->core_size += bitmap_weight(strmap, strsect->sh_size);
+
+	return symoffs;
+}
+
 static void add_kallsyms(struct module *mod,
 			 Elf_Shdr *sechdrs,
+			 unsigned int shnum,
 			 unsigned int symindex,
 			 unsigned int strindex,
-			 const char *secstrings)
+			 unsigned long symoffs,
+			 unsigned long stroffs,
+			 const char *secstrings,
+			 unsigned long *strmap)
 {
-	unsigned int i;
+	unsigned int i, ndst;
+	const Elf_Sym *src;
+	Elf_Sym *dst;
+	char *s;
 
 	mod->symtab = (void *)sechdrs[symindex].sh_addr;
 	mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
@@ -1878,13 +1969,44 @@ static void add_kallsyms(struct module *mod,
 	for (i = 0; i < mod->num_symtab; i++)
 		mod->symtab[i].st_info
 			= elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
+
+	mod->core_symtab = dst = mod->module_core + symoffs;
+	src = mod->symtab;
+	*dst = *src;
+	for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
+		if (!is_core_symbol(src, sechdrs, shnum))
+			continue;
+		dst[ndst] = *src;
+		dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name);
+		++ndst;
+	}
+	mod->core_num_syms = ndst;
+
+	mod->core_strtab = s = mod->module_core + stroffs;
+	for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i)
+		if (test_bit(i, strmap))
+			*++s = mod->strtab[i];
 }
 #else
+static inline unsigned long layout_symtab(struct module *mod,
+					  Elf_Shdr *sechdrs,
+					  unsigned int symindex,
+					  unsigned int strindex,
+					  const Elf_Hdr *hdr,
+					  const char *secstrings,
+					  unsigned long *pstroffs,
+					  unsigned long *strmap)
+{
+}
 static inline void add_kallsyms(struct module *mod,
 				Elf_Shdr *sechdrs,
+				unsigned int shnum,
 				unsigned int symindex,
 				unsigned int strindex,
-				const char *secstrings)
+				unsigned long symoffs,
+				unsigned long stroffs,
+				const char *secstrings,
+				const unsigned long *strmap)
 {
 }
 #endif /* CONFIG_KALLSYMS */
@@ -1959,6 +2081,9 @@ static noinline struct module *load_module(void __user *umod,
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
+#ifdef CONFIG_KALLSYMS
+	unsigned long symoffs, stroffs, *strmap;
+#endif
 	mm_segment_t old_fs;
 
 	DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2040,11 +2165,6 @@ static noinline struct module *load_module(void __user *umod,
 	/* Don't keep modinfo and version sections. */
 	sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
 	sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
-#ifdef CONFIG_KALLSYMS
-	/* Keep symbol and string tables for decoding later. */
-	sechdrs[symindex].sh_flags |= SHF_ALLOC;
-	sechdrs[strindex].sh_flags |= SHF_ALLOC;
-#endif
 
 	/* Check module struct version now, before we try to use module. */
 	if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2080,6 +2200,13 @@ static noinline struct module *load_module(void __user *umod,
 		goto free_hdr;
 	}
 
+	strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size)
+			 * sizeof(long), GFP_KERNEL);
+	if (!strmap) {
+		err = -ENOMEM;
+		goto free_mod;
+	}
+
 	if (find_module(mod->name)) {
 		err = -EEXIST;
 		goto free_mod;
@@ -2109,6 +2236,8 @@ static noinline struct module *load_module(void __user *umod,
 	   this is done generically; there doesn't appear to be any
 	   special cases for the architectures. */
 	layout_sections(mod, hdr, sechdrs, secstrings);
+	symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr,
+				secstrings, &stroffs, strmap);
 
 	/* Do the allocs. */
 	ptr = module_alloc_update_bounds(mod->core_size);
@@ -2313,7 +2442,10 @@ static noinline struct module *load_module(void __user *umod,
 	percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
 		       sechdrs[pcpuindex].sh_size);
 
-	add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
+	add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
+		     symoffs, stroffs, secstrings, strmap);
+	kfree(strmap);
+	strmap = NULL;
 
 	if (!mod->taints) {
 		struct _ddebug *debug;
@@ -2385,13 +2517,14 @@ static noinline struct module *load_module(void __user *umod,
 	synchronize_sched();
 	module_arch_cleanup(mod);
  cleanup:
+	free_modinfo(mod);
 	kobject_del(&mod->mkobj.kobj);
 	kobject_put(&mod->mkobj.kobj);
  free_unload:
 	module_unload_free(mod);
 #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
- free_init:
 	percpu_modfree(mod->refptr);
+ free_init:
 #endif
 	module_free(mod, mod->module_init);
  free_core:
@@ -2402,6 +2535,7 @@ static noinline struct module *load_module(void __user *umod,
 		percpu_modfree(percpu);
  free_mod:
 	kfree(args);
+	kfree(strmap);
  free_hdr:
 	vfree(hdr);
 	return ERR_PTR(err);
@@ -2491,6 +2625,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 	/* Drop initial reference. */
 	module_put(mod);
 	trim_init_extable(mod);
+#ifdef CONFIG_KALLSYMS
+	mod->num_symtab = mod->core_num_syms;
+	mod->symtab = mod->core_symtab;
+	mod->strtab = mod->core_strtab;
+#endif
 	module_free(mod, mod->module_init);
 	mod->module_init = NULL;
 	mod->init_size = 0;
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 5aa854f9e5a..2a5dfec8efe 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
  *       (hence either you are in the same cgroup as task, or in an
  *        ancestor cgroup thereof)
  */
-static int ns_can_attach(struct cgroup_subsys *ss,
-		struct cgroup *new_cgroup, struct task_struct *task)
+static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
+			 struct task_struct *task, bool threadgroup)
 {
 	if (current != task) {
 		if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
 	if (!cgroup_is_descendant(new_cgroup, task))
 		return -EPERM;
 
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+			if (!cgroup_is_descendant(new_cgroup, c)) {
+				rcu_read_unlock();
+				return -EPERM;
+			}
+		}
+		rcu_read_unlock();
+	}
+
 	return 0;
 }
 
diff --git a/kernel/params.c b/kernel/params.c
index 7f6912ced2b..9da58eabdcb 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -23,6 +23,7 @@
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/ctype.h>
 
 #if 0
 #define DEBUGP printk
@@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val)
 	}
 
 	for (i = 0; args[i]; i++) {
-		if (args[i] == ' ' && !in_quote)
+		if (isspace(args[i]) && !in_quote)
 			break;
 		if (equals == 0) {
 			if (args[i] == '=')
@@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val)
 		next = args + i;
 
 	/* Chew up trailing spaces. */
-	while (*next == ' ')
+	while (isspace(*next))
 		next++;
 	return next;
 }
@@ -138,7 +139,7 @@ int parse_args(const char *name,
 	DEBUGP("Parsing ARGS: %s\n", args);
 
 	/* Chew leading spaces */
-	while (*args == ' ')
+	while (isspace(*args))
 		args++;
 
 	while (*args) {
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 821722ae58a..86b3796b043 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
 {
 	if (!(flags & CLONE_NEWPID))
 		return get_pid_ns(old_ns);
-	if (flags & CLONE_THREAD)
+	if (flags & (CLONE_THREAD|CLONE_PARENT))
 		return ERR_PTR(-EINVAL);
 	return create_pid_namespace(old_ns);
 }
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8ba052c86d4..b101cdc4df3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -13,7 +13,6 @@
 
 #include <linux/module.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/delay.h>
 #include <linux/bitops.h>
 #include <linux/genhd.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 307c285af59..23bd09cd042 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh)
  * or self-reaping.  Do notification now if it would have happened earlier.
  * If it should reap itself, return true.
  *
- * If it's our own child, there is no notification to do.
- * But if our normal children self-reap, then this child
- * was prevented by ptrace and we must reap it now.
+ * If it's our own child, there is no notification to do. But if our normal
+ * children self-reap, then this child was prevented by ptrace and we must
+ * reap it now, in that case we must also wake up sub-threads sleeping in
+ * do_wait().
  */
 static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 {
@@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 		if (!task_detached(p) && thread_group_empty(p)) {
 			if (!same_thread_group(p->real_parent, tracer))
 				do_notify_parent(p, p->exit_signal);
-			else if (ignoring_children(tracer->sighand))
+			else if (ignoring_children(tracer->sighand)) {
+				__wake_up_parent(p, tracer);
 				p->exit_signal = -1;
+			}
 		}
 		if (task_detached(p)) {
 			/* Mark it as in the process of being reaped. */
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index e1338f07431..88faec23e83 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 {
 	spin_lock_init(&counter->lock);
 	counter->limit = RESOURCE_MAX;
+	counter->soft_limit = RESOURCE_MAX;
 	counter->parent = parent;
 }
 
@@ -36,17 +37,27 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
 }
 
 int res_counter_charge(struct res_counter *counter, unsigned long val,
-			struct res_counter **limit_fail_at)
+			struct res_counter **limit_fail_at,
+			struct res_counter **soft_limit_fail_at)
 {
 	int ret;
 	unsigned long flags;
 	struct res_counter *c, *u;
 
 	*limit_fail_at = NULL;
+	if (soft_limit_fail_at)
+		*soft_limit_fail_at = NULL;
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
 		ret = res_counter_charge_locked(c, val);
+		/*
+		 * With soft limits, we return the highest ancestor
+		 * that exceeds its soft limit
+		 */
+		if (soft_limit_fail_at &&
+			!res_counter_soft_limit_check_locked(c))
+			*soft_limit_fail_at = c;
 		spin_unlock(&c->lock);
 		if (ret < 0) {
 			*limit_fail_at = c;
@@ -74,7 +85,8 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
 	counter->usage -= val;
 }
 
-void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+void res_counter_uncharge(struct res_counter *counter, unsigned long val,
+				bool *was_soft_limit_excess)
 {
 	unsigned long flags;
 	struct res_counter *c;
@@ -82,6 +94,9 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
+		if (was_soft_limit_excess)
+			*was_soft_limit_excess =
+				!res_counter_soft_limit_check_locked(c);
 		res_counter_uncharge_locked(c, val);
 		spin_unlock(&c->lock);
 	}
@@ -101,6 +116,8 @@ res_counter_member(struct res_counter *counter, int member)
 		return &counter->limit;
 	case RES_FAILCNT:
 		return &counter->failcnt;
+	case RES_SOFT_LIMIT:
+		return &counter->soft_limit;
 	};
 
 	BUG();
diff --git a/kernel/sched.c b/kernel/sched.c
index 2f76e06bea5..ee61f454a98 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -10312,7 +10312,7 @@ static int sched_rt_global_constraints(void)
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 int sched_rt_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int ret;
@@ -10323,7 +10323,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
 	old_period = sysctl_sched_rt_period;
 	old_runtime = sysctl_sched_rt_runtime;
 
-	ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if (!ret && write) {
 		ret = sched_rt_global_constraints();
@@ -10377,8 +10377,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 }
 
 static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-		      struct task_struct *tsk)
+cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10388,15 +10387,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	if (tsk->sched_class != &fair_sched_class)
 		return -EINVAL;
 #endif
+	return 0;
+}
 
+static int
+cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		      struct task_struct *tsk, bool threadgroup)
+{
+	int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
+	if (retval)
+		return retval;
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+			retval = cpu_cgroup_can_attach_task(cgrp, c);
+			if (retval) {
+				rcu_read_unlock();
+				return retval;
+			}
+		}
+		rcu_read_unlock();
+	}
 	return 0;
 }
 
 static void
 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			struct cgroup *old_cont, struct task_struct *tsk)
+		  struct cgroup *old_cont, struct task_struct *tsk,
+		  bool threadgroup)
 {
 	sched_move_task(tsk);
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+			sched_move_task(c);
+		}
+		rcu_read_unlock();
+	}
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ecc637a0d59..4e777b47eed 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 
 #ifdef CONFIG_SCHED_DEBUG
 int sched_nr_latency_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (ret || !write)
 		return ret;
diff --git a/kernel/signal.c b/kernel/signal.c
index 64c5deeaca5..6705320784f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
 
 		if (why) {
 			/*
-			 * The first thread which returns from finish_stop()
+			 * The first thread which returns from do_signal_stop()
 			 * will take ->siglock, notice SIGNAL_CLD_MASK, and
 			 * notify its parent. See get_signal_to_deliver().
 			 */
@@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	return send_signal(sig, info, t, 0);
 }
 
+int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
+			bool group)
+{
+	unsigned long flags;
+	int ret = -ESRCH;
+
+	if (lock_task_sighand(p, &flags)) {
+		ret = send_signal(sig, info, p, group);
+		unlock_task_sighand(p, &flags);
+	}
+
+	return ret;
+}
+
 /*
  * Force a signal that the process can't ignore: if necessary
  * we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1036,12 +1050,6 @@ void zap_other_threads(struct task_struct *p)
 	}
 }
 
-int __fatal_signal_pending(struct task_struct *tsk)
-{
-	return sigismember(&tsk->pending.signal, SIGKILL);
-}
-EXPORT_SYMBOL(__fatal_signal_pending);
-
 struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
 {
 	struct sighand_struct *sighand;
@@ -1068,18 +1076,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
  */
 int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
-	unsigned long flags;
-	int ret;
+	int ret = check_kill_permission(sig, info, p);
 
-	ret = check_kill_permission(sig, info, p);
-
-	if (!ret && sig) {
-		ret = -ESRCH;
-		if (lock_task_sighand(p, &flags)) {
-			ret = __group_send_sig_info(sig, info, p);
-			unlock_task_sighand(p, &flags);
-		}
-	}
+	if (!ret && sig)
+		ret = do_send_sig_info(sig, info, p, true);
 
 	return ret;
 }
@@ -1224,15 +1224,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
  * These are for backward compatibility with the rest of the kernel source.
  */
 
-/*
- * The caller must ensure the task can't exit.
- */
 int
 send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
-	int ret;
-	unsigned long flags;
-
 	/*
 	 * Make sure legacy kernel users don't send in bad values
 	 * (normal paths check this in check_kill_permission).
@@ -1240,10 +1234,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	if (!valid_signal(sig))
 		return -EINVAL;
 
-	spin_lock_irqsave(&p->sighand->siglock, flags);
-	ret = specific_send_sig_info(sig, info, p);
-	spin_unlock_irqrestore(&p->sighand->siglock, flags);
-	return ret;
+	return do_send_sig_info(sig, info, p, false);
 }
 
 #define __si_special(priv) \
@@ -1383,15 +1374,6 @@ ret:
 }
 
 /*
- * Wake up any threads in the parent blocked in wait* syscalls.
- */
-static inline void __wake_up_parent(struct task_struct *p,
-				    struct task_struct *parent)
-{
-	wake_up_interruptible_sync(&parent->signal->wait_chldexit);
-}
-
-/*
  * Let a parent know about the death of a child.
  * For a stopped/continued status change, use do_notify_parent_cldstop instead.
  *
@@ -1673,29 +1655,6 @@ void ptrace_notify(int exit_code)
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
-static void
-finish_stop(int stop_count)
-{
-	/*
-	 * If there are no other threads in the group, or if there is
-	 * a group stop in progress and we are the last to stop,
-	 * report to the parent.  When ptraced, every thread reports itself.
-	 */
-	if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
-		read_lock(&tasklist_lock);
-		do_notify_parent_cldstop(current, CLD_STOPPED);
-		read_unlock(&tasklist_lock);
-	}
-
-	do {
-		schedule();
-	} while (try_to_freeze());
-	/*
-	 * Now we don't run again until continued.
-	 */
-	current->exit_code = 0;
-}
-
 /*
  * This performs the stopping for SIGSTOP and other stop signals.
  * We have to stop all threads in the thread group.
@@ -1705,15 +1664,9 @@ finish_stop(int stop_count)
 static int do_signal_stop(int signr)
 {
 	struct signal_struct *sig = current->signal;
-	int stop_count;
+	int notify;
 
-	if (sig->group_stop_count > 0) {
-		/*
-		 * There is a group stop in progress.  We don't need to
-		 * start another one.
-		 */
-		stop_count = --sig->group_stop_count;
-	} else {
+	if (!sig->group_stop_count) {
 		struct task_struct *t;
 
 		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
@@ -1725,7 +1678,7 @@ static int do_signal_stop(int signr)
 		 */
 		sig->group_exit_code = signr;
 
-		stop_count = 0;
+		sig->group_stop_count = 1;
 		for (t = next_thread(current); t != current; t = next_thread(t))
 			/*
 			 * Setting state to TASK_STOPPED for a group
@@ -1734,19 +1687,44 @@ static int do_signal_stop(int signr)
 			 */
 			if (!(t->flags & PF_EXITING) &&
 			    !task_is_stopped_or_traced(t)) {
-				stop_count++;
+				sig->group_stop_count++;
 				signal_wake_up(t, 0);
 			}
-		sig->group_stop_count = stop_count;
 	}
+	/*
+	 * If there are no other threads in the group, or if there is
+	 * a group stop in progress and we are the last to stop, report
+	 * to the parent.  When ptraced, every thread reports itself.
+	 */
+	notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
+	notify = tracehook_notify_jctl(notify, CLD_STOPPED);
+	/*
+	 * tracehook_notify_jctl() can drop and reacquire siglock, so
+	 * we keep ->group_stop_count != 0 before the call. If SIGCONT
+	 * or SIGKILL comes in between ->group_stop_count == 0.
+	 */
+	if (sig->group_stop_count) {
+		if (!--sig->group_stop_count)
+			sig->flags = SIGNAL_STOP_STOPPED;
+		current->exit_code = sig->group_exit_code;
+		__set_current_state(TASK_STOPPED);
+	}
+	spin_unlock_irq(&current->sighand->siglock);
 
-	if (stop_count == 0)
-		sig->flags = SIGNAL_STOP_STOPPED;
-	current->exit_code = sig->group_exit_code;
-	__set_current_state(TASK_STOPPED);
+	if (notify) {
+		read_lock(&tasklist_lock);
+		do_notify_parent_cldstop(current, notify);
+		read_unlock(&tasklist_lock);
+	}
+
+	/* Now we don't run again until woken by SIGCONT or SIGKILL */
+	do {
+		schedule();
+	} while (try_to_freeze());
+
+	tracehook_finish_jctl();
+	current->exit_code = 0;
 
-	spin_unlock_irq(&current->sighand->siglock);
-	finish_stop(stop_count);
 	return 1;
 }
 
@@ -1815,14 +1793,15 @@ relock:
 		int why = (signal->flags & SIGNAL_STOP_CONTINUED)
 				? CLD_CONTINUED : CLD_STOPPED;
 		signal->flags &= ~SIGNAL_CLD_MASK;
-		spin_unlock_irq(&sighand->siglock);
 
-		if (unlikely(!tracehook_notify_jctl(1, why)))
-			goto relock;
+		why = tracehook_notify_jctl(why, CLD_CONTINUED);
+		spin_unlock_irq(&sighand->siglock);
 
-		read_lock(&tasklist_lock);
-		do_notify_parent_cldstop(current->group_leader, why);
-		read_unlock(&tasklist_lock);
+		if (why) {
+			read_lock(&tasklist_lock);
+			do_notify_parent_cldstop(current->group_leader, why);
+			read_unlock(&tasklist_lock);
+		}
 		goto relock;
 	}
 
@@ -1987,14 +1966,14 @@ void exit_signals(struct task_struct *tsk)
 	if (unlikely(tsk->signal->group_stop_count) &&
 			!--tsk->signal->group_stop_count) {
 		tsk->signal->flags = SIGNAL_STOP_STOPPED;
-		group_stop = 1;
+		group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
 	}
 out:
 	spin_unlock_irq(&tsk->sighand->siglock);
 
-	if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
+	if (unlikely(group_stop)) {
 		read_lock(&tasklist_lock);
-		do_notify_parent_cldstop(tsk, CLD_STOPPED);
+		do_notify_parent_cldstop(tsk, group_stop);
 		read_unlock(&tasklist_lock);
 	}
 }
@@ -2290,7 +2269,6 @@ static int
 do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
 {
 	struct task_struct *p;
-	unsigned long flags;
 	int error = -ESRCH;
 
 	rcu_read_lock();
@@ -2300,14 +2278,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
 		/*
 		 * The null signal is a permissions and process existence
 		 * probe.  No signal is actually delivered.
-		 *
-		 * If lock_task_sighand() fails we pretend the task dies
-		 * after receiving the signal. The window is tiny, and the
-		 * signal is private anyway.
 		 */
-		if (!error && sig && lock_task_sighand(p, &flags)) {
-			error = specific_send_sig_info(sig, info, p);
-			unlock_task_sighand(p, &flags);
+		if (!error && sig) {
+			error = do_send_sig_info(sig, info, p, false);
+			/*
+			 * If lock_task_sighand() failed we pretend the task
+			 * dies after receiving the signal. The window is tiny,
+			 * and the signal is private anyway.
+			 */
+			if (unlikely(error == -ESRCH))
+				error = 0;
 		}
 	}
 	rcu_read_unlock();
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 09d7519557d..0d31135efbf 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long);
 static void slow_work_oom_timeout(unsigned long);
 
 #ifdef CONFIG_SYSCTL
-static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
+static int slow_work_min_threads_sysctl(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 
-static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
+static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
 					void __user *, size_t *, loff_t *);
 #endif
 
@@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data)
  * Handle adjustment of the minimum number of threads
  */
 static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
-					struct file *filp, void __user *buffer,
+					void __user *buffer,
 					size_t *lenp, loff_t *ppos)
 {
-	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	int n;
 
 	if (ret == 0) {
@@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
  * Handle adjustment of the maximum number of threads
  */
 static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
-					struct file *filp, void __user *buffer,
+					void __user *buffer,
 					size_t *lenp, loff_t *ppos)
 {
-	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	int n;
 
 	if (ret == 0) {
diff --git a/kernel/smp.c b/kernel/smp.c
index fd47a256a24..c9d1c7835c2 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -347,13 +347,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
 	generic_exec_single(cpu, data, wait);
 }
 
-/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
-
-#ifndef arch_send_call_function_ipi_mask
-# define arch_send_call_function_ipi_mask(maskp) \
-	 arch_send_call_function_ipi(*(maskp))
-#endif
-
 /**
  * smp_call_function_many(): Run a function on a set of other CPUs.
  * @mask: The set of cpus to run on (only runs on online subset).
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 88796c33083..81324d12eb3 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void)
 EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
 
 int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
-			     struct file *filp, void __user *buffer,
+			     void __user *buffer,
 			     size_t *lenp, loff_t *ppos)
 {
 	touch_all_softlockup_watchdogs();
-	return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 }
 
 /*
diff --git a/kernel/sys.c b/kernel/sys.c
index ebcb1561172..255475d163e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1542,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 				current->timer_slack_ns = arg2;
 			error = 0;
 			break;
+		case PR_MCE_KILL:
+			if (arg4 | arg5)
+				return -EINVAL;
+			switch (arg2) {
+			case 0:
+				if (arg3 != 0)
+					return -EINVAL;
+				current->flags &= ~PF_MCE_PROCESS;
+				break;
+			case 1:
+				current->flags |= PF_MCE_PROCESS;
+				if (arg3 != 0)
+					current->flags |= PF_MCE_EARLY;
+				else
+					current->flags &= ~PF_MCE_EARLY;
+				break;
+			default:
+				return -EINVAL;
+			}
+			error = 0;
+			break;
+
 		default:
 			error = -EINVAL;
 			break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 515bc230ac2..e06d0b8d195 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -49,6 +49,7 @@ cond_syscall(sys_sendmsg);
 cond_syscall(compat_sys_sendmsg);
 cond_syscall(sys_recvmsg);
 cond_syscall(compat_sys_recvmsg);
+cond_syscall(compat_sys_recvfrom);
 cond_syscall(sys_socketcall);
 cond_syscall(sys_futex);
 cond_syscall(compat_sys_futex);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0dfaa47d7cb..0d949c51741 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -26,7 +26,6 @@
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
-#include <linux/utsname.h>
 #include <linux/kmemcheck.h>
 #include <linux/smp_lock.h>
 #include <linux/fs.h>
@@ -77,6 +76,7 @@ extern int max_threads;
 extern int core_uses_pid;
 extern int suid_dumpable;
 extern char core_pattern[];
+extern unsigned int core_pipe_limit;
 extern int pid_max;
 extern int min_free_kbytes;
 extern int pid_max_min, pid_max_max;
@@ -163,9 +163,9 @@ extern int max_lock_depth;
 #endif
 
 #ifdef CONFIG_PROC_SYSCTL
-static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
+static int proc_do_cad_pid(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
-static int proc_taint(struct ctl_table *table, int write, struct file *filp,
+static int proc_taint(struct ctl_table *table, int write,
 			       void __user *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
@@ -424,6 +424,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "core_pipe_limit",
+		.data		= &core_pipe_limit,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #ifdef CONFIG_PROC_SYSCTL
 	{
 		.procname	= "tainted",
@@ -1390,6 +1398,31 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &scan_unevictable_handler,
 	},
+#ifdef CONFIG_MEMORY_FAILURE
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "memory_failure_early_kill",
+		.data		= &sysctl_memory_failure_early_kill,
+		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "memory_failure_recovery",
+		.data		= &sysctl_memory_failure_recovery,
+		.maxlen		= sizeof(sysctl_memory_failure_recovery),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
+
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
@@ -2218,7 +2251,7 @@ void sysctl_head_put(struct ctl_table_header *head)
 #ifdef CONFIG_PROC_SYSCTL
 
 static int _proc_do_string(void* data, int maxlen, int write,
-			   struct file *filp, void __user *buffer,
+			   void __user *buffer,
 			   size_t *lenp, loff_t *ppos)
 {
 	size_t len;
@@ -2279,7 +2312,6 @@ static int _proc_do_string(void* data, int maxlen, int write,
  * proc_dostring - read a string sysctl
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2293,10 +2325,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
  *
  * Returns 0 on success.
  */
-int proc_dostring(struct ctl_table *table, int write, struct file *filp,
+int proc_dostring(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	return _proc_do_string(table->data, table->maxlen, write, filp,
+	return _proc_do_string(table->data, table->maxlen, write,
 			       buffer, lenp, ppos);
 }
 
@@ -2321,7 +2353,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
 }
 
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
-		  int write, struct file *filp, void __user *buffer,
+		  int write, void __user *buffer,
 		  size_t *lenp, loff_t *ppos,
 		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
@@ -2428,13 +2460,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 #undef TMPBUFLEN
 }
 
-static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+static int do_proc_dointvec(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos,
 		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
 		  void *data)
 {
-	return __do_proc_dointvec(table->data, table, write, filp,
+	return __do_proc_dointvec(table->data, table, write,
 			buffer, lenp, ppos, conv, data);
 }
 
@@ -2442,7 +2474,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
  * proc_dointvec - read a vector of integers
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2452,10 +2483,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
  *
  * Returns 0 on success.
  */
-int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec(struct ctl_table *table, int write,
 		     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+    return do_proc_dointvec(table,write,buffer,lenp,ppos,
 		    	    NULL,NULL);
 }
 
@@ -2463,7 +2494,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
  * Taint values can only be increased
  * This means we can safely use a temporary.
  */
-static int proc_taint(struct ctl_table *table, int write, struct file *filp,
+static int proc_taint(struct ctl_table *table, int write,
 			       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table t;
@@ -2475,7 +2506,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp,
 
 	t = *table;
 	t.data = &tmptaint;
-	err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
+	err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
 	if (err < 0)
 		return err;
 
@@ -2527,7 +2558,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
  * proc_dointvec_minmax - read a vector of integers with min/max values
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2540,19 +2570,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
  *
  * Returns 0 on success.
  */
-int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_minmax(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct do_proc_dointvec_minmax_conv_param param = {
 		.min = (int *) table->extra1,
 		.max = (int *) table->extra2,
 	};
-	return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
 				do_proc_dointvec_minmax_conv, &param);
 }
 
 static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
-				     struct file *filp,
 				     void __user *buffer,
 				     size_t *lenp, loff_t *ppos,
 				     unsigned long convmul,
@@ -2657,21 +2686,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
 }
 
 static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
-				     struct file *filp,
 				     void __user *buffer,
 				     size_t *lenp, loff_t *ppos,
 				     unsigned long convmul,
 				     unsigned long convdiv)
 {
 	return __do_proc_doulongvec_minmax(table->data, table, write,
-			filp, buffer, lenp, ppos, convmul, convdiv);
+			buffer, lenp, ppos, convmul, convdiv);
 }
 
 /**
  * proc_doulongvec_minmax - read a vector of long integers with min/max values
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2684,17 +2711,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
  *
  * Returns 0 on success.
  */
-int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l);
+    return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
 }
 
 /**
  * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2709,11 +2735,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp
  * Returns 0 on success.
  */
 int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
-				      struct file *filp,
 				      void __user *buffer,
 				      size_t *lenp, loff_t *ppos)
 {
-    return do_proc_doulongvec_minmax(table, write, filp, buffer,
+    return do_proc_doulongvec_minmax(table, write, buffer,
 				     lenp, ppos, HZ, 1000l);
 }
 
@@ -2789,7 +2814,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
  * proc_dointvec_jiffies - read a vector of integers as seconds
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2801,10 +2825,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
  *
  * Returns 0 on success.
  */
-int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
 			  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+    return do_proc_dointvec(table,write,buffer,lenp,ppos,
 		    	    do_proc_dointvec_jiffies_conv,NULL);
 }
 
@@ -2812,7 +2836,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
  * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: pointer to the file position
@@ -2824,10 +2847,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
  *
  * Returns 0 on success.
  */
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+    return do_proc_dointvec(table,write,buffer,lenp,ppos,
 		    	    do_proc_dointvec_userhz_jiffies_conv,NULL);
 }
 
@@ -2835,7 +2858,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
  * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2848,14 +2870,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
  *
  * Returns 0 on success.
  */
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
 			     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
 				do_proc_dointvec_ms_jiffies_conv, NULL);
 }
 
-static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
+static int proc_do_cad_pid(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct pid *new_pid;
@@ -2864,7 +2886,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
 
 	tmp = pid_vnr(cad_pid);
 
-	r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
+	r = __do_proc_dointvec(&tmp, table, write, buffer,
 			       lenp, ppos, NULL, NULL);
 	if (r || !write)
 		return r;
@@ -2879,50 +2901,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
 
 #else /* CONFIG_PROC_FS */
 
-int proc_dostring(struct ctl_table *table, int write, struct file *filp,
+int proc_dostring(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
-int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
-int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_minmax(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
-int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
 			     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
-int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
 
 int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
-				      struct file *filp,
 				      void __user *buffer,
 				      size_t *lenp, loff_t *ppos)
 {
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 0b0a6366c9d..ee266620b06 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
-obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
 
 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)		+= clockevents.o
 obj-$(CONFIG_GENERIC_CLOCKEVENTS)		+= tick-common.o
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
new file mode 100644
index 00000000000..86628e755f3
--- /dev/null
+++ b/kernel/time/timeconv.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
+ * This file is part of the GNU C Library.
+ * Contributed by Paul Eggert (eggert@twinsun.com).
+ *
+ * The GNU C Library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * The GNU C Library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the GNU C Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Converts the calendar time to broken-down time representation
+ * Based on code from glibc-2.6
+ *
+ * 2009-7-14:
+ *   Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
+ */
+
+#include <linux/time.h>
+#include <linux/module.h>
+
+/*
+ * Nonzero if YEAR is a leap year (every 4 years,
+ * except every 100th isn't, and every 400th is).
+ */
+static int __isleap(long year)
+{
+	return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
+}
+
+/* do a mathdiv for long type */
+static long math_div(long a, long b)
+{
+	return a / b - (a % b < 0);
+}
+
+/* How many leap years between y1 and y2, y1 must less or equal to y2 */
+static long leaps_between(long y1, long y2)
+{
+	long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
+		+ math_div(y1 - 1, 400);
+	long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
+		+ math_div(y2 - 1, 400);
+	return leaps2 - leaps1;
+}
+
+/* How many days come before each month (0-12). */
+static const unsigned short __mon_yday[2][13] = {
+	/* Normal years. */
+	{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
+	/* Leap years. */
+	{0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
+};
+
+#define SECS_PER_HOUR	(60 * 60)
+#define SECS_PER_DAY	(SECS_PER_HOUR * 24)
+
+/**
+ * time_to_tm - converts the calendar time to local broken-down time
+ *
+ * @totalsecs	the number of seconds elapsed since 00:00:00 on January 1, 1970,
+ *		Coordinated Universal Time (UTC).
+ * @offset	offset seconds adding to totalsecs.
+ * @result	pointer to struct tm variable to receive broken-down time
+ */
+void time_to_tm(time_t totalsecs, int offset, struct tm *result)
+{
+	long days, rem, y;
+	const unsigned short *ip;
+
+	days = totalsecs / SECS_PER_DAY;
+	rem = totalsecs % SECS_PER_DAY;
+	rem += offset;
+	while (rem < 0) {
+		rem += SECS_PER_DAY;
+		--days;
+	}
+	while (rem >= SECS_PER_DAY) {
+		rem -= SECS_PER_DAY;
+		++days;
+	}
+
+	result->tm_hour = rem / SECS_PER_HOUR;
+	rem %= SECS_PER_HOUR;
+	result->tm_min = rem / 60;
+	result->tm_sec = rem % 60;
+
+	/* January 1, 1970 was a Thursday. */
+	result->tm_wday = (4 + days) % 7;
+	if (result->tm_wday < 0)
+		result->tm_wday += 7;
+
+	y = 1970;
+
+	while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
+		/* Guess a corrected year, assuming 365 days per year. */
+		long yg = y + math_div(days, 365);
+
+		/* Adjust DAYS and Y to match the guessed year. */
+		days -= (yg - y) * 365 + leaps_between(y, yg);
+		y = yg;
+	}
+
+	result->tm_year = y - 1900;
+
+	result->tm_yday = days;
+
+	ip = __mon_yday[__isleap(y)];
+	for (y = 11; days < ip[y]; y--)
+		continue;
+	days -= ip[y];
+
+	result->tm_mon = y;
+	result->tm_mday = days + 1;
+}
+EXPORT_SYMBOL(time_to_tm);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 23df7771c93..a142579765b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3015,7 +3015,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 
 int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
-		     struct file *file, void __user *buffer, size_t *lenp,
+		     void __user *buffer, size_t *lenp,
 		     loff_t *ppos)
 {
 	int ret;
@@ -3025,7 +3025,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 	mutex_lock(&ftrace_lock);
 
-	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
+	ret  = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
 		goto out;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6c0f6a8a22e..411af37f4be 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1984,11 +1984,9 @@ __tracing_open(struct inode *inode, struct file *file)
 	if (current_trace)
 		*iter->trace = *current_trace;
 
-	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL))
+	if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
 		goto fail;
 
-	cpumask_clear(iter->started);
-
 	if (current_trace && current_trace->print_max)
 		iter->tr = &max_tr;
 	else
@@ -4389,7 +4387,7 @@ __init static int tracer_alloc_buffers(void)
 	if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
 		goto out_free_buffer_mask;
 
-	if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
+	if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
 		goto out_free_tracing_cpumask;
 
 	/* To save memory, keep the ring buffer size to its minimum */
@@ -4400,7 +4398,6 @@ __init static int tracer_alloc_buffers(void)
 
 	cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
 	cpumask_copy(tracing_cpumask, cpu_all_mask);
-	cpumask_clear(tracing_reader_cpumask);
 
 	/* TODO: make the number of buffers hot pluggable with CPUS */
 	global_trace.buffer = ring_buffer_alloc(ring_buf_size,
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0f6facb050a..8504ac71e4e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = {
 
 int
 stack_trace_sysctl(struct ctl_table *table, int write,
-		   struct file *file, void __user *buffer, size_t *lenp,
+		   void __user *buffer, size_t *lenp,
 		   loff_t *ppos)
 {
 	int ret;
 
 	mutex_lock(&stack_sysctl_mutex);
 
-	ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if (ret || !write ||
 	    (last_stack_tracer_enabled == !!stack_tracer_enabled))
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 0314501688b..419209893d8 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -4,7 +4,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/mman.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 92359cc747a..69eae358a72 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which)
  *	Special case of dostring for the UTS structure. This has locks
  *	to observe. Should this be in kernel/sys.c ????
  */
-static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
+static int proc_do_uts_string(ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table uts_table;
 	int r;
 	memcpy(&uts_table, table, sizeof(uts_table));
 	uts_table.data = get_uts(table, write);
-	r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos);
+	r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
 	put_uts(table, write, uts_table.data);
 	return r;
 }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d57b12f59c8..891155817bc 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -50,6 +50,14 @@ config MAGIC_SYSRQ
 	  keys are documented in <file:Documentation/sysrq.txt>. Don't say Y
 	  unless you really know what this hack does.
 
+config STRIP_ASM_SYMS
+	bool "Strip assembler-generated symbols during link"
+	default n
+	help
+	  Strip internal assembler-generated symbols during a link (symbols
+	  that look like '.Lxxx') so they don't pollute the output of
+	  get_wchan() and suchlike.
+
 config UNUSED_SYMBOLS
 	bool "Enable unused/obsolete exported symbols"
 	default y if X86
diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c
index 68dfce59c1b..fc686c7a0a0 100644
--- a/lib/decompress_inflate.c
+++ b/lib/decompress_inflate.c
@@ -27,6 +27,11 @@
 
 #define GZIP_IOBUF_SIZE (16*1024)
 
+static int nofill(void *buffer, unsigned int len)
+{
+	return -1;
+}
+
 /* Included from initramfs et al code */
 STATIC int INIT gunzip(unsigned char *buf, int len,
 		       int(*fill)(void*, unsigned int),
@@ -76,6 +81,9 @@ STATIC int INIT gunzip(unsigned char *buf, int len,
 		goto gunzip_nomem4;
 	}
 
+	if (!fill)
+		fill = nofill;
+
 	if (len == 0)
 		len = fill(zbuf, GZIP_IOBUF_SIZE);
 
diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c
index 0b954e04bd3..ca82fde81c8 100644
--- a/lib/decompress_unlzma.c
+++ b/lib/decompress_unlzma.c
@@ -82,6 +82,11 @@ struct rc {
 #define RC_MODEL_TOTAL_BITS 11
 
 
+static int nofill(void *buffer, unsigned int len)
+{
+	return -1;
+}
+
 /* Called twice: once at startup and once in rc_normalize() */
 static void INIT rc_read(struct rc *rc)
 {
@@ -97,7 +102,10 @@ static inline void INIT rc_init(struct rc *rc,
 				       int (*fill)(void*, unsigned int),
 				       char *buffer, int buffer_size)
 {
-	rc->fill = fill;
+	if (fill)
+		rc->fill = fill;
+	else
+		rc->fill = nofill;
 	rc->buffer = (uint8_t *)buffer;
 	rc->buffer_size = buffer_size;
 	rc->buffer_end = rc->buffer + rc->buffer_size;
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 73a14b8c6d1..b91839e9e89 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -671,7 +671,7 @@ static char *ip4_string(char *p, const u8 *addr, bool leading_zeros)
 	return p;
 }
 
-static char *ip6_compressed_string(char *p, const struct in6_addr *addr)
+static char *ip6_compressed_string(char *p, const char *addr)
 {
 	int i;
 	int j;
@@ -683,7 +683,12 @@ static char *ip6_compressed_string(char *p, const struct in6_addr *addr)
 	u8 hi;
 	u8 lo;
 	bool needcolon = false;
-	bool useIPv4 = ipv6_addr_v4mapped(addr) || ipv6_addr_is_isatap(addr);
+	bool useIPv4;
+	struct in6_addr in6;
+
+	memcpy(&in6, addr, sizeof(struct in6_addr));
+
+	useIPv4 = ipv6_addr_v4mapped(&in6) || ipv6_addr_is_isatap(&in6);
 
 	memset(zerolength, 0, sizeof(zerolength));
 
@@ -695,7 +700,7 @@ static char *ip6_compressed_string(char *p, const struct in6_addr *addr)
 	/* find position of longest 0 run */
 	for (i = 0; i < range; i++) {
 		for (j = i; j < range; j++) {
-			if (addr->s6_addr16[j] != 0)
+			if (in6.s6_addr16[j] != 0)
 				break;
 			zerolength[i]++;
 		}
@@ -722,7 +727,7 @@ static char *ip6_compressed_string(char *p, const struct in6_addr *addr)
 			needcolon = false;
 		}
 		/* hex u16 without leading 0s */
-		word = ntohs(addr->s6_addr16[i]);
+		word = ntohs(in6.s6_addr16[i]);
 		hi = word >> 8;
 		lo = word & 0xff;
 		if (hi) {
@@ -741,19 +746,19 @@ static char *ip6_compressed_string(char *p, const struct in6_addr *addr)
 	if (useIPv4) {
 		if (needcolon)
 			*p++ = ':';
-		p = ip4_string(p, &addr->s6_addr[12], false);
+		p = ip4_string(p, &in6.s6_addr[12], false);
 	}
 
 	*p = '\0';
 	return p;
 }
 
-static char *ip6_string(char *p, const struct in6_addr *addr, const char *fmt)
+static char *ip6_string(char *p, const char *addr, const char *fmt)
 {
 	int i;
 	for (i = 0; i < 8; i++) {
-		p = pack_hex_byte(p, addr->s6_addr[2 * i]);
-		p = pack_hex_byte(p, addr->s6_addr[2 * i + 1]);
+		p = pack_hex_byte(p, *addr++);
+		p = pack_hex_byte(p, *addr++);
 		if (fmt[0] == 'I' && i != 7)
 			*p++ = ':';
 	}
@@ -768,9 +773,9 @@ static char *ip6_addr_string(char *buf, char *end, const u8 *addr,
 	char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")];
 
 	if (fmt[0] == 'I' && fmt[2] == 'c')
-		ip6_compressed_string(ip6_addr, (const struct in6_addr *)addr);
+		ip6_compressed_string(ip6_addr, addr);
 	else
-		ip6_string(ip6_addr, (const struct in6_addr *)addr, fmt);
+		ip6_string(ip6_addr, addr, fmt);
 
 	return string(buf, end, ip6_addr, spec);
 }
diff --git a/mm/Kconfig b/mm/Kconfig
index 71eb0b4cce8..24776072959 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -245,6 +245,20 @@ config DEFAULT_MMAP_MIN_ADDR
 	  /proc/sys/vm/mmap_min_addr tunable.
 
 
+config MEMORY_FAILURE
+	depends on MMU
+	depends on X86_MCE
+	bool "Enable recovery from hardware memory errors"
+	help
+	  Enables code to recover from some memory failures on systems
+	  with MCA recovery. This allows a system to continue running
+	  even when some of its memory has uncorrected errors. This requires
+	  special hardware support and typically ECC memory.
+
+config HWPOISON_INJECT
+	tristate "Poison pages injector"
+	depends on MEMORY_FAILURE && DEBUG_KERNEL
+
 config NOMMU_INITIAL_TRIM_EXCESS
 	int "Turn on mmap() excess space trimming before booting"
 	depends on !MMU
diff --git a/mm/Makefile b/mm/Makefile
index 88193d73cd1..515fd793c17 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -41,5 +41,7 @@ obj-$(CONFIG_SMP) += allocpercpu.o
 endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
+obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/filemap.c b/mm/filemap.c
index bcc7372aebb..6c84e598b4a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -58,7 +58,7 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_lock		(vmtruncate)
+ *  ->i_mmap_lock		(truncate_pagecache)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
@@ -104,6 +104,10 @@
  *
  *  ->task->proc_lock
  *    ->dcache_lock		(proc_pid_lookup)
+ *
+ *  (code doesn't rely on that order, so you could switch it around)
+ *  ->tasklist_lock             (memory_failure, collect_procs_ao)
+ *    ->i_mmap_lock
  */
 
 /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 815dbd4a6dc..6f048fcc749 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1537,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 
 #ifdef CONFIG_SYSCTL
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
-			   struct file *file, void __user *buffer,
+			   void __user *buffer,
 			   size_t *length, loff_t *ppos)
 {
 	struct hstate *h = &default_hstate;
@@ -1548,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 
 	table->data = &tmp;
 	table->maxlen = sizeof(unsigned long);
-	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+	proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
 	if (write)
 		h->max_huge_pages = set_max_huge_pages(h, tmp);
@@ -1557,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 }
 
 int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
-			struct file *file, void __user *buffer,
+			void __user *buffer,
 			size_t *length, loff_t *ppos)
 {
-	proc_dointvec(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, buffer, length, ppos);
 	if (hugepages_treat_as_movable)
 		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
 	else
@@ -1569,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
 }
 
 int hugetlb_overcommit_handler(struct ctl_table *table, int write,
-			struct file *file, void __user *buffer,
+			void __user *buffer,
 			size_t *length, loff_t *ppos)
 {
 	struct hstate *h = &default_hstate;
@@ -1580,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 
 	table->data = &tmp;
 	table->maxlen = sizeof(unsigned long);
-	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+	proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
 	if (write) {
 		spin_lock(&hugetlb_lock);
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 00000000000..e1d85137f08
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,41 @@
+/* Inject a hwpoison memory failure on a arbitary pfn */
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+
+static struct dentry *hwpoison_dir, *corrupt_pfn;
+
+static int hwpoison_inject(void *data, u64 val)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val);
+	return __memory_failure(val, 18, 0);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
+
+static void pfn_inject_exit(void)
+{
+	if (hwpoison_dir)
+		debugfs_remove_recursive(hwpoison_dir);
+}
+
+static int pfn_inject_init(void)
+{
+	hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
+	if (hwpoison_dir == NULL)
+		return -ENOMEM;
+	corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
+					  NULL, &hwpoison_fops);
+	if (corrupt_pfn == NULL) {
+		pfn_inject_exit();
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+module_init(pfn_inject_init);
+module_exit(pfn_inject_exit);
+MODULE_LICENSE("GPL");
diff --git a/mm/ksm.c b/mm/ksm.c
index 37cc3732509..f7edac356f4 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -30,6 +30,7 @@
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/mmu_notifier.h>
+#include <linux/swap.h>
 #include <linux/ksm.h>
 
 #include <asm/tlbflush.h>
@@ -162,10 +163,10 @@ static unsigned long ksm_pages_unshared;
 static unsigned long ksm_rmap_items;
 
 /* Limit on the number of unswappable pages used */
-static unsigned long ksm_max_kernel_pages = 2000;
+static unsigned long ksm_max_kernel_pages;
 
 /* Number of pages ksmd should scan in one batch */
-static unsigned int ksm_thread_pages_to_scan = 200;
+static unsigned int ksm_thread_pages_to_scan = 100;
 
 /* Milliseconds ksmd should sleep between batches */
 static unsigned int ksm_thread_sleep_millisecs = 20;
@@ -173,7 +174,7 @@ static unsigned int ksm_thread_sleep_millisecs = 20;
 #define KSM_RUN_STOP	0
 #define KSM_RUN_MERGE	1
 #define KSM_RUN_UNMERGE	2
-static unsigned int ksm_run = KSM_RUN_MERGE;
+static unsigned int ksm_run = KSM_RUN_STOP;
 
 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
 static DEFINE_MUTEX(ksm_thread_mutex);
@@ -183,6 +184,11 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock);
 		sizeof(struct __struct), __alignof__(struct __struct),\
 		(__flags), NULL)
 
+static void __init ksm_init_max_kernel_pages(void)
+{
+	ksm_max_kernel_pages = nr_free_buffer_pages() / 4;
+}
+
 static int __init ksm_slab_init(void)
 {
 	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
@@ -1667,6 +1673,8 @@ static int __init ksm_init(void)
 	struct task_struct *ksm_thread;
 	int err;
 
+	ksm_init_max_kernel_pages();
+
 	err = ksm_slab_init();
 	if (err)
 		goto out;
diff --git a/mm/madvise.c b/mm/madvise.c
index d9ae2067952..35b1479b7c9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,6 +218,32 @@ static long madvise_remove(struct vm_area_struct *vma,
 	return error;
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Error injection support for memory error handling.
+ */
+static int madvise_hwpoison(unsigned long start, unsigned long end)
+{
+	int ret = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	for (; start < end; start += PAGE_SIZE) {
+		struct page *p;
+		int ret = get_user_pages(current, current->mm, start, 1,
+						0, 0, &p, NULL);
+		if (ret != 1)
+			return ret;
+		printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
+		       page_to_pfn(p), start);
+		/* Ignore return value for now */
+		__memory_failure(page_to_pfn(p), 0, 1);
+		put_page(p);
+	}
+	return ret;
+}
+#endif
+
 static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		unsigned long start, unsigned long end, int behavior)
@@ -308,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	int write;
 	size_t len;
 
+#ifdef CONFIG_MEMORY_FAILURE
+	if (behavior == MADV_HWPOISON)
+		return madvise_hwpoison(start, start+len_in);
+#endif
 	if (!madvise_behavior_valid(behavior))
 		return error;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9b10d875378..e2b98a6875c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
 #include <linux/rcupdate.h>
 #include <linux/limits.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -43,6 +44,7 @@
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
+struct mem_cgroup *root_mem_cgroup __read_mostly;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #endif
 
 static DEFINE_MUTEX(memcg_tasklist);	/* can be hold under cgroup_mutex */
+#define SOFTLIMIT_EVENTS_THRESH (1000)
 
 /*
  * Statistics for memory cgroup.
@@ -66,6 +69,8 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
+	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
+	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 
 	MEM_CGROUP_STAT_NSTATS,
 };
@@ -78,6 +83,20 @@ struct mem_cgroup_stat {
 	struct mem_cgroup_stat_cpu cpustat[0];
 };
 
+static inline void
+__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
+				enum mem_cgroup_stat_index idx)
+{
+	stat->count[idx] = 0;
+}
+
+static inline s64
+__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
+				enum mem_cgroup_stat_index idx)
+{
+	return stat->count[idx];
+}
+
 /*
  * For accounting under irq disable, no need for increment preempt count.
  */
@@ -117,6 +136,12 @@ struct mem_cgroup_per_zone {
 	unsigned long		count[NR_LRU_LISTS];
 
 	struct zone_reclaim_stat reclaim_stat;
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long long	usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
+	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
+						/* use container_of	   */
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
@@ -130,6 +155,26 @@ struct mem_cgroup_lru_info {
 };
 
 /*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
+/*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -186,6 +231,13 @@ struct mem_cgroup {
 	struct mem_cgroup_stat stat;
 };
 
+/*
+ * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
+ * limit reclaim to prevent infinite loops, if they ever occur.
+ */
+#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
+#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
+
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -200,13 +252,8 @@ enum charge_type {
 #define PCGF_CACHE	(1UL << PCG_CACHE)
 #define PCGF_USED	(1UL << PCG_USED)
 #define PCGF_LOCK	(1UL << PCG_LOCK)
-static const unsigned long
-pcg_default_flags[NR_CHARGE_TYPE] = {
-	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
-	PCGF_USED | PCGF_LOCK, /* Anon */
-	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
-	0, /* FORCE */
-};
+/* Not used, but added here for completeness */
+#define PCGF_ACCT	(1UL << PCG_ACCT)
 
 /* for encoding cft->private value on file */
 #define _MEM			(0)
@@ -215,15 +262,241 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
 #define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
 
+/*
+ * Reclaim flags for mem_cgroup_hierarchical_reclaim
+ */
+#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
+#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
+#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
+#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
+#define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
+
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 
+static struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+{
+	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+}
+
+static struct mem_cgroup_per_zone *
+page_cgroup_zoneinfo(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem = pc->mem_cgroup;
+	int nid = page_cgroup_nid(pc);
+	int zid = page_cgroup_zid(pc);
+
+	if (!mem)
+		return NULL;
+
+	return mem_cgroup_zoneinfo(mem, nid, zid);
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_insert_exceeded(mem, mz, mctz);
+	spin_unlock(&mctz->lock);
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_remove_exceeded(mem, mz, mctz);
+	spin_unlock(&mctz->lock);
+}
+
+static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
+{
+	bool ret = false;
+	int cpu;
+	s64 val;
+	struct mem_cgroup_stat_cpu *cpustat;
+
+	cpu = get_cpu();
+	cpustat = &mem->stat.cpustat[cpu];
+	val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
+	if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
+		__mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
+		ret = true;
+	}
+	put_cpu();
+	return ret;
+}
+
+static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
+{
+	unsigned long long prev_usage_in_excess, new_usage_in_excess;
+	bool updated_tree = false;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * We do updates in lazy mode, mem's are removed
+	 * lazily from the per-zone, per-node rb tree
+	 */
+	prev_usage_in_excess = mz->usage_in_excess;
+
+	new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+	if (prev_usage_in_excess) {
+		mem_cgroup_remove_exceeded(mem, mz, mctz);
+		updated_tree = true;
+	}
+	if (!new_usage_in_excess)
+		goto done;
+	mem_cgroup_insert_exceeded(mem, mz, mctz);
+
+done:
+	if (updated_tree) {
+		spin_lock(&mctz->lock);
+		mz->usage_in_excess = new_usage_in_excess;
+		spin_unlock(&mctz->lock);
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node_state(node, N_POSSIBLE) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(mem, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(mem, mz, mctz);
+		}
+	}
+}
+
+static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
+{
+	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz = NULL;
+
+retry:
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back,
+	 * we will to add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
+		!css_tryget(&mz->mem->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
+static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
+					 bool charge)
+{
+	int val = (charge) ? 1 : -1;
+	struct mem_cgroup_stat *stat = &mem->stat;
+	struct mem_cgroup_stat_cpu *cpustat;
+	int cpu = get_cpu();
+
+	cpustat = &stat->cpustat[cpu];
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
+	put_cpu();
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 struct page_cgroup *pc,
 					 bool charge)
 {
-	int val = (charge)? 1 : -1;
+	int val = (charge) ? 1 : -1;
 	struct mem_cgroup_stat *stat = &mem->stat;
 	struct mem_cgroup_stat_cpu *cpustat;
 	int cpu = get_cpu();
@@ -240,28 +513,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	else
 		__mem_cgroup_stat_add_safe(cpustat,
 				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
 	put_cpu();
 }
 
-static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
-{
-	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
-}
-
-static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct page_cgroup *pc)
-{
-	struct mem_cgroup *mem = pc->mem_cgroup;
-	int nid = page_cgroup_nid(pc);
-	int zid = page_cgroup_zid(pc);
-
-	if (!mem)
-		return NULL;
-
-	return mem_cgroup_zoneinfo(mem, nid, zid);
-}
-
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
@@ -354,6 +609,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
 	return ret;
 }
 
+static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
+{
+	return (mem == root_mem_cgroup);
+}
+
 /*
  * Following LRU functions are allowed to be used without PCG_LOCK.
  * Operations are called by routine of global LRU independently from memcg.
@@ -371,22 +631,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
 {
 	struct page_cgroup *pc;
-	struct mem_cgroup *mem;
 	struct mem_cgroup_per_zone *mz;
 
 	if (mem_cgroup_disabled())
 		return;
 	pc = lookup_page_cgroup(page);
 	/* can happen while we handle swapcache. */
-	if (list_empty(&pc->lru) || !pc->mem_cgroup)
+	if (!TestClearPageCgroupAcctLRU(pc))
 		return;
+	VM_BUG_ON(!pc->mem_cgroup);
 	/*
 	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
 	 * removed from global LRU.
 	 */
 	mz = page_cgroup_zoneinfo(pc);
-	mem = pc->mem_cgroup;
 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	if (mem_cgroup_is_root(pc->mem_cgroup))
+		return;
+	VM_BUG_ON(list_empty(&pc->lru));
 	list_del_init(&pc->lru);
 	return;
 }
@@ -410,8 +672,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
 	 */
 	smp_rmb();
-	/* unused page is not rotated. */
-	if (!PageCgroupUsed(pc))
+	/* unused or root page is not rotated. */
+	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	mz = page_cgroup_zoneinfo(pc);
 	list_move(&pc->lru, &mz->lists[lru]);
@@ -425,6 +687,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 	if (mem_cgroup_disabled())
 		return;
 	pc = lookup_page_cgroup(page);
+	VM_BUG_ON(PageCgroupAcctLRU(pc));
 	/*
 	 * Used bit is set without atomic ops but after smp_wmb().
 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -435,6 +698,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 
 	mz = page_cgroup_zoneinfo(pc);
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	SetPageCgroupAcctLRU(pc);
+	if (mem_cgroup_is_root(pc->mem_cgroup))
+		return;
 	list_add(&pc->lru, &mz->lists[lru]);
 }
 
@@ -469,7 +735,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
 
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	/* link when the page is linked to LRU but page_cgroup isn't */
-	if (PageLRU(page) && list_empty(&pc->lru))
+	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
 		mem_cgroup_add_lru_list(page, page_lru(page));
 	spin_unlock_irqrestore(&zone->lru_lock, flags);
 }
@@ -855,28 +1121,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
  * If shrink==true, for avoiding to free too much, this returns immedieately.
  */
 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
-				   gfp_t gfp_mask, bool noswap, bool shrink)
+						struct zone *zone,
+						gfp_t gfp_mask,
+						unsigned long reclaim_options)
 {
 	struct mem_cgroup *victim;
 	int ret, total = 0;
 	int loop = 0;
+	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
+	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
+	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+	unsigned long excess = mem_cgroup_get_excess(root_mem);
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
 	if (root_mem->memsw_is_minimum)
 		noswap = true;
 
-	while (loop < 2) {
+	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
-		if (victim == root_mem)
+		if (victim == root_mem) {
 			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!check_soft || !total) {
+					css_put(&victim->css);
+					break;
+				}
+				/*
+				 * We want to do more targetted reclaim.
+				 * excess >> 2 is not to excessive so as to
+				 * reclaim too much, nor too less that we keep
+				 * coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
+					css_put(&victim->css);
+					break;
+				}
+			}
+		}
 		if (!mem_cgroup_local_usage(&victim->stat)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
 		}
 		/* we use swappiness of local cgroup */
-		ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
-						   get_swappiness(victim));
+		if (check_soft)
+			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
+				noswap, get_swappiness(victim), zone,
+				zone->zone_pgdat->node_id);
+		else
+			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
+						noswap, get_swappiness(victim));
 		css_put(&victim->css);
 		/*
 		 * At shrinking usage, we can't check we should stop here or
@@ -886,7 +1186,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		if (shrink)
 			return ret;
 		total += ret;
-		if (mem_cgroup_check_under_limit(root_mem))
+		if (check_soft) {
+			if (res_counter_check_under_soft_limit(&root_mem->res))
+				return total;
+		} else if (mem_cgroup_check_under_limit(root_mem))
 			return 1 + total;
 	}
 	return total;
@@ -965,11 +1268,11 @@ done:
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			gfp_t gfp_mask, struct mem_cgroup **memcg,
-			bool oom)
+			bool oom, struct page *page)
 {
-	struct mem_cgroup *mem, *mem_over_limit;
+	struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct res_counter *fail_res;
+	struct res_counter *fail_res, *soft_fail_res = NULL;
 
 	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
 		/* Don't account this! */
@@ -996,20 +1299,23 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	VM_BUG_ON(css_is_removed(&mem->css));
 
 	while (1) {
-		int ret;
-		bool noswap = false;
+		int ret = 0;
+		unsigned long flags = 0;
 
-		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+		if (mem_cgroup_is_root(mem))
+			goto done;
+		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
+						&soft_fail_res);
 		if (likely(!ret)) {
 			if (!do_swap_account)
 				break;
 			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-							&fail_res);
+							&fail_res, NULL);
 			if (likely(!ret))
 				break;
 			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
-			noswap = true;
+			res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									memsw);
 		} else
@@ -1020,8 +1326,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
-		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
-							noswap, false);
+		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+						gfp_mask, flags);
 		if (ret)
 			continue;
 
@@ -1046,13 +1352,24 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			goto nomem;
 		}
 	}
+	/*
+	 * Insert just the ancestor, we should trickle down to the correct
+	 * cgroup for reclaim, since the other nodes will be below their
+	 * soft limit
+	 */
+	if (soft_fail_res) {
+		mem_over_soft_limit =
+			mem_cgroup_from_res_counter(soft_fail_res, res);
+		if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
+			mem_cgroup_update_tree(mem_over_soft_limit, page);
+	}
+done:
 	return 0;
 nomem:
 	css_put(&mem->css);
 	return -ENOMEM;
 }
 
-
 /*
  * A helper function to get mem_cgroup from ID. must be called under
  * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1119,15 +1436,38 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		if (do_swap_account)
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+		if (!mem_cgroup_is_root(mem)) {
+			res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+			if (do_swap_account)
+				res_counter_uncharge(&mem->memsw, PAGE_SIZE,
+							NULL);
+		}
 		css_put(&mem->css);
 		return;
 	}
+
 	pc->mem_cgroup = mem;
+	/*
+	 * We access a page_cgroup asynchronously without lock_page_cgroup().
+	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
+	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
+	 * before USED bit, we need memory barrier here.
+	 * See mem_cgroup_add_lru_list(), etc.
+ 	 */
 	smp_wmb();
-	pc->flags = pcg_default_flags[ctype];
+	switch (ctype) {
+	case MEM_CGROUP_CHARGE_TYPE_CACHE:
+	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
+		SetPageCgroupCache(pc);
+		SetPageCgroupUsed(pc);
+		break;
+	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+		ClearPageCgroupCache(pc);
+		SetPageCgroupUsed(pc);
+		break;
+	default:
+		break;
+	}
 
 	mem_cgroup_charge_statistics(mem, pc, true);
 
@@ -1178,7 +1518,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	if (pc->mem_cgroup != from)
 		goto out;
 
-	res_counter_uncharge(&from->res, PAGE_SIZE);
+	if (!mem_cgroup_is_root(from))
+		res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
 	mem_cgroup_charge_statistics(from, pc, false);
 
 	page = pc->page;
@@ -1197,8 +1538,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 						1);
 	}
 
-	if (do_swap_account)
-		res_counter_uncharge(&from->memsw, PAGE_SIZE);
+	if (do_swap_account && !mem_cgroup_is_root(from))
+		res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
 	css_put(&from->css);
 
 	css_get(&to->css);
@@ -1238,7 +1579,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	parent = mem_cgroup_from_cont(pcg);
 
 
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
 	if (ret || !parent)
 		return ret;
 
@@ -1268,9 +1609,11 @@ uncharge:
 	/* drop extra refcnt by try_charge() */
 	css_put(&parent->css);
 	/* uncharge if move fails */
-	res_counter_uncharge(&parent->res, PAGE_SIZE);
-	if (do_swap_account)
-		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+	if (!mem_cgroup_is_root(parent)) {
+		res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
+		if (do_swap_account)
+			res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+	}
 	return ret;
 }
 
@@ -1295,7 +1638,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	prefetchw(pc);
 
 	mem = memcg;
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
 	if (ret || !mem)
 		return ret;
 
@@ -1414,14 +1757,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 	if (!mem)
 		goto charge_cur_mm;
 	*ptr = mem;
-	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
 	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
 	if (unlikely(!mm))
 		mm = &init_mm;
-	return __mem_cgroup_try_charge(mm, mask, ptr, true);
+	return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
 }
 
 static void
@@ -1459,7 +1802,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 			 * This recorded memcg can be obsolete one. So, avoid
 			 * calling css_tryget
 			 */
-			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+			if (!mem_cgroup_is_root(memcg))
+				res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
+							NULL);
+			mem_cgroup_swap_statistics(memcg, false);
 			mem_cgroup_put(memcg);
 		}
 		rcu_read_unlock();
@@ -1484,9 +1830,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 		return;
 	if (!mem)
 		return;
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
-	if (do_swap_account)
-		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	if (!mem_cgroup_is_root(mem)) {
+		res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+		if (do_swap_account)
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+	}
 	css_put(&mem->css);
 }
 
@@ -1500,6 +1848,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
 	struct mem_cgroup_per_zone *mz;
+	bool soft_limit_excess = false;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1538,9 +1887,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
-	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	if (!mem_cgroup_is_root(mem)) {
+		res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
+		if (do_swap_account &&
+				(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+	}
+	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
@@ -1554,6 +1908,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	mz = page_cgroup_zoneinfo(pc);
 	unlock_page_cgroup(pc);
 
+	if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
+		mem_cgroup_update_tree(mem, page);
 	/* at swapout, this memcg will be accessed to record to swap */
 	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		css_put(&mem->css);
@@ -1629,7 +1985,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
 		 * We uncharge this because swap is freed.
 		 * This memcg can be obsolete one. We avoid calling css_tryget
 		 */
-		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		if (!mem_cgroup_is_root(memcg))
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+		mem_cgroup_swap_statistics(memcg, false);
 		mem_cgroup_put(memcg);
 	}
 	rcu_read_unlock();
@@ -1658,7 +2016,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
 	unlock_page_cgroup(pc);
 
 	if (mem) {
-		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+						page);
 		css_put(&mem->css);
 	}
 	*ptr = mem;
@@ -1798,8 +2157,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
-						   false, true);
+		progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
+						GFP_KERNEL,
+						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
   		if (curusage >= oldusage)
@@ -1851,7 +2211,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true);
+		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
+						MEM_CGROUP_RECLAIM_NOSWAP |
+						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
 		/* Usage is reduced ? */
 		if (curusage >= oldusage)
@@ -1862,6 +2224,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask, int nid,
+						int zid)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(nid, zid);
+	/*
+	 * This loop can run a while, specially if mem_cgroup's continuously
+	 * keep exceeding their soft limit and putting the system under
+	 * pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
+						gfp_mask,
+						MEM_CGROUP_RECLAIM_SOFT);
+		nr_reclaimed += reclaimed;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have aded the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz) {
+					css_put(&next_mz->mem->css);
+					next_mz = NULL;
+				} else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		mz->usage_in_excess =
+			res_counter_soft_limit_excess(&mz->mem->res);
+		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because due
+		 * to priority we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		if (mz->usage_in_excess)
+			__mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->mem->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->mem->css);
+	return nr_reclaimed;
+}
+
 /*
  * This routine traverse page_cgroup in given list and drop them all.
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -2046,20 +2499,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 	return retval;
 }
 
+struct mem_cgroup_idx_data {
+	s64 val;
+	enum mem_cgroup_stat_index idx;
+};
+
+static int
+mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
+{
+	struct mem_cgroup_idx_data *d = data;
+	d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
+	return 0;
+}
+
+static void
+mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
+				enum mem_cgroup_stat_index idx, s64 *val)
+{
+	struct mem_cgroup_idx_data d;
+	d.idx = idx;
+	d.val = 0;
+	mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
+	*val = d.val;
+}
+
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
-	u64 val = 0;
+	u64 idx_val, val;
 	int type, name;
 
 	type = MEMFILE_TYPE(cft->private);
 	name = MEMFILE_ATTR(cft->private);
 	switch (type) {
 	case _MEM:
-		val = res_counter_read_u64(&mem->res, name);
+		if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+			mem_cgroup_get_recursive_idx_stat(mem,
+				MEM_CGROUP_STAT_CACHE, &idx_val);
+			val = idx_val;
+			mem_cgroup_get_recursive_idx_stat(mem,
+				MEM_CGROUP_STAT_RSS, &idx_val);
+			val += idx_val;
+			val <<= PAGE_SHIFT;
+		} else
+			val = res_counter_read_u64(&mem->res, name);
 		break;
 	case _MEMSWAP:
-		val = res_counter_read_u64(&mem->memsw, name);
+		if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+			mem_cgroup_get_recursive_idx_stat(mem,
+				MEM_CGROUP_STAT_CACHE, &idx_val);
+			val = idx_val;
+			mem_cgroup_get_recursive_idx_stat(mem,
+				MEM_CGROUP_STAT_RSS, &idx_val);
+			val += idx_val;
+			mem_cgroup_get_recursive_idx_stat(mem,
+				MEM_CGROUP_STAT_SWAPOUT, &idx_val);
+			val <<= PAGE_SHIFT;
+		} else
+			val = res_counter_read_u64(&mem->memsw, name);
 		break;
 	default:
 		BUG();
@@ -2083,6 +2580,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 	name = MEMFILE_ATTR(cft->private);
 	switch (name) {
 	case RES_LIMIT:
+		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
+			ret = -EINVAL;
+			break;
+		}
 		/* This function does all necessary parse...reuse it */
 		ret = res_counter_memparse_write_strategy(buffer, &val);
 		if (ret)
@@ -2092,6 +2593,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 		else
 			ret = mem_cgroup_resize_memsw_limit(memcg, val);
 		break;
+	case RES_SOFT_LIMIT:
+		ret = res_counter_memparse_write_strategy(buffer, &val);
+		if (ret)
+			break;
+		/*
+		 * For memsw, soft limits are hard to implement in terms
+		 * of semantics, for now, we support soft limits for
+		 * control without swap
+		 */
+		if (type == _MEM)
+			ret = res_counter_set_soft_limit(&memcg->res, val);
+		else
+			ret = -EINVAL;
+		break;
 	default:
 		ret = -EINVAL; /* should be BUG() ? */
 		break;
@@ -2149,6 +2664,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 			res_counter_reset_failcnt(&mem->memsw);
 		break;
 	}
+
 	return 0;
 }
 
@@ -2160,6 +2676,7 @@ enum {
 	MCS_MAPPED_FILE,
 	MCS_PGPGIN,
 	MCS_PGPGOUT,
+	MCS_SWAP,
 	MCS_INACTIVE_ANON,
 	MCS_ACTIVE_ANON,
 	MCS_INACTIVE_FILE,
@@ -2181,6 +2698,7 @@ struct {
 	{"mapped_file", "total_mapped_file"},
 	{"pgpgin", "total_pgpgin"},
 	{"pgpgout", "total_pgpgout"},
+	{"swap", "total_swap"},
 	{"inactive_anon", "total_inactive_anon"},
 	{"active_anon", "total_active_anon"},
 	{"inactive_file", "total_inactive_file"},
@@ -2205,6 +2723,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
 	s->stat[MCS_PGPGIN] += val;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
 	s->stat[MCS_PGPGOUT] += val;
+	if (do_swap_account) {
+		val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
+		s->stat[MCS_SWAP] += val * PAGE_SIZE;
+	}
 
 	/* per zone stat */
 	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -2236,8 +2758,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 	memset(&mystat, 0, sizeof(mystat));
 	mem_cgroup_get_local_stat(mem_cont, &mystat);
 
-	for (i = 0; i < NR_MCS_STAT; i++)
+	for (i = 0; i < NR_MCS_STAT; i++) {
+		if (i == MCS_SWAP && !do_swap_account)
+			continue;
 		cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
+	}
 
 	/* Hierarchical information */
 	{
@@ -2250,9 +2775,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 
 	memset(&mystat, 0, sizeof(mystat));
 	mem_cgroup_get_total_stat(mem_cont, &mystat);
-	for (i = 0; i < NR_MCS_STAT; i++)
+	for (i = 0; i < NR_MCS_STAT; i++) {
+		if (i == MCS_SWAP && !do_swap_account)
+			continue;
 		cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
-
+	}
 
 #ifdef CONFIG_DEBUG_VM
 	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
@@ -2345,6 +2872,12 @@ static struct cftype mem_cgroup_files[] = {
 		.read_u64 = mem_cgroup_read,
 	},
 	{
+		.name = "soft_limit_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
+		.write_string = mem_cgroup_write,
+		.read_u64 = mem_cgroup_read,
+	},
+	{
 		.name = "failcnt",
 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
@@ -2438,6 +2971,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 		mz = &pn->zoneinfo[zone];
 		for_each_lru(l)
 			INIT_LIST_HEAD(&mz->lists[l]);
+		mz->usage_in_excess = 0;
+		mz->on_tree = false;
+		mz->mem = mem;
 	}
 	return 0;
 }
@@ -2483,6 +3019,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
 {
 	int node;
 
+	mem_cgroup_remove_from_trees(mem);
 	free_css_id(&mem_cgroup_subsys, &mem->css);
 
 	for_each_node_state(node, N_POSSIBLE)
@@ -2531,6 +3068,31 @@ static void __init enable_swap_cgroup(void)
 }
 #endif
 
+static int mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node_state(node, N_POSSIBLE) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		if (!rtpn)
+			return 1;
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+	return 0;
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
@@ -2545,10 +3107,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	for_each_node_state(node, N_POSSIBLE)
 		if (alloc_mem_cgroup_per_zone_info(mem, node))
 			goto free_out;
+
 	/* root ? */
 	if (cont->parent == NULL) {
 		enable_swap_cgroup();
 		parent = NULL;
+		root_mem_cgroup = mem;
+		if (mem_cgroup_soft_limit_tree_init())
+			goto free_out;
+
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
@@ -2577,6 +3144,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	return &mem->css;
 free_out:
 	__mem_cgroup_free(mem);
+	root_mem_cgroup = NULL;
 	return ERR_PTR(error);
 }
 
@@ -2612,7 +3180,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct cgroup *cont,
 				struct cgroup *old_cont,
-				struct task_struct *p)
+				struct task_struct *p,
+				bool threadgroup)
 {
 	mutex_lock(&memcg_tasklist);
 	/*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644
index 00000000000..729d4b15b64
--- /dev/null
+++ b/mm/memory-failure.c
@@ -0,0 +1,832 @@
+/*
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Authors: Andi Kleen, Fengguang Wu
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 only as published by the
+ * Free Software Foundation.
+ *
+ * High level machine check handler. Handles pages reported by the
+ * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * failure.
+ *
+ * Handles page cache pages in various states.	The tricky part
+ * here is that we can access any page asynchronous to other VM
+ * users, because memory failures could happen anytime and anywhere,
+ * possibly violating some of their assumptions. This is why this code
+ * has to be extremely careful. Generally it tries to use normal locking
+ * rules, as in get the standard locks, even if that means the
+ * error handling takes potentially a long time.
+ *
+ * The operation to map back from RMAP chains to processes has to walk
+ * the complete process list and has non linear complexity with the number
+ * mappings. In short it can be quite slow. But since memory corruptions
+ * are rare we hope to get away with this.
+ */
+
+/*
+ * Notebook:
+ * - hugetlb needs more code
+ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
+ * - pass bad pages to kdump next kernel
+ */
+#define DEBUG 1		/* remove me in 2.6.34 */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/sched.h>
+#include <linux/rmap.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/backing-dev.h>
+#include "internal.h"
+
+int sysctl_memory_failure_early_kill __read_mostly = 0;
+
+int sysctl_memory_failure_recovery __read_mostly = 1;
+
+atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
+
+/*
+ * Send all the processes who have the page mapped an ``action optional''
+ * signal.
+ */
+static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
+			unsigned long pfn)
+{
+	struct siginfo si;
+	int ret;
+
+	printk(KERN_ERR
+		"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
+		pfn, t->comm, t->pid);
+	si.si_signo = SIGBUS;
+	si.si_errno = 0;
+	si.si_code = BUS_MCEERR_AO;
+	si.si_addr = (void *)addr;
+#ifdef __ARCH_SI_TRAPNO
+	si.si_trapno = trapno;
+#endif
+	si.si_addr_lsb = PAGE_SHIFT;
+	/*
+	 * Don't use force here, it's convenient if the signal
+	 * can be temporarily blocked.
+	 * This could cause a loop when the user sets SIGBUS
+	 * to SIG_IGN, but hopefully noone will do that?
+	 */
+	ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
+	if (ret < 0)
+		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
+		       t->comm, t->pid, ret);
+	return ret;
+}
+
+/*
+ * Kill all processes that have a poisoned page mapped and then isolate
+ * the page.
+ *
+ * General strategy:
+ * Find all processes having the page mapped and kill them.
+ * But we keep a page reference around so that the page is not
+ * actually freed yet.
+ * Then stash the page away
+ *
+ * There's no convenient way to get back to mapped processes
+ * from the VMAs. So do a brute-force search over all
+ * running processes.
+ *
+ * Remember that machine checks are not common (or rather
+ * if they are common you have other problems), so this shouldn't
+ * be a performance issue.
+ *
+ * Also there are some races possible while we get from the
+ * error detection to actually handle it.
+ */
+
+struct to_kill {
+	struct list_head nd;
+	struct task_struct *tsk;
+	unsigned long addr;
+	unsigned addr_valid:1;
+};
+
+/*
+ * Failure handling: if we can't find or can't kill a process there's
+ * not much we can do.	We just print a message and ignore otherwise.
+ */
+
+/*
+ * Schedule a process for later kill.
+ * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ * TBD would GFP_NOIO be enough?
+ */
+static void add_to_kill(struct task_struct *tsk, struct page *p,
+		       struct vm_area_struct *vma,
+		       struct list_head *to_kill,
+		       struct to_kill **tkc)
+{
+	struct to_kill *tk;
+
+	if (*tkc) {
+		tk = *tkc;
+		*tkc = NULL;
+	} else {
+		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
+		if (!tk) {
+			printk(KERN_ERR
+		"MCE: Out of memory while machine check handling\n");
+			return;
+		}
+	}
+	tk->addr = page_address_in_vma(p, vma);
+	tk->addr_valid = 1;
+
+	/*
+	 * In theory we don't have to kill when the page was
+	 * munmaped. But it could be also a mremap. Since that's
+	 * likely very rare kill anyways just out of paranoia, but use
+	 * a SIGKILL because the error is not contained anymore.
+	 */
+	if (tk->addr == -EFAULT) {
+		pr_debug("MCE: Unable to find user space address %lx in %s\n",
+			page_to_pfn(p), tsk->comm);
+		tk->addr_valid = 0;
+	}
+	get_task_struct(tsk);
+	tk->tsk = tsk;
+	list_add_tail(&tk->nd, to_kill);
+}
+
+/*
+ * Kill the processes that have been collected earlier.
+ *
+ * Only do anything when DOIT is set, otherwise just free the list
+ * (this is used for clean pages which do not need killing)
+ * Also when FAIL is set do a force kill because something went
+ * wrong earlier.
+ */
+static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
+			  int fail, unsigned long pfn)
+{
+	struct to_kill *tk, *next;
+
+	list_for_each_entry_safe (tk, next, to_kill, nd) {
+		if (doit) {
+			/*
+			 * In case something went wrong with munmaping
+			 * make sure the process doesn't catch the
+			 * signal and then access the memory. Just kill it.
+			 * the signal handlers
+			 */
+			if (fail || tk->addr_valid == 0) {
+				printk(KERN_ERR
+		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+					pfn, tk->tsk->comm, tk->tsk->pid);
+				force_sig(SIGKILL, tk->tsk);
+			}
+
+			/*
+			 * In theory the process could have mapped
+			 * something else on the address in-between. We could
+			 * check for that, but we need to tell the
+			 * process anyways.
+			 */
+			else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
+					      pfn) < 0)
+				printk(KERN_ERR
+		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
+					pfn, tk->tsk->comm, tk->tsk->pid);
+		}
+		put_task_struct(tk->tsk);
+		kfree(tk);
+	}
+}
+
+static int task_early_kill(struct task_struct *tsk)
+{
+	if (!tsk->mm)
+		return 0;
+	if (tsk->flags & PF_MCE_PROCESS)
+		return !!(tsk->flags & PF_MCE_EARLY);
+	return sysctl_memory_failure_early_kill;
+}
+
+/*
+ * Collect processes when the error hit an anonymous page.
+ */
+static void collect_procs_anon(struct page *page, struct list_head *to_kill,
+			      struct to_kill **tkc)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+	struct anon_vma *av;
+
+	read_lock(&tasklist_lock);
+	av = page_lock_anon_vma(page);
+	if (av == NULL)	/* Not actually mapped anymore */
+		goto out;
+	for_each_process (tsk) {
+		if (!task_early_kill(tsk))
+			continue;
+		list_for_each_entry (vma, &av->head, anon_vma_node) {
+			if (!page_mapped_in_vma(page, vma))
+				continue;
+			if (vma->vm_mm == tsk->mm)
+				add_to_kill(tsk, page, vma, to_kill, tkc);
+		}
+	}
+	page_unlock_anon_vma(av);
+out:
+	read_unlock(&tasklist_lock);
+}
+
+/*
+ * Collect processes when the error hit a file mapped page.
+ */
+static void collect_procs_file(struct page *page, struct list_head *to_kill,
+			      struct to_kill **tkc)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+	struct prio_tree_iter iter;
+	struct address_space *mapping = page->mapping;
+
+	/*
+	 * A note on the locking order between the two locks.
+	 * We don't rely on this particular order.
+	 * If you have some other code that needs a different order
+	 * feel free to switch them around. Or add a reverse link
+	 * from mm_struct to task_struct, then this could be all
+	 * done without taking tasklist_lock and looping over all tasks.
+	 */
+
+	read_lock(&tasklist_lock);
+	spin_lock(&mapping->i_mmap_lock);
+	for_each_process(tsk) {
+		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+		if (!task_early_kill(tsk))
+			continue;
+
+		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
+				      pgoff) {
+			/*
+			 * Send early kill signal to tasks where a vma covers
+			 * the page but the corrupted page is not necessarily
+			 * mapped it in its pte.
+			 * Assume applications who requested early kill want
+			 * to be informed of all such data corruptions.
+			 */
+			if (vma->vm_mm == tsk->mm)
+				add_to_kill(tsk, page, vma, to_kill, tkc);
+		}
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+	read_unlock(&tasklist_lock);
+}
+
+/*
+ * Collect the processes who have the corrupted page mapped to kill.
+ * This is done in two steps for locking reasons.
+ * First preallocate one tokill structure outside the spin locks,
+ * so that we can kill at least one process reasonably reliable.
+ */
+static void collect_procs(struct page *page, struct list_head *tokill)
+{
+	struct to_kill *tk;
+
+	if (!page->mapping)
+		return;
+
+	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
+	if (!tk)
+		return;
+	if (PageAnon(page))
+		collect_procs_anon(page, tokill, &tk);
+	else
+		collect_procs_file(page, tokill, &tk);
+	kfree(tk);
+}
+
+/*
+ * Error handlers for various types of pages.
+ */
+
+enum outcome {
+	FAILED,		/* Error handling failed */
+	DELAYED,	/* Will be handled later */
+	IGNORED,	/* Error safely ignored */
+	RECOVERED,	/* Successfully recovered */
+};
+
+static const char *action_name[] = {
+	[FAILED] = "Failed",
+	[DELAYED] = "Delayed",
+	[IGNORED] = "Ignored",
+	[RECOVERED] = "Recovered",
+};
+
+/*
+ * Error hit kernel page.
+ * Do nothing, try to be lucky and not touch this instead. For a few cases we
+ * could be more sophisticated.
+ */
+static int me_kernel(struct page *p, unsigned long pfn)
+{
+	return DELAYED;
+}
+
+/*
+ * Already poisoned page.
+ */
+static int me_ignore(struct page *p, unsigned long pfn)
+{
+	return IGNORED;
+}
+
+/*
+ * Page in unknown state. Do nothing.
+ */
+static int me_unknown(struct page *p, unsigned long pfn)
+{
+	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
+	return FAILED;
+}
+
+/*
+ * Free memory
+ */
+static int me_free(struct page *p, unsigned long pfn)
+{
+	return DELAYED;
+}
+
+/*
+ * Clean (or cleaned) page cache page.
+ */
+static int me_pagecache_clean(struct page *p, unsigned long pfn)
+{
+	int err;
+	int ret = FAILED;
+	struct address_space *mapping;
+
+	if (!isolate_lru_page(p))
+		page_cache_release(p);
+
+	/*
+	 * For anonymous pages we're done the only reference left
+	 * should be the one m_f() holds.
+	 */
+	if (PageAnon(p))
+		return RECOVERED;
+
+	/*
+	 * Now truncate the page in the page cache. This is really
+	 * more like a "temporary hole punch"
+	 * Don't do this for block devices when someone else
+	 * has a reference, because it could be file system metadata
+	 * and that's not safe to truncate.
+	 */
+	mapping = page_mapping(p);
+	if (!mapping) {
+		/*
+		 * Page has been teared down in the meanwhile
+		 */
+		return FAILED;
+	}
+
+	/*
+	 * Truncation is a bit tricky. Enable it per file system for now.
+	 *
+	 * Open: to take i_mutex or not for this? Right now we don't.
+	 */
+	if (mapping->a_ops->error_remove_page) {
+		err = mapping->a_ops->error_remove_page(mapping, p);
+		if (err != 0) {
+			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
+					pfn, err);
+		} else if (page_has_private(p) &&
+				!try_to_release_page(p, GFP_NOIO)) {
+			pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+		} else {
+			ret = RECOVERED;
+		}
+	} else {
+		/*
+		 * If the file system doesn't support it just invalidate
+		 * This fails on dirty or anything with private pages
+		 */
+		if (invalidate_inode_page(p))
+			ret = RECOVERED;
+		else
+			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
+				pfn);
+	}
+	return ret;
+}
+
+/*
+ * Dirty cache page page
+ * Issues: when the error hit a hole page the error is not properly
+ * propagated.
+ */
+static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+{
+	struct address_space *mapping = page_mapping(p);
+
+	SetPageError(p);
+	/* TBD: print more information about the file. */
+	if (mapping) {
+		/*
+		 * IO error will be reported by write(), fsync(), etc.
+		 * who check the mapping.
+		 * This way the application knows that something went
+		 * wrong with its dirty file data.
+		 *
+		 * There's one open issue:
+		 *
+		 * The EIO will be only reported on the next IO
+		 * operation and then cleared through the IO map.
+		 * Normally Linux has two mechanisms to pass IO error
+		 * first through the AS_EIO flag in the address space
+		 * and then through the PageError flag in the page.
+		 * Since we drop pages on memory failure handling the
+		 * only mechanism open to use is through AS_AIO.
+		 *
+		 * This has the disadvantage that it gets cleared on
+		 * the first operation that returns an error, while
+		 * the PageError bit is more sticky and only cleared
+		 * when the page is reread or dropped.  If an
+		 * application assumes it will always get error on
+		 * fsync, but does other operations on the fd before
+		 * and the page is dropped inbetween then the error
+		 * will not be properly reported.
+		 *
+		 * This can already happen even without hwpoisoned
+		 * pages: first on metadata IO errors (which only
+		 * report through AS_EIO) or when the page is dropped
+		 * at the wrong time.
+		 *
+		 * So right now we assume that the application DTRT on
+		 * the first EIO, but we're not worse than other parts
+		 * of the kernel.
+		 */
+		mapping_set_error(mapping, EIO);
+	}
+
+	return me_pagecache_clean(p, pfn);
+}
+
+/*
+ * Clean and dirty swap cache.
+ *
+ * Dirty swap cache page is tricky to handle. The page could live both in page
+ * cache and swap cache(ie. page is freshly swapped in). So it could be
+ * referenced concurrently by 2 types of PTEs:
+ * normal PTEs and swap PTEs. We try to handle them consistently by calling
+ * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
+ * and then
+ *      - clear dirty bit to prevent IO
+ *      - remove from LRU
+ *      - but keep in the swap cache, so that when we return to it on
+ *        a later page fault, we know the application is accessing
+ *        corrupted data and shall be killed (we installed simple
+ *        interception code in do_swap_page to catch it).
+ *
+ * Clean swap cache pages can be directly isolated. A later page fault will
+ * bring in the known good data from disk.
+ */
+static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+{
+	int ret = FAILED;
+
+	ClearPageDirty(p);
+	/* Trigger EIO in shmem: */
+	ClearPageUptodate(p);
+
+	if (!isolate_lru_page(p)) {
+		page_cache_release(p);
+		ret = DELAYED;
+	}
+
+	return ret;
+}
+
+static int me_swapcache_clean(struct page *p, unsigned long pfn)
+{
+	int ret = FAILED;
+
+	if (!isolate_lru_page(p)) {
+		page_cache_release(p);
+		ret = RECOVERED;
+	}
+	delete_from_swap_cache(p);
+	return ret;
+}
+
+/*
+ * Huge pages. Needs work.
+ * Issues:
+ * No rmap support so we cannot find the original mapper. In theory could walk
+ * all MMs and look for the mappings, but that would be non atomic and racy.
+ * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
+ * like just walking the current process and hoping it has it mapped (that
+ * should be usually true for the common "shared database cache" case)
+ * Should handle free huge pages and dequeue them too, but this needs to
+ * handle huge page accounting correctly.
+ */
+static int me_huge_page(struct page *p, unsigned long pfn)
+{
+	return FAILED;
+}
+
+/*
+ * Various page states we can handle.
+ *
+ * A page state is defined by its current page->flags bits.
+ * The table matches them in order and calls the right handler.
+ *
+ * This is quite tricky because we can access page at any time
+ * in its live cycle, so all accesses have to be extremly careful.
+ *
+ * This is not complete. More states could be added.
+ * For any missing state don't attempt recovery.
+ */
+
+#define dirty		(1UL << PG_dirty)
+#define sc		(1UL << PG_swapcache)
+#define unevict		(1UL << PG_unevictable)
+#define mlock		(1UL << PG_mlocked)
+#define writeback	(1UL << PG_writeback)
+#define lru		(1UL << PG_lru)
+#define swapbacked	(1UL << PG_swapbacked)
+#define head		(1UL << PG_head)
+#define tail		(1UL << PG_tail)
+#define compound	(1UL << PG_compound)
+#define slab		(1UL << PG_slab)
+#define buddy		(1UL << PG_buddy)
+#define reserved	(1UL << PG_reserved)
+
+static struct page_state {
+	unsigned long mask;
+	unsigned long res;
+	char *msg;
+	int (*action)(struct page *p, unsigned long pfn);
+} error_states[] = {
+	{ reserved,	reserved,	"reserved kernel",	me_ignore },
+	{ buddy,	buddy,		"free kernel",	me_free },
+
+	/*
+	 * Could in theory check if slab page is free or if we can drop
+	 * currently unused objects without touching them. But just
+	 * treat it as standard kernel for now.
+	 */
+	{ slab,		slab,		"kernel slab",	me_kernel },
+
+#ifdef CONFIG_PAGEFLAGS_EXTENDED
+	{ head,		head,		"huge",		me_huge_page },
+	{ tail,		tail,		"huge",		me_huge_page },
+#else
+	{ compound,	compound,	"huge",		me_huge_page },
+#endif
+
+	{ sc|dirty,	sc|dirty,	"swapcache",	me_swapcache_dirty },
+	{ sc|dirty,	sc,		"swapcache",	me_swapcache_clean },
+
+	{ unevict|dirty, unevict|dirty,	"unevictable LRU", me_pagecache_dirty},
+	{ unevict,	unevict,	"unevictable LRU", me_pagecache_clean},
+
+#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
+	{ mlock|dirty,	mlock|dirty,	"mlocked LRU",	me_pagecache_dirty },
+	{ mlock,	mlock,		"mlocked LRU",	me_pagecache_clean },
+#endif
+
+	{ lru|dirty,	lru|dirty,	"LRU",		me_pagecache_dirty },
+	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },
+	{ swapbacked,	swapbacked,	"anonymous",	me_pagecache_clean },
+
+	/*
+	 * Catchall entry: must be at end.
+	 */
+	{ 0,		0,		"unknown page state",	me_unknown },
+};
+
+#undef lru
+
+static void action_result(unsigned long pfn, char *msg, int result)
+{
+	struct page *page = NULL;
+	if (pfn_valid(pfn))
+		page = pfn_to_page(pfn);
+
+	printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
+		pfn,
+		page && PageDirty(page) ? "dirty " : "",
+		msg, action_name[result]);
+}
+
+static int page_action(struct page_state *ps, struct page *p,
+			unsigned long pfn, int ref)
+{
+	int result;
+
+	result = ps->action(p, pfn);
+	action_result(pfn, ps->msg, result);
+	if (page_count(p) != 1 + ref)
+		printk(KERN_ERR
+		       "MCE %#lx: %s page still referenced by %d users\n",
+		       pfn, ps->msg, page_count(p) - 1);
+
+	/* Could do more checks here if page looks ok */
+	/*
+	 * Could adjust zone counters here to correct for the missing page.
+	 */
+
+	return result == RECOVERED ? 0 : -EBUSY;
+}
+
+#define N_UNMAP_TRIES 5
+
+/*
+ * Do all that is necessary to remove user space mappings. Unmap
+ * the pages and send SIGBUS to the processes if the data was dirty.
+ */
+static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
+				  int trapno)
+{
+	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+	struct address_space *mapping;
+	LIST_HEAD(tokill);
+	int ret;
+	int i;
+	int kill = 1;
+
+	if (PageReserved(p) || PageCompound(p) || PageSlab(p))
+		return;
+
+	if (!PageLRU(p))
+		lru_add_drain_all();
+
+	/*
+	 * This check implies we don't kill processes if their pages
+	 * are in the swap cache early. Those are always late kills.
+	 */
+	if (!page_mapped(p))
+		return;
+
+	if (PageSwapCache(p)) {
+		printk(KERN_ERR
+		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
+		ttu |= TTU_IGNORE_HWPOISON;
+	}
+
+	/*
+	 * Propagate the dirty bit from PTEs to struct page first, because we
+	 * need this to decide if we should kill or just drop the page.
+	 */
+	mapping = page_mapping(p);
+	if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
+		if (page_mkclean(p)) {
+			SetPageDirty(p);
+		} else {
+			kill = 0;
+			ttu |= TTU_IGNORE_HWPOISON;
+			printk(KERN_INFO
+	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
+				pfn);
+		}
+	}
+
+	/*
+	 * First collect all the processes that have the page
+	 * mapped in dirty form.  This has to be done before try_to_unmap,
+	 * because ttu takes the rmap data structures down.
+	 *
+	 * Error handling: We ignore errors here because
+	 * there's nothing that can be done.
+	 */
+	if (kill)
+		collect_procs(p, &tokill);
+
+	/*
+	 * try_to_unmap can fail temporarily due to races.
+	 * Try a few times (RED-PEN better strategy?)
+	 */
+	for (i = 0; i < N_UNMAP_TRIES; i++) {
+		ret = try_to_unmap(p, ttu);
+		if (ret == SWAP_SUCCESS)
+			break;
+		pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn,  ret);
+	}
+
+	if (ret != SWAP_SUCCESS)
+		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
+				pfn, page_mapcount(p));
+
+	/*
+	 * Now that the dirty bit has been propagated to the
+	 * struct page and all unmaps done we can decide if
+	 * killing is needed or not.  Only kill when the page
+	 * was dirty, otherwise the tokill list is merely
+	 * freed.  When there was a problem unmapping earlier
+	 * use a more force-full uncatchable kill to prevent
+	 * any accesses to the poisoned memory.
+	 */
+	kill_procs_ao(&tokill, !!PageDirty(p), trapno,
+		      ret != SWAP_SUCCESS, pfn);
+}
+
+int __memory_failure(unsigned long pfn, int trapno, int ref)
+{
+	struct page_state *ps;
+	struct page *p;
+	int res;
+
+	if (!sysctl_memory_failure_recovery)
+		panic("Memory failure from trap %d on page %lx", trapno, pfn);
+
+	if (!pfn_valid(pfn)) {
+		action_result(pfn, "memory outside kernel control", IGNORED);
+		return -EIO;
+	}
+
+	p = pfn_to_page(pfn);
+	if (TestSetPageHWPoison(p)) {
+		action_result(pfn, "already hardware poisoned", IGNORED);
+		return 0;
+	}
+
+	atomic_long_add(1, &mce_bad_pages);
+
+	/*
+	 * We need/can do nothing about count=0 pages.
+	 * 1) it's a free page, and therefore in safe hand:
+	 *    prep_new_page() will be the gate keeper.
+	 * 2) it's part of a non-compound high order page.
+	 *    Implies some kernel user: cannot stop them from
+	 *    R/W the page; let's pray that the page has been
+	 *    used and will be freed some time later.
+	 * In fact it's dangerous to directly bump up page count from 0,
+	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
+	 */
+	if (!get_page_unless_zero(compound_head(p))) {
+		action_result(pfn, "free or high order kernel", IGNORED);
+		return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
+	}
+
+	/*
+	 * Lock the page and wait for writeback to finish.
+	 * It's very difficult to mess with pages currently under IO
+	 * and in many cases impossible, so we just avoid it here.
+	 */
+	lock_page_nosync(p);
+	wait_on_page_writeback(p);
+
+	/*
+	 * Now take care of user space mappings.
+	 */
+	hwpoison_user_mappings(p, pfn, trapno);
+
+	/*
+	 * Torn down by someone else?
+	 */
+	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+		action_result(pfn, "already truncated LRU", IGNORED);
+		res = 0;
+		goto out;
+	}
+
+	res = -EBUSY;
+	for (ps = error_states;; ps++) {
+		if ((p->flags & ps->mask) == ps->res) {
+			res = page_action(ps, p, pfn, ref);
+			break;
+		}
+	}
+out:
+	unlock_page(p);
+	return res;
+}
+EXPORT_SYMBOL_GPL(__memory_failure);
+
+/**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks hold.
+ */
+void memory_failure(unsigned long pfn, int trapno)
+{
+	__memory_failure(pfn, trapno, 0);
+}
diff --git a/mm/memory.c b/mm/memory.c
index b1443ac07c0..7e91b5f9f69 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -297,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		unsigned long addr = vma->vm_start;
 
 		/*
-		 * Hide vma from rmap and vmtruncate before freeing pgtables
+		 * Hide vma from rmap and truncate_pagecache before freeing
+		 * pgtables
 		 */
 		anon_vma_unlink(vma);
 		unlink_file_vma(vma);
@@ -1325,7 +1326,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				if (ret & VM_FAULT_ERROR) {
 					if (ret & VM_FAULT_OOM)
 						return i ? i : -ENOMEM;
-					else if (ret & VM_FAULT_SIGBUS)
+					if (ret &
+					    (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
 						return i ? i : -EFAULT;
 					BUG();
 				}
@@ -2407,7 +2409,7 @@ restart:
  * @mapping: the address space containing mmaps to be unmapped.
  * @holebegin: byte in first page to unmap, relative to the start of
  * the underlying file.  This will be rounded down to a PAGE_SIZE
- * boundary.  Note that this is different from vmtruncate(), which
+ * boundary.  Note that this is different from truncate_pagecache(), which
  * must keep the partial page.  In contrast, we must get rid of
  * partial pages.
  * @holelen: size of prospective hole in bytes.  This will be rounded
@@ -2458,63 +2460,6 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-/**
- * vmtruncate - unmap mappings "freed" by truncate() syscall
- * @inode: inode of the file used
- * @offset: file offset to start truncating
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page.  Ugly, but necessary.
- */
-int vmtruncate(struct inode * inode, loff_t offset)
-{
-	if (inode->i_size < offset) {
-		unsigned long limit;
-
-		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-		if (limit != RLIM_INFINITY && offset > limit)
-			goto out_sig;
-		if (offset > inode->i_sb->s_maxbytes)
-			goto out_big;
-		i_size_write(inode, offset);
-	} else {
-		struct address_space *mapping = inode->i_mapping;
-
-		/*
-		 * truncation of in-use swapfiles is disallowed - it would
-		 * cause subsequent swapout to scribble on the now-freed
-		 * blocks.
-		 */
-		if (IS_SWAPFILE(inode))
-			return -ETXTBSY;
-		i_size_write(inode, offset);
-
-		/*
-		 * unmap_mapping_range is called twice, first simply for
-		 * efficiency so that truncate_inode_pages does fewer
-		 * single-page unmaps.  However after this first call, and
-		 * before truncate_inode_pages finishes, it is possible for
-		 * private pages to be COWed, which remain after
-		 * truncate_inode_pages finishes, hence the second
-		 * unmap_mapping_range call must be made for correctness.
-		 */
-		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-		truncate_inode_pages(mapping, offset);
-		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-	}
-
-	if (inode->i_op->truncate)
-		inode->i_op->truncate(inode);
-	return 0;
-
-out_sig:
-	send_sig(SIGXFSZ, current, 0);
-out_big:
-	return -EFBIG;
-}
-EXPORT_SYMBOL(vmtruncate);
-
 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 {
 	struct address_space *mapping = inode->i_mapping;
@@ -2559,8 +2504,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
-	if (is_migration_entry(entry)) {
-		migration_entry_wait(mm, pmd, address);
+	if (unlikely(non_swap_entry(entry))) {
+		if (is_migration_entry(entry)) {
+			migration_entry_wait(mm, pmd, address);
+		} else if (is_hwpoison_entry(entry)) {
+			ret = VM_FAULT_HWPOISON;
+		} else {
+			print_bad_pte(vma, address, orig_pte, NULL);
+			ret = VM_FAULT_OOM;
+		}
 		goto out;
 	}
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
@@ -2584,6 +2536,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Had to read the page from swap area: Major fault */
 		ret = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
+	} else if (PageHWPoison(page)) {
+		ret = VM_FAULT_HWPOISON;
+		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+		goto out;
 	}
 
 	lock_page(page);
@@ -2760,6 +2716,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
 		return ret;
 
+	if (unlikely(PageHWPoison(vmf.page))) {
+		if (ret & VM_FAULT_LOCKED)
+			unlock_page(vmf.page);
+		return VM_FAULT_HWPOISON;
+	}
+
 	/*
 	 * For consistency in subsequent calls, make the faulted page always
 	 * locked.
diff --git a/mm/migrate.c b/mm/migrate.c
index 16052e80aaa..1a4bf481378 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -675,7 +675,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	}
 
 	/* Establish migration ptes or remove ptes */
-	try_to_unmap(page, 1);
+	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 
 skip_unmap:
 	if (!page_mapped(page))
diff --git a/mm/mremap.c b/mm/mremap.c
index 20a07dba6be..97bff254771 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -86,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	if (vma->vm_file) {
 		/*
 		 * Subtle point from Rajesh Venkatasubramanian: before
-		 * moving file-based ptes, we must lock vmtruncate out,
-		 * since it might clean the dst vma before the src vma,
+		 * moving file-based ptes, we must lock truncate_pagecache
+		 * out, since it might clean the dst vma before the src vma,
 		 * and we propagate stale pages into the dst afterward.
 		 */
 		mapping = vma->vm_file->f_mapping;
diff --git a/mm/nommu.c b/mm/nommu.c
index 8d484241d03..56a446f0597 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -83,46 +83,6 @@ struct vm_operations_struct generic_file_vm_ops = {
 };
 
 /*
- * Handle all mappings that got truncated by a "truncate()"
- * system call.
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page.  Ugly, but necessary.
- */
-int vmtruncate(struct inode *inode, loff_t offset)
-{
-	struct address_space *mapping = inode->i_mapping;
-	unsigned long limit;
-
-	if (inode->i_size < offset)
-		goto do_expand;
-	i_size_write(inode, offset);
-
-	truncate_inode_pages(mapping, offset);
-	goto out_truncate;
-
-do_expand:
-	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-	if (limit != RLIM_INFINITY && offset > limit)
-		goto out_sig;
-	if (offset > inode->i_sb->s_maxbytes)
-		goto out;
-	i_size_write(inode, offset);
-
-out_truncate:
-	if (inode->i_op->truncate)
-		inode->i_op->truncate(inode);
-	return 0;
-out_sig:
-	send_sig(SIGXFSZ, current, 0);
-out:
-	return -EFBIG;
-}
-
-EXPORT_SYMBOL(vmtruncate);
-
-/*
  * Return the total memory allocated for this pointer, not
  * just what the caller asked for.
  *
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5f378dd5880..d99664e8607 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -155,37 +155,37 @@ static void update_completion_period(void)
 }
 
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write)
 		dirty_background_bytes = 0;
 	return ret;
 }
 
 int dirty_background_bytes_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write)
 		dirty_background_ratio = 0;
 	return ret;
 }
 
 int dirty_ratio_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int old_ratio = vm_dirty_ratio;
 	int ret;
 
-	ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
 		update_completion_period();
 		vm_dirty_bytes = 0;
@@ -195,13 +195,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
 
 
 int dirty_bytes_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	unsigned long old_bytes = vm_dirty_bytes;
 	int ret;
 
-	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
 		update_completion_period();
 		vm_dirty_ratio = 0;
@@ -686,9 +686,9 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, buffer, length, ppos);
 	return 0;
 }
 
@@ -1149,6 +1149,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
 EXPORT_SYMBOL(redirty_page_for_writepage);
 
 /*
+ * Dirty a page.
+ *
+ * For pages with a mapping this should be done under the page lock
+ * for the benefit of asynchronous memory errors who prefer a consistent
+ * dirty state. This rule can be broken in some special cases,
+ * but should be better not to.
+ *
  * If the mapping doesn't provide a set_page_dirty a_op, then
  * just fall through and assume that it wants buffer_heads.
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5717f27a070..bf720550b44 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -234,6 +234,12 @@ static void bad_page(struct page *page)
 	static unsigned long nr_shown;
 	static unsigned long nr_unshown;
 
+	/* Don't complain about poisoned pages */
+	if (PageHWPoison(page)) {
+		__ClearPageBuddy(page);
+		return;
+	}
+
 	/*
 	 * Allow a burst of 60 reports, then keep quiet for that minute;
 	 * or allow a steady drip of one report per second.
@@ -666,7 +672,7 @@ static inline void expand(struct zone *zone, struct page *page,
 /*
  * This page is about to be returned from the page allocator
  */
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+static inline int check_new_page(struct page *page)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
@@ -675,6 +681,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 		bad_page(page);
 		return 1;
 	}
+	return 0;
+}
+
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+{
+	int i;
+
+	for (i = 0; i < (1 << order); i++) {
+		struct page *p = page + i;
+		if (unlikely(check_new_page(p)))
+			return 1;
+	}
 
 	set_page_private(page, 0);
 	set_page_refcounted(page);
@@ -2373,7 +2391,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
  * sysctl handler for numa_zonelist_order
  */
 int numa_zonelist_order_handler(ctl_table *table, int write,
-		struct file *file, void __user *buffer, size_t *length,
+		void __user *buffer, size_t *length,
 		loff_t *ppos)
 {
 	char saved_string[NUMA_ZONELIST_ORDER_LEN];
@@ -2382,7 +2400,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 	if (write)
 		strncpy(saved_string, (char*)table->data,
 			NUMA_ZONELIST_ORDER_LEN);
-	ret = proc_dostring(table, write, file, buffer, length, ppos);
+	ret = proc_dostring(table, write, buffer, length, ppos);
 	if (ret)
 		return ret;
 	if (write) {
@@ -4706,9 +4724,9 @@ module_init(init_per_zone_wmark_min)
  *	changes.
  */
 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, buffer, length, ppos);
 	if (write)
 		setup_per_zone_wmarks();
 	return 0;
@@ -4716,12 +4734,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 
 #ifdef CONFIG_NUMA
 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	struct zone *zone;
 	int rc;
 
-	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (rc)
 		return rc;
 
@@ -4732,12 +4750,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 }
 
 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	struct zone *zone;
 	int rc;
 
-	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (rc)
 		return rc;
 
@@ -4758,9 +4776,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
  * if in function of the boot time zone sizes.
  */
 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	proc_dointvec_minmax(table, write, buffer, length, ppos);
 	setup_per_zone_lowmem_reserve();
 	return 0;
 }
@@ -4772,13 +4790,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
  */
 
 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	struct zone *zone;
 	unsigned int cpu;
 	int ret;
 
-	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (!write || (ret == -EINVAL))
 		return ret;
 	for_each_populated_zone(zone) {
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 6eedf7e473d..6633965bb27 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -29,7 +29,6 @@ static unsigned long max_pages(unsigned long min_pages)
 	int node = numa_node_id();
 	struct zone *zones = NODE_DATA(node)->node_zones;
 	int num_cpus_on_node;
-	const struct cpumask *cpumask_on_node = cpumask_of_node(node);
 
 	node_free_pages =
 #ifdef CONFIG_ZONE_DMA
@@ -42,7 +41,7 @@ static unsigned long max_pages(unsigned long min_pages)
 
 	max = node_free_pages / FRACTION_OF_NODE_MEM;
 
-	num_cpus_on_node = cpus_weight_nr(*cpumask_on_node);
+	num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
 	max /= num_cpus_on_node;
 
 	return max(max, min_pages);
diff --git a/mm/rmap.c b/mm/rmap.c
index 720fc03a7bc..28aafe2b530 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,11 @@
  *                 mapping->tree_lock (widely used, in set_page_dirty,
  *                           in arch-dependent flush_dcache_mmap_lock,
  *                           within inode_lock in __sync_single_inode)
+ *
+ * (code doesn't rely on that order so it could be switched around)
+ * ->tasklist_lock
+ *   anon_vma->lock      (memory_failure, collect_procs_anon)
+ *     pte map lock
  */
 
 #include <linux/mm.h>
@@ -191,7 +196,7 @@ void __init anon_vma_init(void)
  * Getting a lock on a stable anon_vma from a page off the LRU is
  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  */
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma(struct page *page)
 {
 	struct anon_vma *anon_vma;
 	unsigned long anon_mapping;
@@ -211,7 +216,7 @@ out:
 	return NULL;
 }
 
-static void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
 	spin_unlock(&anon_vma->lock);
 	rcu_read_unlock();
@@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
  * if the page is not mapped into the page tables of this VMA.  Only
  * valid for normal file or anonymous VMAs.
  */
-static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
 {
 	unsigned long address;
 	pte_t *pte;
@@ -756,7 +761,7 @@ void page_remove_rmap(struct page *page)
  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
  */
 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
-				int migration)
+				enum ttu_flags flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
@@ -778,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if (!migration) {
+	if (!(flags & TTU_IGNORE_MLOCK)) {
 		if (vma->vm_flags & VM_LOCKED) {
 			ret = SWAP_MLOCK;
 			goto out_unmap;
 		}
+	}
+	if (!(flags & TTU_IGNORE_ACCESS)) {
 		if (ptep_clear_flush_young_notify(vma, address, pte)) {
 			ret = SWAP_FAIL;
 			goto out_unmap;
@@ -800,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	/* Update high watermark before we lower rss */
 	update_hiwater_rss(mm);
 
-	if (PageAnon(page)) {
+	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+		if (PageAnon(page))
+			dec_mm_counter(mm, anon_rss);
+		else
+			dec_mm_counter(mm, file_rss);
+		set_pte_at(mm, address, pte,
+				swp_entry_to_pte(make_hwpoison_entry(page)));
+	} else if (PageAnon(page)) {
 		swp_entry_t entry = { .val = page_private(page) };
 
 		if (PageSwapCache(page)) {
@@ -822,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 * pte. do_swap_page() will wait until the migration
 			 * pte is removed and then restart fault handling.
 			 */
-			BUG_ON(!migration);
+			BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
 			entry = make_migration_entry(page, pte_write(pteval));
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
-	} else if (PAGE_MIGRATION && migration) {
+	} else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
 		/* Establish migration entry for a file page */
 		swp_entry_t entry;
 		entry = make_migration_entry(page, pte_write(pteval));
@@ -996,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * 'LOCKED.
  */
-static int try_to_unmap_anon(struct page *page, int unlock, int migration)
+static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
 	unsigned int mlocked = 0;
 	int ret = SWAP_AGAIN;
+	int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
 
 	if (MLOCK_PAGES && unlikely(unlock))
 		ret = SWAP_SUCCESS;	/* default for try_to_munlock() */
@@ -1017,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
 				continue;  /* must visit all unlocked vmas */
 			ret = SWAP_MLOCK;  /* saw at least one mlocked vma */
 		} else {
-			ret = try_to_unmap_one(page, vma, migration);
+			ret = try_to_unmap_one(page, vma, flags);
 			if (ret == SWAP_FAIL || !page_mapped(page))
 				break;
 		}
@@ -1041,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
 /**
  * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
  * @page: the page to unmap/unlock
- * @unlock:  request for unlock rather than unmap [unlikely]
- * @migration:  unmapping for migration - ignored if @unlock
+ * @flags: action and flags
  *
  * Find all the mappings of a page using the mapping pointer and the vma chains
  * contained in the address_space struct it points to.
@@ -1054,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * 'LOCKED.
  */
-static int try_to_unmap_file(struct page *page, int unlock, int migration)
+static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 {
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1066,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
 	unsigned long max_nl_size = 0;
 	unsigned int mapcount;
 	unsigned int mlocked = 0;
+	int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
 
 	if (MLOCK_PAGES && unlikely(unlock))
 		ret = SWAP_SUCCESS;	/* default for try_to_munlock() */
@@ -1078,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
 				continue;	/* must visit all vmas */
 			ret = SWAP_MLOCK;
 		} else {
-			ret = try_to_unmap_one(page, vma, migration);
+			ret = try_to_unmap_one(page, vma, flags);
 			if (ret == SWAP_FAIL || !page_mapped(page))
 				goto out;
 		}
@@ -1103,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
 			ret = SWAP_MLOCK;	/* leave mlocked == 0 */
 			goto out;		/* no need to look further */
 		}
-		if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
+		if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
+			(vma->vm_flags & VM_LOCKED))
 			continue;
 		cursor = (unsigned long) vma->vm_private_data;
 		if (cursor > max_nl_cursor)
@@ -1137,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
 	do {
 		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-			if (!MLOCK_PAGES && !migration &&
+			if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
 			    (vma->vm_flags & VM_LOCKED))
 				continue;
 			cursor = (unsigned long) vma->vm_private_data;
@@ -1177,7 +1193,7 @@ out:
 /**
  * try_to_unmap - try to remove all page table mappings to a page
  * @page: the page to get unmapped
- * @migration: migration flag
+ * @flags: action and flags
  *
  * Tries to remove all the page table entries which are mapping this
  * page, used in the pageout path.  Caller must hold the page lock.
@@ -1188,16 +1204,16 @@ out:
  * SWAP_FAIL	- the page is unswappable
  * SWAP_MLOCK	- page is mlocked.
  */
-int try_to_unmap(struct page *page, int migration)
+int try_to_unmap(struct page *page, enum ttu_flags flags)
 {
 	int ret;
 
 	BUG_ON(!PageLocked(page));
 
 	if (PageAnon(page))
-		ret = try_to_unmap_anon(page, 0, migration);
+		ret = try_to_unmap_anon(page, flags);
 	else
-		ret = try_to_unmap_file(page, 0, migration);
+		ret = try_to_unmap_file(page, flags);
 	if (ret != SWAP_MLOCK && !page_mapped(page))
 		ret = SWAP_SUCCESS;
 	return ret;
@@ -1222,8 +1238,8 @@ int try_to_munlock(struct page *page)
 	VM_BUG_ON(!PageLocked(page) || PageLRU(page));
 
 	if (PageAnon(page))
-		return try_to_unmap_anon(page, 1, 0);
+		return try_to_unmap_anon(page, TTU_MUNLOCK);
 	else
-		return try_to_unmap_file(page, 1, 0);
+		return try_to_unmap_file(page, TTU_MUNLOCK);
 }
 
diff --git a/mm/shmem.c b/mm/shmem.c
index b206a7a32e2..98631c26c20 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1633,8 +1633,8 @@ shmem_write_end(struct file *file, struct address_space *mapping,
 	if (pos + copied > inode->i_size)
 		i_size_write(inode, pos + copied);
 
-	unlock_page(page);
 	set_page_dirty(page);
+	unlock_page(page);
 	page_cache_release(page);
 
 	return copied;
@@ -1971,13 +1971,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 			iput(inode);
 			return error;
 		}
-		unlock_page(page);
 		inode->i_mapping->a_ops = &shmem_aops;
 		inode->i_op = &shmem_symlink_inode_operations;
 		kaddr = kmap_atomic(page, KM_USER0);
 		memcpy(kaddr, symname, len);
 		kunmap_atomic(kaddr, KM_USER0);
 		set_page_dirty(page);
+		unlock_page(page);
 		page_cache_release(page);
 	}
 	if (dir->i_mode & S_ISGID)
@@ -2420,6 +2420,7 @@ static const struct address_space_operations shmem_aops = {
 	.write_end	= shmem_write_end,
 #endif
 	.migratepage	= migrate_page,
+	.error_remove_page = generic_error_remove_page,
 };
 
 static const struct file_operations shmem_file_operations = {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1bf19daadc..4de7f02f820 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -699,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry)
 	struct swap_info_struct *p;
 	struct page *page = NULL;
 
-	if (is_migration_entry(entry))
+	if (non_swap_entry(entry))
 		return 1;
 
 	p = swap_info_get(entry);
@@ -2085,7 +2085,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache)
 	int count;
 	bool has_cache;
 
-	if (is_migration_entry(entry))
+	if (non_swap_entry(entry))
 		return -EINVAL;
 
 	type = swp_type(entry);
diff --git a/mm/truncate.c b/mm/truncate.c
index ccc3ecf7cb9..450cebdabfc 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page);
  * its lock, b) when a concurrent invalidate_mapping_pages got there first and
  * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
  */
-static void
+static int
 truncate_complete_page(struct address_space *mapping, struct page *page)
 {
 	if (page->mapping != mapping)
-		return;
+		return -EIO;
 
 	if (page_has_private(page))
 		do_invalidatepage(page, 0);
@@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 	remove_from_page_cache(page);
 	ClearPageMappedToDisk(page);
 	page_cache_release(page);	/* pagecache ref */
+	return 0;
 }
 
 /*
@@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 	return ret;
 }
 
+int truncate_inode_page(struct address_space *mapping, struct page *page)
+{
+	if (page_mapped(page)) {
+		unmap_mapping_range(mapping,
+				   (loff_t)page->index << PAGE_CACHE_SHIFT,
+				   PAGE_CACHE_SIZE, 0);
+	}
+	return truncate_complete_page(mapping, page);
+}
+
+/*
+ * Used to get rid of pages on hardware memory corruption.
+ */
+int generic_error_remove_page(struct address_space *mapping, struct page *page)
+{
+	if (!mapping)
+		return -EINVAL;
+	/*
+	 * Only punch for normal data pages for now.
+	 * Handling other types like directories would need more auditing.
+	 */
+	if (!S_ISREG(mapping->host->i_mode))
+		return -EIO;
+	return truncate_inode_page(mapping, page);
+}
+EXPORT_SYMBOL(generic_error_remove_page);
+
+/*
+ * Safely invalidate one page from its pagecache mapping.
+ * It only drops clean, unused pages. The page must be locked.
+ *
+ * Returns 1 if the page is successfully invalidated, otherwise 0.
+ */
+int invalidate_inode_page(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	if (!mapping)
+		return 0;
+	if (PageDirty(page) || PageWriteback(page))
+		return 0;
+	if (page_mapped(page))
+		return 0;
+	return invalidate_complete_page(mapping, page);
+}
+
 /**
  * truncate_inode_pages - truncate range of pages specified by start & end byte offsets
  * @mapping: mapping to truncate
@@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 				unlock_page(page);
 				continue;
 			}
-			if (page_mapped(page)) {
-				unmap_mapping_range(mapping,
-				  (loff_t)page_index<<PAGE_CACHE_SHIFT,
-				  PAGE_CACHE_SIZE, 0);
-			}
-			truncate_complete_page(mapping, page);
+			truncate_inode_page(mapping, page);
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
@@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
 				break;
 			lock_page(page);
 			wait_on_page_writeback(page);
-			if (page_mapped(page)) {
-				unmap_mapping_range(mapping,
-				  (loff_t)page->index<<PAGE_CACHE_SHIFT,
-				  PAGE_CACHE_SIZE, 0);
-			}
+			truncate_inode_page(mapping, page);
 			if (page->index > next)
 				next = page->index;
 			next++;
-			truncate_complete_page(mapping, page);
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
@@ -311,12 +347,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 			if (lock_failed)
 				continue;
 
-			if (PageDirty(page) || PageWriteback(page))
-				goto unlock;
-			if (page_mapped(page))
-				goto unlock;
-			ret += invalidate_complete_page(mapping, page);
-unlock:
+			ret += invalidate_inode_page(page);
+
 			unlock_page(page);
 			if (next > end)
 				break;
@@ -465,3 +497,67 @@ int invalidate_inode_pages2(struct address_space *mapping)
 	return invalidate_inode_pages2_range(mapping, 0, -1);
 }
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
+
+/**
+ * truncate_pagecache - unmap and remove pagecache that has been truncated
+ * @inode: inode
+ * @old: old file offset
+ * @new: new file offset
+ *
+ * inode's new i_size must already be written before truncate_pagecache
+ * is called.
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (eg. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ */
+void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
+{
+	if (new < old) {
+		struct address_space *mapping = inode->i_mapping;
+
+		/*
+		 * unmap_mapping_range is called twice, first simply for
+		 * efficiency so that truncate_inode_pages does fewer
+		 * single-page unmaps.  However after this first call, and
+		 * before truncate_inode_pages finishes, it is possible for
+		 * private pages to be COWed, which remain after
+		 * truncate_inode_pages finishes, hence the second
+		 * unmap_mapping_range call must be made for correctness.
+		 */
+		unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+		truncate_inode_pages(mapping, new);
+		unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+	}
+}
+EXPORT_SYMBOL(truncate_pagecache);
+
+/**
+ * vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * NOTE! We have to be ready to update the memory sharing
+ * between the file and the memory map for a potential last
+ * incomplete page.  Ugly, but necessary.
+ */
+int vmtruncate(struct inode *inode, loff_t offset)
+{
+	loff_t oldsize;
+	int error;
+
+	error = inode_newsize_ok(inode, offset);
+	if (error)
+		return error;
+	oldsize = inode->i_size;
+	i_size_write(inode, offset);
+	truncate_pagecache(inode, oldsize, offset);
+	if (inode->i_op->truncate)
+		inode->i_op->truncate(inode);
+
+	return error;
+}
+EXPORT_SYMBOL(vmtruncate);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 613e89f471d..1219ceb8a9b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -663,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page) && mapping) {
-			switch (try_to_unmap(page, 0)) {
+			switch (try_to_unmap(page, TTU_UNMAP)) {
 			case SWAP_FAIL:
 				goto activate_locked;
 			case SWAP_AGAIN:
@@ -1836,11 +1836,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
+unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+						gfp_t gfp_mask, bool noswap,
+						unsigned int swappiness,
+						struct zone *zone, int nid)
+{
+	struct scan_control sc = {
+		.may_writepage = !laptop_mode,
+		.may_unmap = 1,
+		.may_swap = !noswap,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = swappiness,
+		.order = 0,
+		.mem_cgroup = mem,
+		.isolate_pages = mem_cgroup_isolate_pages,
+	};
+	nodemask_t nm  = nodemask_of_node(nid);
+
+	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
+	sc.nodemask = &nm;
+	sc.nr_reclaimed = 0;
+	sc.nr_scanned = 0;
+	/*
+	 * NOTE: Although we can get the priority field, using it
+	 * here is not a good idea, since it limits the pages we can scan.
+	 * if we don't reclaim here, the shrink_zone from balance_pgdat
+	 * will pick up pages from other mem cgroup's as well. We hack
+	 * the priority and make it zero.
+	 */
+	shrink_zone(0, zone, &sc);
+	return sc.nr_reclaimed;
+}
+
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					   gfp_t gfp_mask,
 					   bool noswap,
 					   unsigned int swappiness)
 {
+	struct zonelist *zonelist;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -1852,7 +1886,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.isolate_pages = mem_cgroup_isolate_pages,
 		.nodemask = NULL, /* we don't care the placement */
 	};
-	struct zonelist *zonelist;
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1974,6 +2007,7 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
+			int nid, zid;
 
 			if (!populated_zone(zone))
 				continue;
@@ -1988,6 +2022,15 @@ loop_again:
 			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
 			note_zone_scanning_priority(zone, priority);
+
+			nid = pgdat->node_id;
+			zid = zone_idx(zone);
+			/*
+			 * Call soft limit reclaim before calling shrink_zone.
+			 * For now we ignore the return value
+			 */
+			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
+							nid, zid);
 			/*
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
@@ -2801,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void)
 unsigned long scan_unevictable_pages;
 
 int scan_unevictable_handler(struct ctl_table *table, int write,
-			   struct file *file, void __user *buffer,
+			   void __user *buffer,
 			   size_t *length, loff_t *ppos)
 {
-	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+	proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
 	if (write && *(unsigned long *)table->data)
 		scan_all_zones_unevictable_pages();
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index da0f64f82b5..d6b1b054e29 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1781,8 +1781,8 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		ax25_info.idletimer = ax25_display_timer(&ax25->idletimer) / (60 * HZ);
 		ax25_info.n2count   = ax25->n2count;
 		ax25_info.state     = ax25->state;
-		ax25_info.rcv_q     = sk_wmem_alloc_get(sk);
-		ax25_info.snd_q     = sk_rmem_alloc_get(sk);
+		ax25_info.rcv_q     = sk_rmem_alloc_get(sk);
+		ax25_info.snd_q     = sk_wmem_alloc_get(sk);
 		ax25_info.vs        = ax25->vs;
 		ax25_info.vr        = ax25->vr;
 		ax25_info.va        = ax25->va;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 907a82e9023..a16a2342f6b 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -965,12 +965,12 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
 
 #ifdef CONFIG_SYSCTL
 static
-int brnf_sysctl_call_tables(ctl_table * ctl, int write, struct file *filp,
+int brnf_sysctl_call_tables(ctl_table * ctl, int write,
 			    void __user * buffer, size_t * lenp, loff_t * ppos)
 {
 	int ret;
 
-	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write && *(int *)(ctl->data))
 		*(int *)(ctl->data) = 1;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 0bcecbf0658..4d11c28ca8c 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -192,11 +192,10 @@
 #define F_QUEUE_MAP_CPU (1<<14)	/* queue map mirrors smp_processor_id() */
 
 /* Thread control flag bits */
-#define T_TERMINATE   (1<<0)
-#define T_STOP        (1<<1)	/* Stop run */
-#define T_RUN         (1<<2)	/* Start run */
-#define T_REMDEVALL   (1<<3)	/* Remove all devs */
-#define T_REMDEV      (1<<4)	/* Remove one dev */
+#define T_STOP        (1<<0)	/* Stop run */
+#define T_RUN         (1<<1)	/* Start run */
+#define T_REMDEVALL   (1<<2)	/* Remove all devs */
+#define T_REMDEV      (1<<3)	/* Remove one dev */
 
 /* If lock -- can be removed after some work */
 #define   if_lock(t)           spin_lock(&(t->if_lock));
@@ -2105,7 +2104,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 
 static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
 {
-	ktime_t start;
+	ktime_t start_time, end_time;
 	s32 remaining;
 	struct hrtimer_sleeper t;
 
@@ -2116,7 +2115,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
 	if (remaining <= 0)
 		return;
 
-	start = ktime_now();
+	start_time = ktime_now();
 	if (remaining < 100)
 		udelay(remaining); 	/* really small just spin */
 	else {
@@ -2135,7 +2134,10 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
 		} while (t.task && pkt_dev->running && !signal_pending(current));
 		__set_current_state(TASK_RUNNING);
 	}
-	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), start));
+	end_time = ktime_now();
+
+	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
+	pkt_dev->next_tx = ktime_add_ns(end_time, pkt_dev->delay);
 }
 
 static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
@@ -3365,19 +3367,29 @@ static void pktgen_rem_thread(struct pktgen_thread *t)
 	mutex_unlock(&pktgen_thread_lock);
 }
 
-static void idle(struct pktgen_dev *pkt_dev)
+static void pktgen_resched(struct pktgen_dev *pkt_dev)
 {
 	ktime_t idle_start = ktime_now();
+	schedule();
+	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start));
+}
 
-	if (need_resched())
-		schedule();
-	else
-		cpu_relax();
+static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
+{
+	ktime_t idle_start = ktime_now();
 
+	while (atomic_read(&(pkt_dev->skb->users)) != 1) {
+		if (signal_pending(current))
+			break;
+
+		if (need_resched())
+			pktgen_resched(pkt_dev);
+		else
+			cpu_relax();
+	}
 	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start));
 }
 
-
 static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 {
 	struct net_device *odev = pkt_dev->odev;
@@ -3387,36 +3399,21 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 	u16 queue_map;
 	int ret;
 
-	if (pkt_dev->delay) {
-		spin(pkt_dev, pkt_dev->next_tx);
-
-		/* This is max DELAY, this has special meaning of
-		 * "never transmit"
-		 */
-		if (pkt_dev->delay == ULLONG_MAX) {
-			pkt_dev->next_tx = ktime_add_ns(ktime_now(), ULONG_MAX);
-			return;
-		}
-	}
-
-	if (!pkt_dev->skb) {
-		set_cur_queue_map(pkt_dev);
-		queue_map = pkt_dev->cur_queue_map;
-	} else {
-		queue_map = skb_get_queue_mapping(pkt_dev->skb);
+	/* If device is offline, then don't send */
+	if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
+		pktgen_stop_device(pkt_dev);
+		return;
 	}
 
-	txq = netdev_get_tx_queue(odev, queue_map);
-	/* Did we saturate the queue already? */
-	if (netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq)) {
-		/* If device is down, then all queues are permnantly frozen */
-		if (netif_running(odev))
-			idle(pkt_dev);
-		else
-			pktgen_stop_device(pkt_dev);
+	/* This is max DELAY, this has special meaning of
+	 * "never transmit"
+	 */
+	if (unlikely(pkt_dev->delay == ULLONG_MAX)) {
+		pkt_dev->next_tx = ktime_add_ns(ktime_now(), ULONG_MAX);
 		return;
 	}
 
+	/* If no skb or clone count exhausted then get new one */
 	if (!pkt_dev->skb || (pkt_dev->last_ok &&
 			      ++pkt_dev->clone_count >= pkt_dev->clone_skb)) {
 		/* build a new pkt */
@@ -3435,54 +3432,45 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 		pkt_dev->clone_count = 0;	/* reset counter */
 	}
 
-	/* fill_packet() might have changed the queue */
+	if (pkt_dev->delay && pkt_dev->last_ok)
+		spin(pkt_dev, pkt_dev->next_tx);
+
 	queue_map = skb_get_queue_mapping(pkt_dev->skb);
 	txq = netdev_get_tx_queue(odev, queue_map);
 
 	__netif_tx_lock_bh(txq);
-	if (unlikely(netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq)))
-		pkt_dev->last_ok = 0;
-	else {
-		atomic_inc(&(pkt_dev->skb->users));
+	atomic_inc(&(pkt_dev->skb->users));
 
-	retry_now:
+	if (unlikely(netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq)))
+		ret = NETDEV_TX_BUSY;
+	else
 		ret = (*xmit)(pkt_dev->skb, odev);
-		switch (ret) {
-		case NETDEV_TX_OK:
-			txq_trans_update(txq);
-			pkt_dev->last_ok = 1;
-			pkt_dev->sofar++;
-			pkt_dev->seq_num++;
-			pkt_dev->tx_bytes += pkt_dev->cur_pkt_size;
-			break;
-		case NETDEV_TX_LOCKED:
-			cpu_relax();
-			goto retry_now;
-		default: /* Drivers are not supposed to return other values! */
-			if (net_ratelimit())
-				pr_info("pktgen: %s xmit error: %d\n",
-					odev->name, ret);
-			pkt_dev->errors++;
-			/* fallthru */
-		case NETDEV_TX_BUSY:
-			/* Retry it next time */
-			atomic_dec(&(pkt_dev->skb->users));
-			pkt_dev->last_ok = 0;
-		}
-
-		if (pkt_dev->delay)
-			pkt_dev->next_tx = ktime_add_ns(ktime_now(),
-							pkt_dev->delay);
+
+	switch (ret) {
+	case NETDEV_TX_OK:
+		txq_trans_update(txq);
+		pkt_dev->last_ok = 1;
+		pkt_dev->sofar++;
+		pkt_dev->seq_num++;
+		pkt_dev->tx_bytes += pkt_dev->cur_pkt_size;
+		break;
+	default: /* Drivers are not supposed to return other values! */
+		if (net_ratelimit())
+			pr_info("pktgen: %s xmit error: %d\n",
+				odev->name, ret);
+		pkt_dev->errors++;
+		/* fallthru */
+	case NETDEV_TX_LOCKED:
+	case NETDEV_TX_BUSY:
+		/* Retry it next time */
+		atomic_dec(&(pkt_dev->skb->users));
+		pkt_dev->last_ok = 0;
 	}
 	__netif_tx_unlock_bh(txq);
 
 	/* If pkt_dev->count is zero, then run forever */
 	if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
-		while (atomic_read(&(pkt_dev->skb->users)) != 1) {
-			if (signal_pending(current))
-				break;
-			idle(pkt_dev);
-		}
+		pktgen_wait_for_skb(pkt_dev);
 
 		/* Done with this */
 		pktgen_stop_device(pkt_dev);
@@ -3515,20 +3503,24 @@ static int pktgen_thread_worker(void *arg)
 	while (!kthread_should_stop()) {
 		pkt_dev = next_to_run(t);
 
-		if (!pkt_dev &&
-		    (t->control & (T_STOP | T_RUN | T_REMDEVALL | T_REMDEV))
-		    == 0) {
-			prepare_to_wait(&(t->queue), &wait,
-					TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ / 10);
-			finish_wait(&(t->queue), &wait);
+		if (unlikely(!pkt_dev && t->control == 0)) {
+			wait_event_interruptible_timeout(t->queue,
+							 t->control != 0,
+							 HZ/10);
+			continue;
 		}
 
 		__set_current_state(TASK_RUNNING);
 
-		if (pkt_dev)
+		if (likely(pkt_dev)) {
 			pktgen_xmit(pkt_dev);
 
+			if (need_resched())
+				pktgen_resched(pkt_dev);
+			else
+				cpu_relax();
+		}
+
 		if (t->control & T_STOP) {
 			pktgen_stop(t);
 			t->control &= ~(T_STOP);
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 1c6a5bb6f0c..6e1f085db06 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -164,7 +164,7 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU
 static int min_priority[1];
 static int max_priority[] = { 127 }; /* From DECnet spec */
 
-static int dn_forwarding_proc(ctl_table *, int, struct file *,
+static int dn_forwarding_proc(ctl_table *, int,
 			void __user *, size_t *, loff_t *);
 static int dn_forwarding_sysctl(ctl_table *table,
 			void __user *oldval, size_t __user *oldlenp,
@@ -274,7 +274,6 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
 }
 
 static int dn_forwarding_proc(ctl_table *table, int write,
-				struct file *filep,
 				void __user *buffer,
 				size_t *lenp, loff_t *ppos)
 {
@@ -290,7 +289,7 @@ static int dn_forwarding_proc(ctl_table *table, int write,
 	dn_db = dev->dn_ptr;
 	old = dn_db->parms.forwarding;
 
-	err = proc_dointvec(table, write, filep, buffer, lenp, ppos);
+	err = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if ((err >= 0) && write) {
 		if (dn_db->parms.forwarding < 0)
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 5bcd592ae6d..26b0ab1e9f5 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -165,7 +165,6 @@ static int dn_node_address_strategy(ctl_table *table,
 }
 
 static int dn_node_address_handler(ctl_table *table, int write,
-				struct file *filp,
 				void __user *buffer,
 				size_t *lenp, loff_t *ppos)
 {
@@ -276,7 +275,6 @@ static int dn_def_dev_strategy(ctl_table *table,
 
 
 static int dn_def_dev_handler(ctl_table *table, int write,
-				struct file * filp,
 				void __user *buffer,
 				size_t *lenp, loff_t *ppos)
 {
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 07336c6201f..e92f1fd28aa 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1270,10 +1270,10 @@ static void inet_forward_change(struct net *net)
 }
 
 static int devinet_conf_proc(ctl_table *ctl, int write,
-			     struct file *filp, void __user *buffer,
+			     void __user *buffer,
 			     size_t *lenp, loff_t *ppos)
 {
-	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write) {
 		struct ipv4_devconf *cnf = ctl->extra1;
@@ -1342,12 +1342,12 @@ static int devinet_conf_sysctl(ctl_table *table,
 }
 
 static int devinet_sysctl_forward(ctl_table *ctl, int write,
-				  struct file *filp, void __user *buffer,
+				  void __user *buffer,
 				  size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
-	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write && *valp != val) {
 		struct net *net = ctl->extra2;
@@ -1372,12 +1372,12 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
 }
 
 int ipv4_doint_and_flush(ctl_table *ctl, int write,
-			 struct file *filp, void __user *buffer,
+			 void __user *buffer,
 			 size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
-	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 	struct net *net = ctl->extra2;
 
 	if (write && *valp != val)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d9645c94a06..41ada9904d3 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -66,10 +66,7 @@
    solution, but it supposes maintaing new variable in ALL
    skb, even if no tunneling is used.
 
-   Current solution: t->recursion lock breaks dead loops. It looks
-   like dev->tbusy flag, but I preferred new variable, because
-   the semantics is different. One day, when hard_start_xmit
-   will be multithreaded we will have to use skb->encapsulation.
+   Current solution: HARD_TX_LOCK lock breaks dead loops.
 
 
 
@@ -678,11 +675,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 	__be32 dst;
 	int    mtu;
 
-	if (tunnel->recursion++) {
-		stats->collisions++;
-		goto tx_error;
-	}
-
 	if (dev->type == ARPHRD_ETHER)
 		IPCB(skb)->flags = 0;
 
@@ -820,7 +812,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 			ip_rt_put(rt);
 			stats->tx_dropped++;
 			dev_kfree_skb(skb);
-			tunnel->recursion--;
 			return NETDEV_TX_OK;
 		}
 		if (skb->sk)
@@ -888,7 +879,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 	nf_reset(skb);
 
 	IPTUNNEL_XMIT();
-	tunnel->recursion--;
 	return NETDEV_TX_OK;
 
 tx_error_icmp:
@@ -897,7 +887,6 @@ tx_error_icmp:
 tx_error:
 	stats->tx_errors++;
 	dev_kfree_skb(skb);
-	tunnel->recursion--;
 	return NETDEV_TX_OK;
 }
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index fc7993e9061..5a0693576e8 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -611,6 +611,9 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		 *	Check the arguments are allowable
 		 */
 
+		if (optlen < sizeof(struct in_addr))
+			goto e_inval;
+
 		err = -EFAULT;
 		if (optlen >= sizeof(struct ip_mreqn)) {
 			if (copy_from_user(&mreq, optval, sizeof(mreq)))
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 62548cb0923..08ccd344de7 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -402,11 +402,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	__be32 dst = tiph->daddr;
 	int    mtu;
 
-	if (tunnel->recursion++) {
-		stats->collisions++;
-		goto tx_error;
-	}
-
 	if (skb->protocol != htons(ETH_P_IP))
 		goto tx_error;
 
@@ -485,7 +480,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 			ip_rt_put(rt);
 			stats->tx_dropped++;
 			dev_kfree_skb(skb);
-			tunnel->recursion--;
 			return NETDEV_TX_OK;
 		}
 		if (skb->sk)
@@ -523,7 +517,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	nf_reset(skb);
 
 	IPTUNNEL_XMIT();
-	tunnel->recursion--;
 	return NETDEV_TX_OK;
 
 tx_error_icmp:
@@ -531,7 +524,6 @@ tx_error_icmp:
 tx_error:
 	stats->tx_errors++;
 	dev_kfree_skb(skb);
-	tunnel->recursion--;
 	return NETDEV_TX_OK;
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index df934731453..bb419925202 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3036,7 +3036,7 @@ void ip_rt_multicast_event(struct in_device *in_dev)
 
 #ifdef CONFIG_SYSCTL
 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
-					struct file *filp, void __user *buffer,
+					void __user *buffer,
 					size_t *lenp, loff_t *ppos)
 {
 	if (write) {
@@ -3046,7 +3046,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
 
 		memcpy(&ctl, __ctl, sizeof(ctl));
 		ctl.data = &flush_delay;
-		proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
+		proc_dointvec(&ctl, write, buffer, lenp, ppos);
 
 		net = (struct net *)__ctl->extra1;
 		rt_cache_flush(net, flush_delay);
@@ -3106,12 +3106,11 @@ static void rt_secret_reschedule(int old)
 }
 
 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
-					  struct file *filp,
 					  void __user *buffer, size_t *lenp,
 					  loff_t *ppos)
 {
 	int old = ip_rt_secret_interval;
-	int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
 
 	rt_secret_reschedule(old);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4710d219f06..2dcf04d9b00 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -36,7 +36,7 @@ static void set_local_port_range(int range[2])
 }
 
 /* Validate changes from /proc interface. */
-static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
+static int ipv4_local_port_range(ctl_table *table, int write,
 				 void __user *buffer,
 				 size_t *lenp, loff_t *ppos)
 {
@@ -51,7 +51,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
 	};
 
 	inet_get_local_port_range(range, range + 1);
-	ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
 	if (write && ret == 0) {
 		if (range[1] < range[0])
@@ -91,7 +91,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table,
 }
 
 
-static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
+static int proc_tcp_congestion_control(ctl_table *ctl, int write,
 				       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char val[TCP_CA_NAME_MAX];
@@ -103,7 +103,7 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
 
 	tcp_get_default_congestion_control(val);
 
-	ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
 	if (write && ret == 0)
 		ret = tcp_set_default_congestion_control(val);
 	return ret;
@@ -129,7 +129,7 @@ static int sysctl_tcp_congestion_control(ctl_table *table,
 }
 
 static int proc_tcp_available_congestion_control(ctl_table *ctl,
-						 int write, struct file * filp,
+						 int write,
 						 void __user *buffer, size_t *lenp,
 						 loff_t *ppos)
 {
@@ -140,13 +140,13 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl,
 	if (!tbl.data)
 		return -ENOMEM;
 	tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX);
-	ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
 	kfree(tbl.data);
 	return ret;
 }
 
 static int proc_allowed_congestion_control(ctl_table *ctl,
-					   int write, struct file * filp,
+					   int write,
 					   void __user *buffer, size_t *lenp,
 					   loff_t *ppos)
 {
@@ -158,7 +158,7 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
 		return -ENOMEM;
 
 	tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen);
-	ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
 	if (write && ret == 0)
 		ret = tcp_set_allowed_congestion_control(tbl.data);
 	kfree(tbl.data);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 55f486d89c8..1fd0a3d775d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3986,14 +3986,14 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 #ifdef CONFIG_SYSCTL
 
 static
-int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
+int addrconf_sysctl_forward(ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
 	int ret;
 
-	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write)
 		ret = addrconf_fixup_forwarding(ctl, valp, val);
@@ -4090,14 +4090,14 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old)
 }
 
 static
-int addrconf_sysctl_disable(ctl_table *ctl, int write, struct file * filp,
+int addrconf_sysctl_disable(ctl_table *ctl, int write,
 			    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
 	int ret;
 
-	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write)
 		ret = addrconf_disable_ipv6(ctl, valp, val);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 7d25bbe3211..c595bbe1ed9 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1043,11 +1043,6 @@ ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct net_device_stats *stats = &t->dev->stats;
 	int ret;
 
-	if (t->recursion++) {
-		stats->collisions++;
-		goto tx_err;
-	}
-
 	switch (skb->protocol) {
 	case htons(ETH_P_IP):
 		ret = ip4ip6_tnl_xmit(skb, dev);
@@ -1062,14 +1057,12 @@ ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (ret < 0)
 		goto tx_err;
 
-	t->recursion--;
 	return NETDEV_TX_OK;
 
 tx_err:
 	stats->tx_errors++;
 	stats->tx_dropped++;
 	kfree_skb(skb);
-	t->recursion--;
 	return NETDEV_TX_OK;
 }
 
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7015478797f..498b9b0b0fa 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1735,7 +1735,7 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
 	}
 }
 
-int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct net_device *dev = ctl->extra1;
 	struct inet6_dev *idev;
@@ -1746,16 +1746,16 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f
 		ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default");
 
 	if (strcmp(ctl->procname, "retrans_time") == 0)
-		ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+		ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	else if (strcmp(ctl->procname, "base_reachable_time") == 0)
 		ret = proc_dointvec_jiffies(ctl, write,
-					    filp, buffer, lenp, ppos);
+					    buffer, lenp, ppos);
 
 	else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) ||
 		 (strcmp(ctl->procname, "base_reachable_time_ms") == 0))
 		ret = proc_dointvec_ms_jiffies(ctl, write,
-					       filp, buffer, lenp, ppos);
+					       buffer, lenp, ppos);
 	else
 		ret = -1;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 77aecbe8ff6..d6fe7646a8f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2524,13 +2524,13 @@ static const struct file_operations rt6_stats_seq_fops = {
 #ifdef CONFIG_SYSCTL
 
 static
-int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
+int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
 			      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct net *net = current->nsproxy->net_ns;
 	int delay = net->ipv6.sysctl.flush_delay;
 	if (write) {
-		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+		proc_dointvec(ctl, write, buffer, lenp, ppos);
 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
 		return 0;
 	} else
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 0ae4f644818..fcb53962884 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -626,11 +626,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 	struct in6_addr *addr6;
 	int addr_type;
 
-	if (tunnel->recursion++) {
-		stats->collisions++;
-		goto tx_error;
-	}
-
 	if (skb->protocol != htons(ETH_P_IPV6))
 		goto tx_error;
 
@@ -753,7 +748,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 			ip_rt_put(rt);
 			stats->tx_dropped++;
 			dev_kfree_skb(skb);
-			tunnel->recursion--;
 			return NETDEV_TX_OK;
 		}
 		if (skb->sk)
@@ -794,7 +788,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 	nf_reset(skb);
 
 	IPTUNNEL_XMIT();
-	tunnel->recursion--;
 	return NETDEV_TX_OK;
 
 tx_error_icmp:
@@ -802,7 +795,6 @@ tx_error_icmp:
 tx_error:
 	stats->tx_errors++;
 	dev_kfree_skb(skb);
-	tunnel->recursion--;
 	return NETDEV_TX_OK;
 }
 
diff --git a/net/irda/irsysctl.c b/net/irda/irsysctl.c
index 57f8817c397..5c86567e5a7 100644
--- a/net/irda/irsysctl.c
+++ b/net/irda/irsysctl.c
@@ -73,12 +73,12 @@ static int min_lap_keepalive_time = 100;	/* 100us */
 /* For other sysctl, I've no idea of the range. Maybe Dag could help
  * us on that - Jean II */
 
-static int do_devname(ctl_table *table, int write, struct file *filp,
+static int do_devname(ctl_table *table, int write,
 		      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_dostring(table, write, filp, buffer, lenp, ppos);
+	ret = proc_dostring(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write) {
 		struct ias_value *val;
 
@@ -90,12 +90,12 @@ static int do_devname(ctl_table *table, int write, struct file *filp,
 }
 
 
-static int do_discovery(ctl_table *table, int write, struct file *filp,
+static int do_discovery(ctl_table *table, int write,
                     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int ret;
 
-       ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
        if (ret)
 	       return ret;
 
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 039901109fa..71e10cabf81 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -90,8 +90,8 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 		bss->dtim_period = tim_ie->dtim_period;
 	}
 
-	/* set default value for buggy APs */
-	if (!elems->tim || bss->dtim_period == 0)
+	/* set default value for buggy AP/no TIM element */
+	if (bss->dtim_period == 0)
 		bss->dtim_period = 1;
 
 	bss->supp_rates_len = 0;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index fba2892b99e..446e9bd4b4b 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1496,14 +1496,14 @@ static int ip_vs_zero_all(void)
 
 
 static int
-proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
+proc_do_defense_mode(ctl_table *table, int write,
 		     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *valp = table->data;
 	int val = *valp;
 	int rc;
 
-	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (write && (*valp != val)) {
 		if ((*valp < 0) || (*valp > 3)) {
 			/* Restore the correct value */
@@ -1517,7 +1517,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
 
 
 static int
-proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
+proc_do_sync_threshold(ctl_table *table, int write,
 		       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *valp = table->data;
@@ -1527,7 +1527,7 @@ proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
 	/* backup the value first */
 	memcpy(val, valp, sizeof(val));
 
-	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
 		/* Restore the correct value */
 		memcpy(valp, val, sizeof(val));
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 4e620305f28..c93494fef8e 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -226,7 +226,7 @@ static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
 static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
 static struct ctl_table_header *nf_log_dir_header;
 
-static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp,
+static int nf_log_proc_dostring(ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	const struct nf_logger *logger;
@@ -260,7 +260,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp,
 			table->data = "NONE";
 		else
 			table->data = logger->name;
-		r = proc_dostring(table, write, filp, buffer, lenp, ppos);
+		r = proc_dostring(table, write, buffer, lenp, ppos);
 		mutex_unlock(&nf_log_mutex);
 	}
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 55180b99562..a4bafbf1509 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1609,6 +1609,16 @@ int netlink_change_ngroups(struct sock *sk, unsigned int groups)
 	return err;
 }
 
+void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
+{
+	struct sock *sk;
+	struct hlist_node *node;
+	struct netlink_table *tbl = &nl_table[ksk->sk_protocol];
+
+	sk_for_each_bound(sk, node, &tbl->mc_list)
+		netlink_update_socket_mc(nlk_sk(sk), group, 0);
+}
+
 /**
  * netlink_clear_multicast_users - kick off multicast listeners
  *
@@ -1619,15 +1629,8 @@ int netlink_change_ngroups(struct sock *sk, unsigned int groups)
  */
 void netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
 {
-	struct sock *sk;
-	struct hlist_node *node;
-	struct netlink_table *tbl = &nl_table[ksk->sk_protocol];
-
 	netlink_table_grab();
-
-	sk_for_each_bound(sk, node, &tbl->mc_list)
-		netlink_update_socket_mc(nlk_sk(sk), group, 0);
-
+	__netlink_clear_multicast_users(ksk, group);
 	netlink_table_ungrab();
 }
 
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 566941e0336..44ff3f3810f 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -220,10 +220,12 @@ static void __genl_unregister_mc_group(struct genl_family *family,
 	struct net *net;
 	BUG_ON(grp->family != family);
 
+	netlink_table_grab();
 	rcu_read_lock();
 	for_each_net_rcu(net)
-		netlink_clear_multicast_users(net->genl_sock, grp->id);
+		__netlink_clear_multicast_users(net->genl_sock, grp->id);
 	rcu_read_unlock();
+	netlink_table_ungrab();
 
 	clear_bit(grp->id, mc_groups);
 	list_del(&grp->list);
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index a662e62a99c..f60c0c2aacb 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -168,6 +168,12 @@ static int pn_send(struct sk_buff *skb, struct net_device *dev,
 		goto drop;
 	}
 
+	/* Broadcast sending is not implemented */
+	if (pn_addr(dst) == PNADDR_BROADCAST) {
+		err = -EOPNOTSUPP;
+		goto drop;
+	}
+
 	skb_reset_transport_header(skb);
 	WARN_ON(skb_headroom(skb) & 1); /* HW assumes word alignment */
 	skb_push(skb, sizeof(struct phonethdr));
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 7a4ee397d2f..07aa9f08d5f 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -113,6 +113,8 @@ void pn_sock_unhash(struct sock *sk)
 }
 EXPORT_SYMBOL(pn_sock_unhash);
 
+static DEFINE_MUTEX(port_mutex);
+
 static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len)
 {
 	struct sock *sk = sock->sk;
@@ -140,9 +142,11 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len)
 		err = -EINVAL; /* attempt to rebind */
 		goto out;
 	}
+	WARN_ON(sk_hashed(sk));
+	mutex_lock(&port_mutex);
 	err = sk->sk_prot->get_port(sk, pn_port(handle));
 	if (err)
-		goto out;
+		goto out_port;
 
 	/* get_port() sets the port, bind() sets the address if applicable */
 	pn->sobject = pn_object(saddr, pn_port(pn->sobject));
@@ -150,6 +154,8 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len)
 
 	/* Enable RX on the socket */
 	sk->sk_prot->hash(sk);
+out_port:
+	mutex_unlock(&port_mutex);
 out:
 	release_sock(sk);
 	return err;
@@ -357,8 +363,6 @@ const struct proto_ops phonet_stream_ops = {
 };
 EXPORT_SYMBOL(phonet_stream_ops);
 
-static DEFINE_MUTEX(port_mutex);
-
 /* allocate port for a socket */
 int pn_sock_get_port(struct sock *sk, unsigned short sport)
 {
@@ -370,9 +374,7 @@ int pn_sock_get_port(struct sock *sk, unsigned short sport)
 
 	memset(&try_sa, 0, sizeof(struct sockaddr_pn));
 	try_sa.spn_family = AF_PHONET;
-
-	mutex_lock(&port_mutex);
-
+	WARN_ON(!mutex_is_locked(&port_mutex));
 	if (!sport) {
 		/* search free port */
 		int port, pmin, pmax;
@@ -401,8 +403,6 @@ int pn_sock_get_port(struct sock *sk, unsigned short sport)
 		else
 			sock_put(tmpsk);
 	}
-	mutex_unlock(&port_mutex);
-
 	/* the port must be in use already */
 	return -EADDRINUSE;
 
diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c
index 7b5749ee276..2220f332232 100644
--- a/net/phonet/sysctl.c
+++ b/net/phonet/sysctl.c
@@ -56,7 +56,7 @@ void phonet_get_local_port_range(int *min, int *max)
 	} while (read_seqretry(&local_port_range_lock, seq));
 }
 
-static int proc_local_port_range(ctl_table *table, int write, struct file *filp,
+static int proc_local_port_range(ctl_table *table, int write,
 				void __user *buffer,
 				size_t *lenp, loff_t *ppos)
 {
@@ -70,7 +70,7 @@ static int proc_local_port_range(ctl_table *table, int write, struct file *filp,
 		.extra2 = &local_port_range_max,
 	};
 
-	ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
 	if (write && ret == 0) {
 		if (range[1] < range[0])
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index c70dd7f5258..1db618f56ec 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -8,7 +8,6 @@
 
 #include <linux/types.h>
 #include <linux/module.h>
-#include <linux/utsname.h>
 #include <linux/sunrpc/clnt.h>
 
 #ifdef RPC_DEBUG
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index a417d5ab5dd..38829e20500 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -640,10 +640,11 @@ EXPORT_SYMBOL_GPL(rpc_call_async);
 /**
  * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
  * rpc_execute against it
- * @ops: RPC call ops
+ * @req: RPC request
+ * @tk_ops: RPC call ops
  */
 struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
-					const struct rpc_call_ops *tk_ops)
+				const struct rpc_call_ops *tk_ops)
 {
 	struct rpc_task *task;
 	struct xdr_buf *xbufp = &req->rq_snd_buf;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 858a443f418..49278f83036 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -860,7 +860,8 @@ static void rpc_clntdir_depopulate(struct dentry *dentry)
 
 /**
  * rpc_create_client_dir - Create a new rpc_client directory in rpc_pipefs
- * @path: path from the rpc_pipefs root to the new directory
+ * @dentry: dentry from the rpc_pipefs root to the new directory
+ * @name: &struct qstr for the name
  * @rpc_client: rpc client to associate with this directory
  *
  * This creates a directory at the given @path associated with
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index 5231f7aaac0..42f9748ae09 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -56,7 +56,7 @@ rpc_unregister_sysctl(void)
 	}
 }
 
-static int proc_do_xprt(ctl_table *table, int write, struct file *file,
+static int proc_do_xprt(ctl_table *table, int write,
 			void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char tmpbuf[256];
@@ -71,7 +71,7 @@ static int proc_do_xprt(ctl_table *table, int write, struct file *file,
 }
 
 static int
-proc_dodebug(ctl_table *table, int write, struct file *file,
+proc_dodebug(ctl_table *table, int write,
 				void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char		tmpbuf[20], c, *s;
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 87101177825..35fb68b9c8e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -80,7 +80,7 @@ struct kmem_cache *svc_rdma_ctxt_cachep;
  * current value.
  */
 static int read_reset_stat(ctl_table *table, int write,
-			   struct file *filp, void __user *buffer, size_t *lenp,
+			   void __user *buffer, size_t *lenp,
 			   loff_t *ppos)
 {
 	atomic_t *stat = (atomic_t *)table->data;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bee41546575..37c5475ba25 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -773,6 +773,7 @@ static void xs_close(struct rpc_xprt *xprt)
 	dprintk("RPC:       xs_close xprt %p\n", xprt);
 
 	xs_reset_transport(transport);
+	xprt->reestablish_timeout = 0;
 
 	smp_mb__before_clear_bit();
 	clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
@@ -1264,6 +1265,12 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes)
 	if (xprt->shutdown)
 		goto out;
 
+	/* Any data means we had a useful conversation, so
+	 * the we don't need to delay the next reconnect
+	 */
+	if (xprt->reestablish_timeout)
+		xprt->reestablish_timeout = 0;
+
 	/* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
 	rd_desc.arg.data = xprt;
 	do {
@@ -2034,6 +2041,8 @@ static void xs_connect(struct rpc_task *task)
 				   &transport->connect_worker,
 				   xprt->reestablish_timeout);
 		xprt->reestablish_timeout <<= 1;
+		if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+			xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
 		if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO)
 			xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
 	} else {
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index d16cd9ea4d0..bf725275eb8 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -26,11 +26,11 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
 
 	wdev->wext.connect.ie = wdev->wext.ie;
 	wdev->wext.connect.ie_len = wdev->wext.ie_len;
-	wdev->wext.connect.privacy = wdev->wext.default_key != -1;
 
 	if (wdev->wext.keys) {
 		wdev->wext.keys->def = wdev->wext.default_key;
 		wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
+		wdev->wext.connect.privacy = true;
 	}
 
 	if (!wdev->wext.connect.ssid_len)
diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index c29be8f9024..4f9c1908593 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -83,11 +83,12 @@ TMPOUT := $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/)
 # is automatically cleaned up.
 try-run = $(shell set -e;		\
 	TMP="$(TMPOUT).$$$$.tmp";	\
+	TMPO="$(TMPOUT).$$$$.o";	\
 	if ($(1)) >/dev/null 2>&1;	\
 	then echo "$(2)";		\
 	else echo "$(3)";		\
 	fi;				\
-	rm -f "$$TMP")
+	rm -f "$$TMP" "$$TMPO")
 
 # as-option
 # Usage: cflags-y += $(call as-option,-Wa$(comma)-isa=foo,)
@@ -105,12 +106,12 @@ as-instr = $(call try-run,\
 # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586)
 
 cc-option = $(call try-run,\
-	$(CC) $(KBUILD_CFLAGS) $(1) -c -xc /dev/null -o "$$TMP",$(1),$(2))
+	$(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -xc /dev/null -o "$$TMP",$(1),$(2))
 
 # cc-option-yn
 # Usage: flag := $(call cc-option-yn,-march=winchip-c6)
 cc-option-yn = $(call try-run,\
-	$(CC) $(KBUILD_CFLAGS) $(1) -c -xc /dev/null -o "$$TMP",y,n)
+	$(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -xc /dev/null -o "$$TMP",y,n)
 
 # cc-option-align
 # Prefix align with either -falign or -malign
@@ -130,10 +131,15 @@ cc-fullversion = $(shell $(CONFIG_SHELL) \
 # Usage:  EXTRA_CFLAGS += $(call cc-ifversion, -lt, 0402, -O1)
 cc-ifversion = $(shell [ $(call cc-version, $(CC)) $(1) $(2) ] && echo $(3))
 
+# cc-ldoption
+# Usage: ldflags += $(call cc-ldoption, -Wl$(comma)--hash-style=both)
+cc-ldoption = $(call try-run,\
+	$(CC) $(1) -nostdlib -xc /dev/null -o "$$TMP",$(1),$(2))
+
 # ld-option
-# Usage: ldflags += $(call ld-option, -Wl$(comma)--hash-style=both)
+# Usage: LDFLAGS += $(call ld-option, -X)
 ld-option = $(call try-run,\
-	$(CC) $(1) -nostdlib -xc /dev/null -o "$$TMP",$(1),$(2))
+	$(CC) /dev/null -c -o "$$TMPO" ; $(LD) $(1) "$$TMPO" -o "$$TMP",$(1),$(2))
 
 ######
 
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 5c4b7a400c1..341b58902ff 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -206,7 +206,7 @@ cmd_modversions =							\
 endif
 
 ifdef CONFIG_FTRACE_MCOUNT_RECORD
-cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
+cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
 	"$(if $(CONFIG_64BIT),64,32)" \
 	"$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" \
 	"$(if $(part-of-module),1,0)" "$(@)";
@@ -216,6 +216,7 @@ define rule_cc_o_c
 	$(call echo-cmd,checksrc) $(cmd_checksrc)			  \
 	$(call echo-cmd,cc_o_c) $(cmd_cc_o_c);				  \
 	$(cmd_modversions)						  \
+	$(call echo-cmd,record_mcount)					  \
 	$(cmd_record_mcount)						  \
 	scripts/basic/fixdep $(depfile) $@ '$(call make-cmd,cc_o_c)' >    \
 	                                              $(dot-target).tmp;  \
@@ -269,7 +270,8 @@ targets += $(extra-y) $(MAKECMDGOALS) $(always)
 # Linker scripts preprocessor (.lds.S -> .lds)
 # ---------------------------------------------------------------------------
 quiet_cmd_cpp_lds_S = LDS     $@
-      cmd_cpp_lds_S = $(CPP) $(cpp_flags) -D__ASSEMBLY__ -o $@ $<
+      cmd_cpp_lds_S = $(CPP) $(cpp_flags) -P -C -U$(ARCH) \
+	                     -D__ASSEMBLY__ -DLINKER_SCRIPT -o $@ $<
 
 $(obj)/%.lds: $(src)/%.lds.S FORCE
 	$(call if_changed_dep,cpp_lds_S)
diff --git a/scripts/basic/docproc.c b/scripts/basic/docproc.c
index 99ca7a69868..79ab973fb43 100644
--- a/scripts/basic/docproc.c
+++ b/scripts/basic/docproc.c
@@ -71,7 +71,7 @@ FILELINE * docsection;
 
 static char *srctree, *kernsrctree;
 
-void usage (void)
+static void usage (void)
 {
 	fprintf(stderr, "Usage: docproc {doc|depend} file\n");
 	fprintf(stderr, "Input is read from file.tmpl. Output is sent to stdout\n");
@@ -84,7 +84,7 @@ void usage (void)
 /*
  * Execute kernel-doc with parameters given in svec
  */
-void exec_kernel_doc(char **svec)
+static void exec_kernel_doc(char **svec)
 {
 	pid_t pid;
 	int ret;
@@ -129,7 +129,7 @@ struct symfile
 struct symfile symfilelist[MAXFILES];
 int symfilecnt = 0;
 
-void add_new_symbol(struct symfile *sym, char * symname)
+static void add_new_symbol(struct symfile *sym, char * symname)
 {
 	sym->symbollist =
           realloc(sym->symbollist, (sym->symbolcnt + 1) * sizeof(char *));
@@ -137,14 +137,14 @@ void add_new_symbol(struct symfile *sym, char * symname)
 }
 
 /* Add a filename to the list */
-struct symfile * add_new_file(char * filename)
+static struct symfile * add_new_file(char * filename)
 {
 	symfilelist[symfilecnt++].filename = strdup(filename);
 	return &symfilelist[symfilecnt - 1];
 }
 
 /* Check if file already are present in the list */
-struct symfile * filename_exist(char * filename)
+static struct symfile * filename_exist(char * filename)
 {
 	int i;
 	for (i=0; i < symfilecnt; i++)
@@ -157,20 +157,20 @@ struct symfile * filename_exist(char * filename)
  * List all files referenced within the template file.
  * Files are separated by tabs.
  */
-void adddep(char * file)		   { printf("\t%s", file); }
-void adddep2(char * file, char * line)     { line = line; adddep(file); }
-void noaction(char * line)		   { line = line; }
-void noaction2(char * file, char * line)   { file = file; line = line; }
+static void adddep(char * file)		   { printf("\t%s", file); }
+static void adddep2(char * file, char * line)     { line = line; adddep(file); }
+static void noaction(char * line)		   { line = line; }
+static void noaction2(char * file, char * line)   { file = file; line = line; }
 
 /* Echo the line without further action */
-void printline(char * line)               { printf("%s", line); }
+static void printline(char * line)               { printf("%s", line); }
 
 /*
  * Find all symbols in filename that are exported with EXPORT_SYMBOL &
  * EXPORT_SYMBOL_GPL (& EXPORT_SYMBOL_GPL_FUTURE implicitly).
  * All symbols located are stored in symfilelist.
  */
-void find_export_symbols(char * filename)
+static void find_export_symbols(char * filename)
 {
 	FILE * fp;
 	struct symfile *sym;
@@ -227,7 +227,7 @@ void find_export_symbols(char * filename)
  * intfunc uses -nofunction
  * extfunc uses -function
  */
-void docfunctions(char * filename, char * type)
+static void docfunctions(char * filename, char * type)
 {
 	int i,j;
 	int symcnt = 0;
@@ -258,15 +258,15 @@ void docfunctions(char * filename, char * type)
 	fflush(stdout);
 	free(vec);
 }
-void intfunc(char * filename) {	docfunctions(filename, NOFUNCTION); }
-void extfunc(char * filename) { docfunctions(filename, FUNCTION);   }
+static void intfunc(char * filename) {	docfunctions(filename, NOFUNCTION); }
+static void extfunc(char * filename) { docfunctions(filename, FUNCTION);   }
 
 /*
  * Document specific function(s) in a file.
  * Call kernel-doc with the following parameters:
  * kernel-doc -docbook -function function1 [-function function2]
  */
-void singfunc(char * filename, char * line)
+static void singfunc(char * filename, char * line)
 {
 	char *vec[200]; /* Enough for specific functions */
         int i, idx = 0;
@@ -297,7 +297,7 @@ void singfunc(char * filename, char * line)
  * Call kernel-doc with the following parameters:
  * kernel-doc -docbook -function "doc section" filename
  */
-void docsect(char *filename, char *line)
+static void docsect(char *filename, char *line)
 {
 	char *vec[6]; /* kerneldoc -docbook -function "section" file NULL */
 	char *s;
@@ -324,7 +324,7 @@ void docsect(char *filename, char *line)
  * 5) Lines containing !P
  * 6) Default lines - lines not matching the above
  */
-void parse_file(FILE *infile)
+static void parse_file(FILE *infile)
 {
 	char line[MAXLINESZ];
 	char * s;
diff --git a/scripts/basic/fixdep.c b/scripts/basic/fixdep.c
index 8ab44861168..6bf21f83837 100644
--- a/scripts/basic/fixdep.c
+++ b/scripts/basic/fixdep.c
@@ -124,7 +124,7 @@ char *target;
 char *depfile;
 char *cmdline;
 
-void usage(void)
+static void usage(void)
 {
 	fprintf(stderr, "Usage: fixdep <depfile> <target> <cmdline>\n");
 	exit(1);
@@ -133,7 +133,7 @@ void usage(void)
 /*
  * Print out the commandline prefixed with cmd_<target filename> :=
  */
-void print_cmdline(void)
+static void print_cmdline(void)
 {
 	printf("cmd_%s := %s\n\n", target, cmdline);
 }
@@ -146,7 +146,7 @@ int    len_config  = 0;
  * Grow the configuration string to a desired length.
  * Usually the first growth is plenty.
  */
-void grow_config(int len)
+static void grow_config(int len)
 {
 	while (len_config + len > size_config) {
 		if (size_config == 0)
@@ -162,7 +162,7 @@ void grow_config(int len)
 /*
  * Lookup a value in the configuration string.
  */
-int is_defined_config(const char * name, int len)
+static int is_defined_config(const char * name, int len)
 {
 	const char * pconfig;
 	const char * plast = str_config + len_config - len;
@@ -178,7 +178,7 @@ int is_defined_config(const char * name, int len)
 /*
  * Add a new value to the configuration string.
  */
-void define_config(const char * name, int len)
+static void define_config(const char * name, int len)
 {
 	grow_config(len + 1);
 
@@ -190,7 +190,7 @@ void define_config(const char * name, int len)
 /*
  * Clear the set of configuration strings.
  */
-void clear_config(void)
+static void clear_config(void)
 {
 	len_config = 0;
 	define_config("", 0);
@@ -199,7 +199,7 @@ void clear_config(void)
 /*
  * Record the use of a CONFIG_* word.
  */
-void use_config(char *m, int slen)
+static void use_config(char *m, int slen)
 {
 	char s[PATH_MAX];
 	char *p;
@@ -220,7 +220,7 @@ void use_config(char *m, int slen)
 	printf("    $(wildcard include/config/%s.h) \\\n", s);
 }
 
-void parse_config_file(char *map, size_t len)
+static void parse_config_file(char *map, size_t len)
 {
 	int *end = (int *) (map + len);
 	/* start at +1, so that p can never be < map */
@@ -254,7 +254,7 @@ void parse_config_file(char *map, size_t len)
 }
 
 /* test is s ends in sub */
-int strrcmp(char *s, char *sub)
+static int strrcmp(char *s, char *sub)
 {
 	int slen = strlen(s);
 	int sublen = strlen(sub);
@@ -265,7 +265,7 @@ int strrcmp(char *s, char *sub)
 	return memcmp(s + slen - sublen, sub, sublen);
 }
 
-void do_config_file(char *filename)
+static void do_config_file(char *filename)
 {
 	struct stat st;
 	int fd;
@@ -296,7 +296,7 @@ void do_config_file(char *filename)
 	close(fd);
 }
 
-void parse_dep_file(void *map, size_t len)
+static void parse_dep_file(void *map, size_t len)
 {
 	char *m = map;
 	char *end = m + len;
@@ -336,7 +336,7 @@ void parse_dep_file(void *map, size_t len)
 	printf("$(deps_%s):\n", target);
 }
 
-void print_deps(void)
+static void print_deps(void)
 {
 	struct stat st;
 	int fd;
@@ -368,7 +368,7 @@ void print_deps(void)
 	close(fd);
 }
 
-void traps(void)
+static void traps(void)
 {
 	static char test[] __attribute__((aligned(sizeof(int)))) = "CONF";
 	int *p = (int *)test;
diff --git a/scripts/basic/hash.c b/scripts/basic/hash.c
index 3299ad7fc8c..2ef5d3f666b 100644
--- a/scripts/basic/hash.c
+++ b/scripts/basic/hash.c
@@ -21,7 +21,7 @@ static void usage(void)
  * http://www.cse.yorku.ca/~oz/hash.html
  */
 
-unsigned int djb2_hash(char *str)
+static unsigned int djb2_hash(char *str)
 {
 	unsigned long hash = 5381;
 	int c;
@@ -34,7 +34,7 @@ unsigned int djb2_hash(char *str)
 	return (unsigned int)(hash & ((1 << DYNAMIC_DEBUG_HASH_BITS) - 1));
 }
 
-unsigned int r5_hash(char *str)
+static unsigned int r5_hash(char *str)
 {
 	unsigned long hash = 0;
 	int c;
diff --git a/scripts/checkincludes.pl b/scripts/checkincludes.pl
index 8e6b716c191..676ddc07d6f 100755
--- a/scripts/checkincludes.pl
+++ b/scripts/checkincludes.pl
@@ -1,24 +1,85 @@
 #!/usr/bin/perl
 #
-# checkincludes: Find files included more than once in (other) files.
+# checkincludes: find/remove files included more than once
+#
 # Copyright abandoned, 2000, Niels Kristian Bech Jensen <nkbj@image.dk>.
+# Copyright 2009 Luis R. Rodriguez <mcgrof@gmail.com>
+#
+# This script checks for duplicate includes. It also has support
+# to remove them in place. Note that this will not take into
+# consideration macros so you should run this only if you know
+# you do have real dups and do not have them under #ifdef's. You
+# could also just review the results.
+
+sub usage {
+	print "Usage: checkincludes.pl [-r]\n";
+	print "By default we just warn of duplicates\n";
+	print "To remove duplicated includes in place use -r\n";
+	exit 1;
+}
+
+my $remove = 0;
+
+if ($#ARGV < 0) {
+	usage();
+}
+
+if ($#ARGV >= 1) {
+	if ($ARGV[0] =~ /^-/) {
+		if ($ARGV[0] eq "-r") {
+			$remove = 1;
+			shift;
+		} else {
+			usage();
+		}
+	}
+}
 
 foreach $file (@ARGV) {
 	open(FILE, $file) or die "Cannot open $file: $!.\n";
 
 	my %includedfiles = ();
+	my @file_lines = ();
 
 	while (<FILE>) {
 		if (m/^\s*#\s*include\s*[<"](\S*)[>"]/o) {
 			++$includedfiles{$1};
 		}
+		push(@file_lines, $_);
 	}
-	
-	foreach $filename (keys %includedfiles) {
-		if ($includedfiles{$filename} > 1) {
-			print "$file: $filename is included more than once.\n";
+
+	close(FILE);
+
+	if (!$remove) {
+		foreach $filename (keys %includedfiles) {
+			if ($includedfiles{$filename} > 1) {
+				print "$file: $filename is included more than once.\n";
+			}
 		}
+		next;
 	}
 
+	open(FILE,">$file") || die("Cannot write to $file: $!");
+
+	my $dups = 0;
+	foreach (@file_lines) {
+		if (m/^\s*#\s*include\s*[<"](\S*)[>"]/o) {
+			foreach $filename (keys %includedfiles) {
+				if ($1 eq $filename) {
+					if ($includedfiles{$filename} > 1) {
+						$includedfiles{$filename}--;
+						$dups++;
+					} else {
+						print FILE $_;
+					}
+				}
+			}
+		} else {
+			print FILE $_;
+		}
+	}
+	if ($dups > 0) {
+		print "$file: removed $dups duplicate includes\n";
+	}
 	close(FILE);
 }
diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c
index 3baaaecd6b1..9960d1c303f 100644
--- a/scripts/kconfig/conf.c
+++ b/scripts/kconfig/conf.c
@@ -38,14 +38,14 @@ static int conf_cnt;
 static char line[128];
 static struct menu *rootEntry;
 
-static char nohelp_text[] = N_("Sorry, no help available for this option yet.\n");
-
-static const char *get_help(struct menu *menu)
+static void print_help(struct menu *menu)
 {
-	if (menu_has_help(menu))
-		return _(menu_get_help(menu));
-	else
-		return nohelp_text;
+	struct gstr help = str_new();
+
+	menu_get_ext_help(menu, &help);
+
+	printf("\n%s\n", str_get(&help));
+	str_free(&help);
 }
 
 static void strip(char *str)
@@ -121,7 +121,7 @@ static int conf_askvalue(struct symbol *sym, const char *def)
 	return 1;
 }
 
-int conf_string(struct menu *menu)
+static int conf_string(struct menu *menu)
 {
 	struct symbol *sym = menu->sym;
 	const char *def;
@@ -140,7 +140,7 @@ int conf_string(struct menu *menu)
 		case '?':
 			/* print help */
 			if (line[1] == '\n') {
-				printf("\n%s\n", get_help(menu));
+				print_help(menu);
 				def = NULL;
 				break;
 			}
@@ -220,7 +220,7 @@ static int conf_sym(struct menu *menu)
 		if (sym_set_tristate_value(sym, newval))
 			return 0;
 help:
-		printf("\n%s\n", get_help(menu));
+		print_help(menu);
 	}
 }
 
@@ -307,7 +307,7 @@ static int conf_choice(struct menu *menu)
 			fgets(line, 128, stdin);
 			strip(line);
 			if (line[0] == '?') {
-				printf("\n%s\n", get_help(menu));
+				print_help(menu);
 				continue;
 			}
 			if (!line[0])
@@ -331,7 +331,7 @@ static int conf_choice(struct menu *menu)
 		if (!child)
 			continue;
 		if (line[strlen(line) - 1] == '?') {
-			printf("\n%s\n", get_help(child));
+			print_help(child);
 			continue;
 		}
 		sym_set_choice_value(sym, child->sym);
diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c
index a04da3459f0..b55e72ff2fc 100644
--- a/scripts/kconfig/confdata.c
+++ b/scripts/kconfig/confdata.c
@@ -560,7 +560,7 @@ int conf_write(const char *name)
 	return 0;
 }
 
-int conf_split_config(void)
+static int conf_split_config(void)
 {
 	const char *name;
 	char path[128];
diff --git a/scripts/kconfig/expr.c b/scripts/kconfig/expr.c
index 579ece4fa58..edd3f39a080 100644
--- a/scripts/kconfig/expr.c
+++ b/scripts/kconfig/expr.c
@@ -348,7 +348,7 @@ struct expr *expr_trans_bool(struct expr *e)
 /*
  * e1 || e2 -> ?
  */
-struct expr *expr_join_or(struct expr *e1, struct expr *e2)
+static struct expr *expr_join_or(struct expr *e1, struct expr *e2)
 {
 	struct expr *tmp;
 	struct symbol *sym1, *sym2;
@@ -412,7 +412,7 @@ struct expr *expr_join_or(struct expr *e1, struct expr *e2)
 	return NULL;
 }
 
-struct expr *expr_join_and(struct expr *e1, struct expr *e2)
+static struct expr *expr_join_and(struct expr *e1, struct expr *e2)
 {
 	struct expr *tmp;
 	struct symbol *sym1, *sym2;
@@ -1098,6 +1098,8 @@ void expr_fprint(struct expr *e, FILE *out)
 static void expr_print_gstr_helper(void *data, struct symbol *sym, const char *str)
 {
 	str_append((struct gstr*)data, str);
+	if (sym)
+		str_printf((struct gstr*)data, " [=%s]", sym_get_string_value(sym));
 }
 
 void expr_gstr_print(struct expr *e, struct gstr *gs)
diff --git a/scripts/kconfig/gconf.c b/scripts/kconfig/gconf.c
index 199b22bb49e..65464366fe3 100644
--- a/scripts/kconfig/gconf.c
+++ b/scripts/kconfig/gconf.c
@@ -456,19 +456,9 @@ static void text_insert_help(struct menu *menu)
 	GtkTextBuffer *buffer;
 	GtkTextIter start, end;
 	const char *prompt = _(menu_get_prompt(menu));
-	gchar *name;
-	const char *help;
+	struct gstr help = str_new();
 
-	help = menu_get_help(menu);
-
-	/* Gettextize if the help text not empty */
-	if ((help != 0) && (help[0] != 0))
-		help = _(help);
-
-	if (menu->sym && menu->sym->name)
-		name = g_strdup_printf(menu->sym->name);
-	else
-		name = g_strdup("");
+	menu_get_ext_help(menu, &help);
 
 	buffer = gtk_text_view_get_buffer(GTK_TEXT_VIEW(text_w));
 	gtk_text_buffer_get_bounds(buffer, &start, &end);
@@ -478,14 +468,11 @@ static void text_insert_help(struct menu *menu)
 	gtk_text_buffer_get_end_iter(buffer, &end);
 	gtk_text_buffer_insert_with_tags(buffer, &end, prompt, -1, tag1,
 					 NULL);
-	gtk_text_buffer_insert_at_cursor(buffer, " ", 1);
-	gtk_text_buffer_get_end_iter(buffer, &end);
-	gtk_text_buffer_insert_with_tags(buffer, &end, name, -1, tag1,
-					 NULL);
 	gtk_text_buffer_insert_at_cursor(buffer, "\n\n", 2);
 	gtk_text_buffer_get_end_iter(buffer, &end);
-	gtk_text_buffer_insert_with_tags(buffer, &end, help, -1, tag2,
+	gtk_text_buffer_insert_with_tags(buffer, &end, str_get(&help), -1, tag2,
 					 NULL);
+	str_free(&help);
 }
 
 
diff --git a/scripts/kconfig/gconf.glade b/scripts/kconfig/gconf.glade
index 803233fdd6d..b1c86c19292 100644
--- a/scripts/kconfig/gconf.glade
+++ b/scripts/kconfig/gconf.glade
@@ -547,7 +547,7 @@
 		  <property name="headers_visible">True</property>
 		  <property name="rules_hint">False</property>
 		  <property name="reorderable">False</property>
-		  <property name="enable_search">True</property>
+		  <property name="enable_search">False</property>
 		  <signal name="cursor_changed" handler="on_treeview2_cursor_changed" last_modification_time="Sun, 12 Jan 2003 15:58:22 GMT"/>
 		  <signal name="button_press_event" handler="on_treeview1_button_press_event" last_modification_time="Sun, 12 Jan 2003 16:03:52 GMT"/>
 		  <signal name="key_press_event" handler="on_treeview2_key_press_event" last_modification_time="Sun, 12 Jan 2003 16:11:44 GMT"/>
@@ -582,7 +582,7 @@
 		      <property name="headers_visible">True</property>
 		      <property name="rules_hint">False</property>
 		      <property name="reorderable">False</property>
-		      <property name="enable_search">True</property>
+		      <property name="enable_search">False</property>
 		      <signal name="cursor_changed" handler="on_treeview2_cursor_changed" last_modification_time="Sun, 12 Jan 2003 15:57:55 GMT"/>
 		      <signal name="button_press_event" handler="on_treeview2_button_press_event" last_modification_time="Sun, 12 Jan 2003 15:57:58 GMT"/>
 		      <signal name="key_press_event" handler="on_treeview2_key_press_event" last_modification_time="Sun, 12 Jan 2003 15:58:01 GMT"/>
diff --git a/scripts/kconfig/kxgettext.c b/scripts/kconfig/kxgettext.c
index 8d9ce22b0fc..dcc3fcc0cc9 100644
--- a/scripts/kconfig/kxgettext.c
+++ b/scripts/kconfig/kxgettext.c
@@ -166,7 +166,7 @@ static int message__add(const char *msg, char *option, char *file, int lineno)
 	return rc;
 }
 
-void menu_build_message_list(struct menu *menu)
+static void menu_build_message_list(struct menu *menu)
 {
 	struct menu *child;
 
@@ -211,7 +211,7 @@ static void message__print_gettext_msgid_msgstr(struct message *self)
 	       "msgstr \"\"\n", self->msg);
 }
 
-void menu__xgettext(void)
+static void menu__xgettext(void)
 {
 	struct message *m = message__list;
 
diff --git a/scripts/kconfig/lkc_proto.h b/scripts/kconfig/lkc_proto.h
index 8e69461313d..ffeb532b2cf 100644
--- a/scripts/kconfig/lkc_proto.h
+++ b/scripts/kconfig/lkc_proto.h
@@ -17,6 +17,8 @@ P(menu_get_root_menu,struct menu *,(struct menu *menu));
 P(menu_get_parent_menu,struct menu *,(struct menu *menu));
 P(menu_has_help,bool,(struct menu *menu));
 P(menu_get_help,const char *,(struct menu *menu));
+P(get_symbol_str,void,(struct gstr *r, struct symbol *sym));
+P(menu_get_ext_help,void,(struct menu *menu, struct gstr *help));
 
 /* symbol.c */
 P(symbol_hash,struct symbol *,[SYMBOL_HASHSIZE]);
diff --git a/scripts/kconfig/mconf.c b/scripts/kconfig/mconf.c
index 25b60bc117f..d8295357358 100644
--- a/scripts/kconfig/mconf.c
+++ b/scripts/kconfig/mconf.c
@@ -199,8 +199,6 @@ inputbox_instructions_string[] = N_(
 setmod_text[] = N_(
 	"This feature depends on another which has been configured as a module.\n"
 	"As a result, this feature will be built as a module."),
-nohelp_text[] = N_(
-	"There is no help available for this kernel option.\n"),
 load_config_text[] = N_(
 	"Enter the name of the configuration file you wish to load.  "
 	"Accept the name shown to restore the configuration you "
@@ -284,66 +282,6 @@ static void show_textbox(const char *title, const char *text, int r, int c);
 static void show_helptext(const char *title, const char *text);
 static void show_help(struct menu *menu);
 
-static void get_prompt_str(struct gstr *r, struct property *prop)
-{
-	int i, j;
-	struct menu *submenu[8], *menu;
-
-	str_printf(r, _("Prompt: %s\n"), _(prop->text));
-	str_printf(r, _("  Defined at %s:%d\n"), prop->menu->file->name,
-		prop->menu->lineno);
-	if (!expr_is_yes(prop->visible.expr)) {
-		str_append(r, _("  Depends on: "));
-		expr_gstr_print(prop->visible.expr, r);
-		str_append(r, "\n");
-	}
-	menu = prop->menu->parent;
-	for (i = 0; menu != &rootmenu && i < 8; menu = menu->parent)
-		submenu[i++] = menu;
-	if (i > 0) {
-		str_printf(r, _("  Location:\n"));
-		for (j = 4; --i >= 0; j += 2) {
-			menu = submenu[i];
-			str_printf(r, "%*c-> %s", j, ' ', _(menu_get_prompt(menu)));
-			if (menu->sym) {
-				str_printf(r, " (%s [=%s])", menu->sym->name ?
-					menu->sym->name : _("<choice>"),
-					sym_get_string_value(menu->sym));
-			}
-			str_append(r, "\n");
-		}
-	}
-}
-
-static void get_symbol_str(struct gstr *r, struct symbol *sym)
-{
-	bool hit;
-	struct property *prop;
-
-	if (sym && sym->name)
-		str_printf(r, "Symbol: %s [=%s]\n", sym->name,
-		                                    sym_get_string_value(sym));
-	for_all_prompts(sym, prop)
-		get_prompt_str(r, prop);
-	hit = false;
-	for_all_properties(sym, prop, P_SELECT) {
-		if (!hit) {
-			str_append(r, "  Selects: ");
-			hit = true;
-		} else
-			str_printf(r, " && ");
-		expr_gstr_print(prop->expr, r);
-	}
-	if (hit)
-		str_append(r, "\n");
-	if (sym->rev_dep.expr) {
-		str_append(r, _("  Selected by: "));
-		expr_gstr_print(sym->rev_dep.expr, r);
-		str_append(r, "\n");
-	}
-	str_append(r, "\n\n");
-}
-
 static struct gstr get_relations_str(struct symbol **sym_arr)
 {
 	struct symbol *sym;
@@ -699,19 +637,9 @@ static void show_helptext(const char *title, const char *text)
 static void show_help(struct menu *menu)
 {
 	struct gstr help = str_new();
-	struct symbol *sym = menu->sym;
-
-	if (menu_has_help(menu))
-	{
-		if (sym->name) {
-			str_printf(&help, "CONFIG_%s:\n\n", sym->name);
-			str_append(&help, _(menu_get_help(menu)));
-			str_append(&help, "\n");
-		}
-	} else {
-		str_append(&help, nohelp_text);
-	}
-	get_symbol_str(&help, sym);
+
+	menu_get_ext_help(menu, &help);
+
 	show_helptext(_(menu_get_prompt(menu)), str_get(&help));
 	str_free(&help);
 }
diff --git a/scripts/kconfig/menu.c b/scripts/kconfig/menu.c
index 07ff8d105c9..059a2465c57 100644
--- a/scripts/kconfig/menu.c
+++ b/scripts/kconfig/menu.c
@@ -9,6 +9,9 @@
 #define LKC_DIRECT_LINK
 #include "lkc.h"
 
+static const char nohelp_text[] = N_(
+	"There is no help available for this kernel option.\n");
+
 struct menu rootmenu;
 static struct menu **last_entry_ptr;
 
@@ -74,7 +77,7 @@ void menu_end_menu(void)
 	current_menu = current_menu->parent;
 }
 
-struct expr *menu_check_dep(struct expr *e)
+static struct expr *menu_check_dep(struct expr *e)
 {
 	if (!e)
 		return e;
@@ -184,7 +187,7 @@ static int menu_range_valid_sym(struct symbol *sym, struct symbol *sym2)
 	       (sym2->type == S_UNKNOWN && sym_string_valid(sym, sym2->name));
 }
 
-void sym_check_prop(struct symbol *sym)
+static void sym_check_prop(struct symbol *sym)
 {
 	struct property *prop;
 	struct symbol *sym2;
@@ -451,3 +454,80 @@ const char *menu_get_help(struct menu *menu)
 	else
 		return "";
 }
+
+static void get_prompt_str(struct gstr *r, struct property *prop)
+{
+	int i, j;
+	struct menu *submenu[8], *menu;
+
+	str_printf(r, _("Prompt: %s\n"), _(prop->text));
+	str_printf(r, _("  Defined at %s:%d\n"), prop->menu->file->name,
+		prop->menu->lineno);
+	if (!expr_is_yes(prop->visible.expr)) {
+		str_append(r, _("  Depends on: "));
+		expr_gstr_print(prop->visible.expr, r);
+		str_append(r, "\n");
+	}
+	menu = prop->menu->parent;
+	for (i = 0; menu != &rootmenu && i < 8; menu = menu->parent)
+		submenu[i++] = menu;
+	if (i > 0) {
+		str_printf(r, _("  Location:\n"));
+		for (j = 4; --i >= 0; j += 2) {
+			menu = submenu[i];
+			str_printf(r, "%*c-> %s", j, ' ', _(menu_get_prompt(menu)));
+			if (menu->sym) {
+				str_printf(r, " (%s [=%s])", menu->sym->name ?
+					menu->sym->name : _("<choice>"),
+					sym_get_string_value(menu->sym));
+			}
+			str_append(r, "\n");
+		}
+	}
+}
+
+void get_symbol_str(struct gstr *r, struct symbol *sym)
+{
+	bool hit;
+	struct property *prop;
+
+	if (sym && sym->name)
+		str_printf(r, "Symbol: %s [=%s]\n", sym->name,
+			   sym_get_string_value(sym));
+	for_all_prompts(sym, prop)
+		get_prompt_str(r, prop);
+	hit = false;
+	for_all_properties(sym, prop, P_SELECT) {
+		if (!hit) {
+			str_append(r, "  Selects: ");
+			hit = true;
+		} else
+			str_printf(r, " && ");
+		expr_gstr_print(prop->expr, r);
+	}
+	if (hit)
+		str_append(r, "\n");
+	if (sym->rev_dep.expr) {
+		str_append(r, _("  Selected by: "));
+		expr_gstr_print(sym->rev_dep.expr, r);
+		str_append(r, "\n");
+	}
+	str_append(r, "\n\n");
+}
+
+void menu_get_ext_help(struct menu *menu, struct gstr *help)
+{
+	struct symbol *sym = menu->sym;
+
+	if (menu_has_help(menu)) {
+		if (sym->name) {
+			str_printf(help, "CONFIG_%s:\n\n", sym->name);
+			str_append(help, _(menu_get_help(menu)));
+			str_append(help, "\n");
+		}
+	} else {
+		str_append(help, nohelp_text);
+	}
+	if (sym)
+		get_symbol_str(help, sym);
+}
diff --git a/scripts/kconfig/qconf.cc b/scripts/kconfig/qconf.cc
index ce7d508c752..00c51507cfc 100644
--- a/scripts/kconfig/qconf.cc
+++ b/scripts/kconfig/qconf.cc
@@ -1042,12 +1042,10 @@ void ConfigInfoView::menuInfo(void)
 		if (showDebug())
 			debug = debug_info(sym);
 
-		help = menu_get_help(menu);
-		/* Gettextize if the help text not empty */
-		if (help.isEmpty())
-			help = print_filter(menu_get_help(menu));
-		else
-			help = print_filter(_(menu_get_help(menu)));
+		struct gstr help_gstr = str_new();
+		menu_get_ext_help(menu, &help_gstr);
+		help = print_filter(str_get(&help_gstr));
+		str_free(&help_gstr);
 	} else if (menu->prompt) {
 		head += "<big><b>";
 		head += print_filter(_(menu->prompt->text));
diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c
index 18f3e5c3363..6c8fbbb66eb 100644
--- a/scripts/kconfig/symbol.c
+++ b/scripts/kconfig/symbol.c
@@ -36,7 +36,7 @@ tristate modules_val;
 
 struct expr *sym_env_list;
 
-void sym_add_default(struct symbol *sym, const char *def)
+static void sym_add_default(struct symbol *sym, const char *def)
 {
 	struct property *prop = prop_alloc(P_DEFAULT, sym);
 
@@ -125,7 +125,7 @@ struct property *sym_get_default_prop(struct symbol *sym)
 	return NULL;
 }
 
-struct property *sym_get_range_prop(struct symbol *sym)
+static struct property *sym_get_range_prop(struct symbol *sym)
 {
 	struct property *prop;
 
@@ -943,7 +943,7 @@ const char *prop_get_type_name(enum prop_type type)
 	return "unknown";
 }
 
-void prop_add_env(const char *env)
+static void prop_add_env(const char *env)
 {
 	struct symbol *sym, *sym2;
 	struct property *prop;
diff --git a/scripts/markup_oops.pl b/scripts/markup_oops.pl
index 89774011965..5f0fcb712e2 100644
--- a/scripts/markup_oops.pl
+++ b/scripts/markup_oops.pl
@@ -184,10 +184,7 @@ if ($target eq "0") {
 
 # if it's a module, we need to find the .ko file and calculate a load offset
 if ($module ne "") {
-	my $dir = dirname($filename);
-	$dir = $dir . "/";
-	my $mod = $module . ".ko";
-	my $modulefile = `find $dir -name $mod | head -1`;
+	my $modulefile = `modinfo $module | grep '^filename:' | awk '{ print \$2 }'`;
 	chomp($modulefile);
 	$filename = $modulefile;
 	if ($filename eq "") {
diff --git a/scripts/tags.sh b/scripts/tags.sh
index 4a34ec591e8..d52f7a01557 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -101,7 +101,8 @@ exuberant()
 	-I ____cacheline_aligned_in_smp                         \
 	-I ____cacheline_internodealigned_in_smp                \
 	-I EXPORT_SYMBOL,EXPORT_SYMBOL_GPL                      \
-	--extra=+f --c-kinds=+px                                \
+	-I DEFINE_TRACE,EXPORT_TRACEPOINT_SYMBOL,EXPORT_TRACEPOINT_SYMBOL_GPL \
+	--extra=+f --c-kinds=-px                                \
 	--regex-asm='/^ENTRY\(([^)]*)\).*/\1/'                  \
 	--regex-c='/^SYSCALL_DEFINE[[:digit:]]?\(([^,)]*).*/sys_\1/'
 
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index b8186bac8b7..6cf8fd2b79e 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -61,7 +61,8 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
 struct cgroup_subsys devices_subsys;
 
 static int devcgroup_can_attach(struct cgroup_subsys *ss,
-		struct cgroup *new_cgroup, struct task_struct *task)
+		struct cgroup *new_cgroup, struct task_struct *task,
+		bool threadgroup)
 {
 	if (current != task && !capable(CAP_SYS_ADMIN))
 			return -EPERM;
diff --git a/security/keys/gc.c b/security/keys/gc.c
index 485fc6233c3..4770be375ff 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -169,9 +169,9 @@ static void key_garbage_collector(struct work_struct *work)
 
 	/* trawl through the keys looking for keyrings */
 	for (;;) {
-		if (key->expiry > now && key->expiry < new_timer) {
+		if (key->expiry > limit && key->expiry < new_timer) {
 			kdebug("will expire %x in %ld",
-			       key_serial(key), key->expiry - now);
+			       key_serial(key), key->expiry - limit);
 			new_timer = key->expiry;
 		}
 
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 500aad0ebd6..3bb90b6f1dd 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -187,7 +187,7 @@ static inline void print_ipv6_addr(struct audit_buffer *ab,
 				   char *name1, char *name2)
 {
 	if (!ipv6_addr_any(addr))
-		audit_log_format(ab, " %s=%pI6", name1, addr);
+		audit_log_format(ab, " %s=%pI6c", name1, addr);
 	if (port)
 		audit_log_format(ab, " %s=%d", name2, ntohs(port));
 }
diff --git a/security/min_addr.c b/security/min_addr.c
index 14cc7b3b8d0..c844eed7915 100644
--- a/security/min_addr.c
+++ b/security/min_addr.c
@@ -28,12 +28,12 @@ static void update_mmap_min_addr(void)
  * sysctl handler which just sets dac_mmap_min_addr = the new value and then
  * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly
  */
-int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp,
+int mmap_min_addr_handler(struct ctl_table *table, int write,
 			  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 
 	update_mmap_min_addr();
 
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 1ed0f076aad..b4b5da1c0a4 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -868,8 +868,19 @@ u32 avc_policy_seqno(void)
 
 void avc_disable(void)
 {
-	avc_flush();
-	synchronize_rcu();
-	if (avc_node_cachep)
-		kmem_cache_destroy(avc_node_cachep);
+	/*
+	 * If you are looking at this because you have realized that we are
+	 * not destroying the avc_node_cachep it might be easy to fix, but
+	 * I don't know the memory barrier semantics well enough to know.  It's
+	 * possible that some other task dereferenced security_ops when
+	 * it still pointed to selinux operations.  If that is the case it's
+	 * possible that it is about to use the avc and is about to need the
+	 * avc_node_cachep.  I know I could wrap the security.c security_ops call
+	 * in an rcu_lock, but seriously, it's not worth it.  Instead I just flush
+	 * the cache and get that memory back.
+	 */
+	if (avc_node_cachep) {
+		avc_flush();
+		/* kmem_cache_destroy(avc_node_cachep); */
+	}
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 417f7c99452..bb230d5d708 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2411,7 +2411,7 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 	/* Wake up the parent if it is waiting so that it can recheck
 	 * wait permission to the new task SID. */
 	read_lock(&tasklist_lock);
-	wake_up_interruptible(&current->real_parent->signal->wait_chldexit);
+	__wake_up_parent(current, current->real_parent);
 	read_unlock(&tasklist_lock);
 }
 
diff --git a/usr/.gitignore b/usr/.gitignore
index 69b2e89fa16..8e48117a3f3 100644
--- a/usr/.gitignore
+++ b/usr/.gitignore
@@ -4,5 +4,7 @@
 gen_init_cpio
 initramfs_data.cpio
 initramfs_data.cpio.gz
+initramfs_data.cpio.bz2
+initramfs_data.cpio.lzma
 initramfs_list
 include
diff --git a/usr/Makefile b/usr/Makefile
index 245145a99c1..1e6a9e4a72c 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -6,7 +6,7 @@ klibcdirs:;
 PHONY += klibcdirs
 
 
-# Gzip, but no bzip2
+# Gzip
 suffix_$(CONFIG_INITRAMFS_COMPRESSION_GZIP)   = .gz
 
 # Bzip2
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 897bff3b7df..034a798b043 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -738,8 +738,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 	bool called = true;
 	struct kvm_vcpu *vcpu;
 
-	if (alloc_cpumask_var(&cpus, GFP_ATOMIC))
-		cpumask_clear(cpus);
+	zalloc_cpumask_var(&cpus, GFP_ATOMIC);
 
 	spin_lock(&kvm->requests_lock);
 	me = smp_processor_id();