77 files changed, 1749 insertions, 1192 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 3031e3233dd..2a983d49d19 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -45,7 +45,7 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
 	struct v9fs_dentry *dent;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n",
-					fid->fid, dentry->d_iname);
+					fid->fid, dentry->d_name.name);
 
 	dent = dentry->d_fsdata;
 	if (!dent) {
@@ -79,7 +79,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
 	struct p9_fid *fid, *ret;
 
 	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
-		dentry->d_iname, dentry, uid, any);
+		dentry->d_name.name, dentry, uid, any);
 	dent = (struct v9fs_dentry *) dentry->d_fsdata;
 	ret = NULL;
 	if (dent) {
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 24eb01087b6..332b5ff02fe 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -160,7 +160,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
 				v9ses->flags |= V9FS_ACCESS_ANY;
 			else {
 				v9ses->flags |= V9FS_ACCESS_SINGLE;
-				v9ses->uid = simple_strtol(s, &e, 10);
+				v9ses->uid = simple_strtoul(s, &e, 10);
 				if (*e != '\0')
 					v9ses->uid = ~0;
 			}
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f9534f18df0..06dcc7c4f23 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -52,7 +52,8 @@
 
 static int v9fs_dentry_delete(struct dentry *dentry)
 {
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
+									dentry);
 
 	return 1;
 }
@@ -69,7 +70,8 @@ static int v9fs_dentry_delete(struct dentry *dentry)
 static int v9fs_cached_dentry_delete(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
+									dentry);
 
 	if(!inode)
 		return 1;
@@ -88,7 +90,8 @@ void v9fs_dentry_release(struct dentry *dentry)
 	struct v9fs_dentry *dent;
 	struct p9_fid *temp, *current_fid;
 
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
+									dentry);
 	dent = dentry->d_fsdata;
 	if (dent) {
 		list_for_each_entry_safe(current_fid, temp, &dent->fidlist,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8314d3f43b7..2dfcf5487ef 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -963,7 +963,8 @@ static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
 	if (buflen > PATH_MAX)
 		buflen = PATH_MAX;
 
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
+									dentry);
 
 	retval = v9fs_readlink(dentry, link, buflen);
 
@@ -1022,7 +1023,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 {
 	char *s = nd_get_link(nd);
 
-	P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name, s);
+	P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name,
+		IS_ERR(s) ? "<error>" : s);
 	if (!IS_ERR(s))
 		__putname(s);
 }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index db831efbdbb..99e0ae1a4c7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1135,12 +1135,15 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	if (res)
 		return res;
 
-	if (!(filp->f_mode & FMODE_EXCL))
-		return 0;
+	if (filp->f_mode & FMODE_EXCL) {
+		res = bd_claim(bdev, filp);
+		if (res)
+			goto out_blkdev_put;
+	}
 
-	if (!(res = bd_claim(bdev, filp)))
-		return 0;
+	return 0;
 
+ out_blkdev_put:
 	blkdev_put(bdev, filp->f_mode);
 	return res;
 }
@@ -1203,8 +1206,16 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
 	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 	fmode_t mode = file->f_mode;
+
+	/*
+	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
+	 * to updated it before every ioctl.
+	 */
 	if (file->f_flags & O_NDELAY)
-		mode |= FMODE_NDELAY_NOW;
+		mode |= FMODE_NDELAY;
+	else
+		mode &= ~FMODE_NDELAY;
+
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
diff --git a/fs/buffer.c b/fs/buffer.c
index 6569fda5cfe..10179cfa115 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -878,6 +878,7 @@ void invalidate_inode_buffers(struct inode *inode)
 		spin_unlock(&buffer_mapping->private_lock);
 	}
 }
+EXPORT_SYMBOL(invalidate_inode_buffers);
 
 /*
  * Remove any clean buffers from the inode's buffer list.  This is called
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 8855331b2fb..e078b7aea14 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -8,7 +8,11 @@ handling fcntl(F_SETLEASE).  Convert cifs to using blocking tcp
 sends, and also let tcp autotune the socket send and receive buffers.
 This reduces the number of EAGAIN errors returned by TCP/IP in
 high stress workloads (and the number of retries on socket writes
-when sending large SMBWriteX requests).
+when sending large SMBWriteX requests).  Fix case in which a portion of
+data can in some cases not get written to the file on the server before the
+file is closed.  Fix DFS parsing to properly handle path consumed field,
+and to handle certain codepage conversions better.  Fix mount and
+umount race that can cause oops in mount or umount or reconnect.
 
 Version 1.54
 ------------
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 69a12aae91d..490e34bbf27 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -107,12 +107,13 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 #ifdef CONFIG_PROC_FS
 static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 {
-	struct list_head *tmp;
-	struct list_head *tmp1;
+	struct list_head *tmp1, *tmp2, *tmp3;
 	struct mid_q_entry *mid_entry;
+	struct TCP_Server_Info *server;
 	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
-	int i;
+	int i, j;
+	__u32 dev_type;
 
 	seq_puts(m,
 		    "Display Internal CIFS Data Structures for Debugging\n"
@@ -122,46 +123,78 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 	seq_printf(m, "Servers:");
 
 	i = 0;
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalSMBSessionList) {
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp1, &cifs_tcp_ses_list) {
+		server = list_entry(tmp1, struct TCP_Server_Info,
+				    tcp_ses_list);
 		i++;
-		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
-		if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) ||
-		   (ses->serverNOS == NULL)) {
-			seq_printf(m, "\nentry for %s not fully "
-					"displayed\n\t", ses->serverName);
-		} else {
-			seq_printf(m,
-				    "\n%d) Name: %s  Domain: %s Mounts: %d OS:"
-				    " %s  \n\tNOS: %s\tCapability: 0x%x\n\tSMB"
+		list_for_each(tmp2, &server->smb_ses_list) {
+			ses = list_entry(tmp2, struct cifsSesInfo,
+					 smb_ses_list);
+			if ((ses->serverDomain == NULL) ||
+				(ses->serverOS == NULL) ||
+				(ses->serverNOS == NULL)) {
+				seq_printf(m, "\n%d) entry for %s not fully "
+					   "displayed\n\t", i, ses->serverName);
+			} else {
+				seq_printf(m,
+				    "\n%d) Name: %s  Domain: %s Uses: %d OS:"
+				    " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB"
 				    " session status: %d\t",
 				i, ses->serverName, ses->serverDomain,
-				atomic_read(&ses->inUse),
-				ses->serverOS, ses->serverNOS,
+				ses->ses_count, ses->serverOS, ses->serverNOS,
 				ses->capabilities, ses->status);
-		}
-		if (ses->server) {
+			}
 			seq_printf(m, "TCP status: %d\n\tLocal Users To "
-				    "Server: %d SecMode: 0x%x Req On Wire: %d",
-				ses->server->tcpStatus,
-				atomic_read(&ses->server->socketUseCount),
-				ses->server->secMode,
-				atomic_read(&ses->server->inFlight));
+				   "Server: %d SecMode: 0x%x Req On Wire: %d",
+				   server->tcpStatus, server->srv_count,
+				   server->secMode,
+				   atomic_read(&server->inFlight));
 
 #ifdef CONFIG_CIFS_STATS2
 			seq_printf(m, " In Send: %d In MaxReq Wait: %d",
-				atomic_read(&ses->server->inSend),
-				atomic_read(&ses->server->num_waiters));
+				atomic_read(&server->inSend),
+				atomic_read(&server->num_waiters));
 #endif
 
-			seq_puts(m, "\nMIDs:\n");
+			seq_puts(m, "\n\tShares:");
+			j = 0;
+			list_for_each(tmp3, &ses->tcon_list) {
+				tcon = list_entry(tmp3, struct cifsTconInfo,
+						  tcon_list);
+				++j;
+				dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
+				seq_printf(m, "\n\t%d) %s Mounts: %d ", j,
+					   tcon->treeName, tcon->tc_count);
+				if (tcon->nativeFileSystem) {
+					seq_printf(m, "Type: %s ",
+						   tcon->nativeFileSystem);
+				}
+				seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
+					"\nPathComponentMax: %d Status: 0x%d",
+					le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
+					le32_to_cpu(tcon->fsAttrInfo.Attributes),
+					le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
+					tcon->tidStatus);
+				if (dev_type == FILE_DEVICE_DISK)
+					seq_puts(m, " type: DISK ");
+				else if (dev_type == FILE_DEVICE_CD_ROM)
+					seq_puts(m, " type: CDROM ");
+				else
+					seq_printf(m, " type: %d ", dev_type);
+
+				if (tcon->need_reconnect)
+					seq_puts(m, "\tDISCONNECTED ");
+				seq_putc(m, '\n');
+			}
+
+			seq_puts(m, "\n\tMIDs:\n");
 
 			spin_lock(&GlobalMid_Lock);
-			list_for_each(tmp1, &ses->server->pending_mid_q) {
-				mid_entry = list_entry(tmp1, struct
-					mid_q_entry,
+			list_for_each(tmp3, &server->pending_mid_q) {
+				mid_entry = list_entry(tmp3, struct mid_q_entry,
 					qhead);
-				seq_printf(m, "State: %d com: %d pid:"
+				seq_printf(m, "\tState: %d com: %d pid:"
 						" %d tsk: %p mid %d\n",
 						mid_entry->midState,
 						(int)mid_entry->command,
@@ -171,44 +204,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 			}
 			spin_unlock(&GlobalMid_Lock);
 		}
-
-	}
-	read_unlock(&GlobalSMBSeslock);
-	seq_putc(m, '\n');
-
-	seq_puts(m, "Shares:");
-
-	i = 0;
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		__u32 dev_type;
-		i++;
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
-		seq_printf(m, "\n%d) %s Uses: %d ", i,
-				 tcon->treeName, atomic_read(&tcon->useCount));
-		if (tcon->nativeFileSystem) {
-			seq_printf(m, "Type: %s ",
-					 tcon->nativeFileSystem);
-		}
-		seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
-				 "\nPathComponentMax: %d Status: %d",
-			    le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
-			    le32_to_cpu(tcon->fsAttrInfo.Attributes),
-			    le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
-			    tcon->tidStatus);
-		if (dev_type == FILE_DEVICE_DISK)
-			seq_puts(m, " type: DISK ");
-		else if (dev_type == FILE_DEVICE_CD_ROM)
-			seq_puts(m, " type: CDROM ");
-		else
-			seq_printf(m, " type: %d ", dev_type);
-
-		if (tcon->tidStatus == CifsNeedReconnect)
-			seq_puts(m, "\tDISCONNECTED ");
 	}
-	read_unlock(&GlobalSMBSeslock);
-
+	read_unlock(&cifs_tcp_ses_lock);
 	seq_putc(m, '\n');
 
 	/* BB add code to dump additional info such as TCP session info now */
@@ -234,7 +231,9 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 {
 	char c;
 	int rc;
-	struct list_head *tmp;
+	struct list_head *tmp1, *tmp2, *tmp3;
+	struct TCP_Server_Info *server;
+	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
 
 	rc = get_user(c, buffer);
@@ -242,33 +241,42 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 		return rc;
 
 	if (c == '1' || c == 'y' || c == 'Y' || c == '0') {
-		read_lock(&GlobalSMBSeslock);
 #ifdef CONFIG_CIFS_STATS2
 		atomic_set(&totBufAllocCount, 0);
 		atomic_set(&totSmBufAllocCount, 0);
 #endif /* CONFIG_CIFS_STATS2 */
-		list_for_each(tmp, &GlobalTreeConnectionList) {
-			tcon = list_entry(tmp, struct cifsTconInfo,
-					cifsConnectionList);
-			atomic_set(&tcon->num_smbs_sent, 0);
-			atomic_set(&tcon->num_writes, 0);
-			atomic_set(&tcon->num_reads, 0);
-			atomic_set(&tcon->num_oplock_brks, 0);
-			atomic_set(&tcon->num_opens, 0);
-			atomic_set(&tcon->num_closes, 0);
-			atomic_set(&tcon->num_deletes, 0);
-			atomic_set(&tcon->num_mkdirs, 0);
-			atomic_set(&tcon->num_rmdirs, 0);
-			atomic_set(&tcon->num_renames, 0);
-			atomic_set(&tcon->num_t2renames, 0);
-			atomic_set(&tcon->num_ffirst, 0);
-			atomic_set(&tcon->num_fnext, 0);
-			atomic_set(&tcon->num_fclose, 0);
-			atomic_set(&tcon->num_hardlinks, 0);
-			atomic_set(&tcon->num_symlinks, 0);
-			atomic_set(&tcon->num_locks, 0);
+		read_lock(&cifs_tcp_ses_lock);
+		list_for_each(tmp1, &cifs_tcp_ses_list) {
+			server = list_entry(tmp1, struct TCP_Server_Info,
+					    tcp_ses_list);
+			list_for_each(tmp2, &server->smb_ses_list) {
+				ses = list_entry(tmp2, struct cifsSesInfo,
+						 smb_ses_list);
+				list_for_each(tmp3, &ses->tcon_list) {
+					tcon = list_entry(tmp3,
+							  struct cifsTconInfo,
+							  tcon_list);
+					atomic_set(&tcon->num_smbs_sent, 0);
+					atomic_set(&tcon->num_writes, 0);
+					atomic_set(&tcon->num_reads, 0);
+					atomic_set(&tcon->num_oplock_brks, 0);
+					atomic_set(&tcon->num_opens, 0);
+					atomic_set(&tcon->num_closes, 0);
+					atomic_set(&tcon->num_deletes, 0);
+					atomic_set(&tcon->num_mkdirs, 0);
+					atomic_set(&tcon->num_rmdirs, 0);
+					atomic_set(&tcon->num_renames, 0);
+					atomic_set(&tcon->num_t2renames, 0);
+					atomic_set(&tcon->num_ffirst, 0);
+					atomic_set(&tcon->num_fnext, 0);
+					atomic_set(&tcon->num_fclose, 0);
+					atomic_set(&tcon->num_hardlinks, 0);
+					atomic_set(&tcon->num_symlinks, 0);
+					atomic_set(&tcon->num_locks, 0);
+				}
+			}
 		}
-		read_unlock(&GlobalSMBSeslock);
+		read_unlock(&cifs_tcp_ses_lock);
 	}
 
 	return count;
@@ -277,7 +285,9 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 static int cifs_stats_proc_show(struct seq_file *m, void *v)
 {
 	int i;
-	struct list_head *tmp;
+	struct list_head *tmp1, *tmp2, *tmp3;
+	struct TCP_Server_Info *server;
+	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
 
 	seq_printf(m,
@@ -306,44 +316,55 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 		GlobalCurrentXid, GlobalMaxActiveXid);
 
 	i = 0;
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		i++;
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		seq_printf(m, "\n%d) %s", i, tcon->treeName);
-		if (tcon->tidStatus == CifsNeedReconnect)
-			seq_puts(m, "\tDISCONNECTED ");
-		seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
-			atomic_read(&tcon->num_smbs_sent),
-			atomic_read(&tcon->num_oplock_brks));
-		seq_printf(m, "\nReads:  %d Bytes: %lld",
-			atomic_read(&tcon->num_reads),
-			(long long)(tcon->bytes_read));
-		seq_printf(m, "\nWrites: %d Bytes: %lld",
-			atomic_read(&tcon->num_writes),
-			(long long)(tcon->bytes_written));
-		seq_printf(m,
-			"\nLocks: %d HardLinks: %d Symlinks: %d",
-			atomic_read(&tcon->num_locks),
-			atomic_read(&tcon->num_hardlinks),
-			atomic_read(&tcon->num_symlinks));
-
-		seq_printf(m, "\nOpens: %d Closes: %d Deletes: %d",
-			atomic_read(&tcon->num_opens),
-			atomic_read(&tcon->num_closes),
-			atomic_read(&tcon->num_deletes));
-		seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
-			atomic_read(&tcon->num_mkdirs),
-			atomic_read(&tcon->num_rmdirs));
-		seq_printf(m, "\nRenames: %d T2 Renames %d",
-			atomic_read(&tcon->num_renames),
-			atomic_read(&tcon->num_t2renames));
-		seq_printf(m, "\nFindFirst: %d FNext %d FClose %d",
-			atomic_read(&tcon->num_ffirst),
-			atomic_read(&tcon->num_fnext),
-			atomic_read(&tcon->num_fclose));
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp1, &cifs_tcp_ses_list) {
+		server = list_entry(tmp1, struct TCP_Server_Info,
+				    tcp_ses_list);
+		list_for_each(tmp2, &server->smb_ses_list) {
+			ses = list_entry(tmp2, struct cifsSesInfo,
+					 smb_ses_list);
+			list_for_each(tmp3, &ses->tcon_list) {
+				tcon = list_entry(tmp3,
+						  struct cifsTconInfo,
+						  tcon_list);
+				i++;
+				seq_printf(m, "\n%d) %s", i, tcon->treeName);
+				if (tcon->need_reconnect)
+					seq_puts(m, "\tDISCONNECTED ");
+				seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
+					atomic_read(&tcon->num_smbs_sent),
+					atomic_read(&tcon->num_oplock_brks));
+				seq_printf(m, "\nReads:  %d Bytes: %lld",
+					atomic_read(&tcon->num_reads),
+					(long long)(tcon->bytes_read));
+				seq_printf(m, "\nWrites: %d Bytes: %lld",
+					atomic_read(&tcon->num_writes),
+					(long long)(tcon->bytes_written));
+				seq_printf(m, "\nLocks: %d HardLinks: %d "
+					      "Symlinks: %d",
+					atomic_read(&tcon->num_locks),
+					atomic_read(&tcon->num_hardlinks),
+					atomic_read(&tcon->num_symlinks));
+				seq_printf(m, "\nOpens: %d Closes: %d"
+					      "Deletes: %d",
+					atomic_read(&tcon->num_opens),
+					atomic_read(&tcon->num_closes),
+					atomic_read(&tcon->num_deletes));
+				seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
+					atomic_read(&tcon->num_mkdirs),
+					atomic_read(&tcon->num_rmdirs));
+				seq_printf(m, "\nRenames: %d T2 Renames %d",
+					atomic_read(&tcon->num_renames),
+					atomic_read(&tcon->num_t2renames));
+				seq_printf(m, "\nFindFirst: %d FNext %d "
+					      "FClose %d",
+					atomic_read(&tcon->num_ffirst),
+					atomic_read(&tcon->num_fnext),
+					atomic_read(&tcon->num_fclose));
+			}
+		}
 	}
-	read_unlock(&GlobalSMBSeslock);
+	read_unlock(&cifs_tcp_ses_lock);
 
 	seq_putc(m, '\n');
 	return 0;
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d2c8eef84f3..e1c18362ba4 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -106,7 +106,8 @@ static char *cifs_get_share_name(const char *node_name)
 /**
  * compose_mount_options	-	creates mount options for refferral
  * @sb_mountdata:	parent/root DFS mount options (template)
- * @ref_unc:		refferral server UNC
+ * @dentry:		point where we are going to mount
+ * @ref:		server's referral
  * @devname:		pointer for saving device name
  *
  * creates mount options for submount based on template options sb_mountdata
@@ -116,7 +117,8 @@ static char *cifs_get_share_name(const char *node_name)
  * Caller is responcible for freeing retunrned value if it is not error.
  */
 static char *compose_mount_options(const char *sb_mountdata,
-				   const char *ref_unc,
+				   struct dentry *dentry,
+				   const struct dfs_info3_param *ref,
 				   char **devname)
 {
 	int rc;
@@ -126,11 +128,12 @@ static char *compose_mount_options(const char *sb_mountdata,
 	char *srvIP = NULL;
 	char sep = ',';
 	int off, noff;
+	char *fullpath;
 
 	if (sb_mountdata == NULL)
 		return ERR_PTR(-EINVAL);
 
-	*devname = cifs_get_share_name(ref_unc);
+	*devname = cifs_get_share_name(ref->node_name);
 	rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
 	if (rc != 0) {
 		cERROR(1, ("%s: Failed to resolve server part of %s to IP",
@@ -138,7 +141,12 @@ static char *compose_mount_options(const char *sb_mountdata,
 		mountdata = ERR_PTR(rc);
 		goto compose_mount_options_out;
 	}
-	md_len = strlen(sb_mountdata) + strlen(srvIP) + strlen(ref_unc) + 3;
+	/* md_len = strlen(...) + 12 for 'sep+prefixpath='
+	 * assuming that we have 'unc=' and 'ip=' in
+	 * the original sb_mountdata
+	 */
+	md_len = strlen(sb_mountdata) + strlen(srvIP) +
+		strlen(ref->node_name) + 12;
 	mountdata = kzalloc(md_len+1, GFP_KERNEL);
 	if (mountdata == NULL) {
 		mountdata = ERR_PTR(-ENOMEM);
@@ -152,41 +160,56 @@ static char *compose_mount_options(const char *sb_mountdata,
 			strncpy(mountdata, sb_mountdata, 5);
 			off += 5;
 	}
-	while ((tkn_e = strchr(sb_mountdata+off, sep))) {
-		noff = (tkn_e - (sb_mountdata+off)) + 1;
-		if (strnicmp(sb_mountdata+off, "unc=", 4) == 0) {
+
+	do {
+		tkn_e = strchr(sb_mountdata + off, sep);
+		if (tkn_e == NULL)
+			noff = strlen(sb_mountdata + off);
+		else
+			noff = tkn_e - (sb_mountdata + off) + 1;
+
+		if (strnicmp(sb_mountdata + off, "unc=", 4) == 0) {
 			off += noff;
 			continue;
 		}
-		if (strnicmp(sb_mountdata+off, "ip=", 3) == 0) {
+		if (strnicmp(sb_mountdata + off, "ip=", 3) == 0) {
 			off += noff;
 			continue;
 		}
-		if (strnicmp(sb_mountdata+off, "prefixpath=", 3) == 0) {
+		if (strnicmp(sb_mountdata + off, "prefixpath=", 11) == 0) {
 			off += noff;
 			continue;
 		}
-		strncat(mountdata, sb_mountdata+off, noff);
+		strncat(mountdata, sb_mountdata + off, noff);
 		off += noff;
-	}
-	strcat(mountdata, sb_mountdata+off);
+	} while (tkn_e);
+	strcat(mountdata, sb_mountdata + off);
 	mountdata[md_len] = '\0';
 
 	/* copy new IP and ref share name */
-	strcat(mountdata, ",ip=");
+	if (mountdata[strlen(mountdata) - 1] != sep)
+		strncat(mountdata, &sep, 1);
+	strcat(mountdata, "ip=");
 	strcat(mountdata, srvIP);
-	strcat(mountdata, ",unc=");
+	strncat(mountdata, &sep, 1);
+	strcat(mountdata, "unc=");
 	strcat(mountdata, *devname);
 
 	/* find & copy prefixpath */
-	tkn_e = strchr(ref_unc+2, '\\');
-	if (tkn_e) {
-		tkn_e = strchr(tkn_e+1, '\\');
-		if (tkn_e) {
-			strcat(mountdata, ",prefixpath=");
-			strcat(mountdata, tkn_e+1);
-		}
+	tkn_e = strchr(ref->node_name + 2, '\\');
+	if (tkn_e == NULL) /* invalid unc, missing share name*/
+		goto compose_mount_options_out;
+
+	fullpath = build_path_from_dentry(dentry);
+	tkn_e = strchr(tkn_e + 1, '\\');
+	if (tkn_e || strlen(fullpath) - (ref->path_consumed)) {
+		strncat(mountdata, &sep, 1);
+		strcat(mountdata, "prefixpath=");
+		if (tkn_e)
+			strcat(mountdata, tkn_e + 1);
+		strcat(mountdata, fullpath + (ref->path_consumed));
 	}
+	kfree(fullpath);
 
 	/*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
 	/*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/
@@ -198,7 +221,7 @@ compose_mount_options_out:
 
 
 static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
-		struct dentry *dentry, char *ref_unc)
+		struct dentry *dentry, const struct dfs_info3_param *ref)
 {
 	struct cifs_sb_info *cifs_sb;
 	struct vfsmount *mnt;
@@ -207,7 +230,7 @@ static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
 
 	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
 	mountdata = compose_mount_options(cifs_sb->mountdata,
-						ref_unc, &devname);
+						dentry, ref, &devname);
 
 	if (IS_ERR(mountdata))
 		return (struct vfsmount *)mountdata;
@@ -310,7 +333,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 			}
 			mnt = cifs_dfs_do_refmount(nd->path.mnt,
 						nd->path.dentry,
-						referrals[i].node_name);
+						referrals + i);
 			cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p",
 					 __func__,
 					referrals[i].node_name, mnt));
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index fcee9298b62..0ab2fb5afef 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -73,8 +73,8 @@ struct key_type cifs_spnego_key_type = {
  * strlen(";sec=ntlmsspi") */
 #define MAX_MECH_STR_LEN	13
 
-/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
-#define MAX_IPV6_ADDR_LEN	42
+/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */
+#define MAX_IPV6_ADDR_LEN	43
 
 /* strlen of "host=" */
 #define HOST_KEY_LEN		5
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ac5915d61dc..d9cf467309e 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -514,10 +514,11 @@ static void cifs_umount_begin(struct super_block *sb)
 	tcon = cifs_sb->tcon;
 	if (tcon == NULL)
 		return;
-	down(&tcon->tconSem);
-	if (atomic_read(&tcon->useCount) == 1)
+
+	read_lock(&cifs_tcp_ses_lock);
+	if (tcon->tc_count == 1)
 		tcon->tidStatus = CifsExiting;
-	up(&tcon->tconSem);
+	read_unlock(&cifs_tcp_ses_lock);
 
 	/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
 	/* cancel_notify_requests(tcon); */
@@ -1013,7 +1014,7 @@ static int cifs_oplock_thread(void *dummyarg)
 				not bother sending an oplock release if session
 				to server still is disconnected since oplock
 				already released by the server in that case */
-			if (pTcon->tidStatus != CifsNeedReconnect) {
+			if (!pTcon->need_reconnect) {
 				rc = CIFSSMBLock(0, pTcon, netfid,
 						0 /* len */ , 0 /* offset */, 0,
 						0, LOCKING_ANDX_OPLOCK_RELEASE,
@@ -1031,24 +1032,24 @@ static int cifs_oplock_thread(void *dummyarg)
 static int cifs_dnotify_thread(void *dummyarg)
 {
 	struct list_head *tmp;
-	struct cifsSesInfo *ses;
+	struct TCP_Server_Info *server;
 
 	do {
 		if (try_to_freeze())
 			continue;
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(15*HZ);
-		read_lock(&GlobalSMBSeslock);
 		/* check if any stuck requests that need
 		   to be woken up and wakeq so the
 		   thread can wake up and error out */
-		list_for_each(tmp, &GlobalSMBSessionList) {
-			ses = list_entry(tmp, struct cifsSesInfo,
-				cifsSessionList);
-			if (ses->server && atomic_read(&ses->server->inFlight))
-				wake_up_all(&ses->server->response_q);
+		read_lock(&cifs_tcp_ses_lock);
+		list_for_each(tmp, &cifs_tcp_ses_list) {
+			server = list_entry(tmp, struct TCP_Server_Info,
+					 tcp_ses_list);
+			if (atomic_read(&server->inFlight))
+				wake_up_all(&server->response_q);
 		}
-		read_unlock(&GlobalSMBSeslock);
+		read_unlock(&cifs_tcp_ses_lock);
 	} while (!kthread_should_stop());
 
 	return 0;
@@ -1059,9 +1060,7 @@ init_cifs(void)
 {
 	int rc = 0;
 	cifs_proc_init();
-/*	INIT_LIST_HEAD(&GlobalServerList);*/	/* BB not implemented yet */
-	INIT_LIST_HEAD(&GlobalSMBSessionList);
-	INIT_LIST_HEAD(&GlobalTreeConnectionList);
+	INIT_LIST_HEAD(&cifs_tcp_ses_list);
 	INIT_LIST_HEAD(&GlobalOplock_Q);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	INIT_LIST_HEAD(&GlobalDnotifyReqList);
@@ -1089,6 +1088,7 @@ init_cifs(void)
 	GlobalMaxActiveXid = 0;
 	memset(Local_System_Name, 0, 15);
 	rwlock_init(&GlobalSMBSeslock);
+	rwlock_init(&cifs_tcp_ses_lock);
 	spin_lock_init(&GlobalMid_Lock);
 
 	if (cifs_max_pending < 2) {
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1cb1189f24e..c57c0565547 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -85,8 +85,7 @@ enum securityEnum {
 };
 
 enum protocolEnum {
-	IPV4 = 0,
-	IPV6,
+	TCP = 0,
 	SCTP
 	/* Netbios frames protocol not supported at this time */
 };
@@ -122,6 +121,9 @@ struct cifs_cred {
  */
 
 struct TCP_Server_Info {
+	struct list_head tcp_ses_list;
+	struct list_head smb_ses_list;
+	int srv_count; /* reference counter */
 	/* 15 character server name + 0x20 16th byte indicating type = srv */
 	char server_RFC1001_name[SERVER_NAME_LEN_WITH_NULL];
 	char unicode_server_Name[SERVER_NAME_LEN_WITH_NULL * 2];
@@ -143,7 +145,6 @@ struct TCP_Server_Info {
 	bool svlocal:1;			/* local server or remote */
 	bool noblocksnd;		/* use blocking sendmsg */
 	bool noautotune;		/* do not autotune send buf sizes */
-	atomic_t socketUseCount; /* number of open cifs sessions on socket */
 	atomic_t inFlight;  /* number of requests on the wire to server */
 #ifdef CONFIG_CIFS_STATS2
 	atomic_t inSend; /* requests trying to send */
@@ -194,13 +195,14 @@ struct cifsUidInfo {
  * Session structure.  One of these for each uid session with a particular host
  */
 struct cifsSesInfo {
-	struct list_head cifsSessionList;
+	struct list_head smb_ses_list;
+	struct list_head tcon_list;
 	struct semaphore sesSem;
 #if 0
 	struct cifsUidInfo *uidInfo;	/* pointer to user info */
 #endif
 	struct TCP_Server_Info *server;	/* pointer to server info */
-	atomic_t inUse; /* # of mounts (tree connections) on this ses */
+	int ses_count;		/* reference counter */
 	enum statusEnum status;
 	unsigned overrideSecFlg;  /* if non-zero override global sec flags */
 	__u16 ipc_tid;		/* special tid for connection to IPC share */
@@ -216,6 +218,7 @@ struct cifsSesInfo {
 	char userName[MAX_USERNAME_SIZE + 1];
 	char *domainName;
 	char *password;
+	bool need_reconnect:1; /* connection reset, uid now invalid */
 };
 /* no more than one of the following three session flags may be set */
 #define CIFS_SES_NT4 1
@@ -230,16 +233,15 @@ struct cifsSesInfo {
  * session
  */
 struct cifsTconInfo {
-	struct list_head cifsConnectionList;
+	struct list_head tcon_list;
+	int tc_count;
 	struct list_head openFileList;
-	struct semaphore tconSem;
 	struct cifsSesInfo *ses;	/* pointer to session associated with */
 	char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
 	char *nativeFileSystem;
 	__u16 tid;		/* The 2 byte tree id */
 	__u16 Flags;		/* optional support bits */
 	enum statusEnum tidStatus;
-	atomic_t useCount;	/* how many explicit/implicit mounts to share */
 #ifdef CONFIG_CIFS_STATS
 	atomic_t num_smbs_sent;
 	atomic_t num_writes;
@@ -288,6 +290,7 @@ struct cifsTconInfo {
 	bool unix_ext:1;  /* if false disable Linux extensions to CIFS protocol
 				for this mount even if server would support */
 	bool local_lease:1; /* check leases (only) on local system not remote */
+	bool need_reconnect:1; /* connection reset, tid now invalid */
 	/* BB add field for back pointer to sb struct(s)? */
 };
 
@@ -588,22 +591,30 @@ require use of the stronger protocol */
 #endif
 
 /*
- * The list of servers that did not respond with NT LM 0.12.
- * This list helps improve performance and eliminate the messages indicating
- * that we had a communications error talking to the server in this list.
+ * the list of TCP_Server_Info structures, ie each of the sockets
+ * connecting our client to a distinct server (ip address), is
+ * chained together by cifs_tcp_ses_list. The list of all our SMB
+ * sessions (and from that the tree connections) can be found
+ * by iterating over cifs_tcp_ses_list
  */
-/* Feature not supported */
-/* GLOBAL_EXTERN struct servers_not_supported *NotSuppList; */
+GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
 
 /*
- * The following is a hash table of all the users we know about.
+ * This lock protects the cifs_tcp_ses_list, the list of smb sessions per
+ * tcp session, and the list of tcon's per smb session. It also protects
+ * the reference counters for the server, smb session, and tcon. Finally,
+ * changes to the tcon->tidStatus should be done while holding this lock.
  */
-GLOBAL_EXTERN struct smbUidInfo *GlobalUidList[UID_HASH];
+GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
 
-/* GLOBAL_EXTERN struct list_head GlobalServerList; BB not implemented yet */
-GLOBAL_EXTERN struct list_head GlobalSMBSessionList;
-GLOBAL_EXTERN struct list_head GlobalTreeConnectionList;
-GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
+/*
+ * This lock protects the cifs_file->llist and cifs_file->flist
+ * list operations, and updates to some flags (cifs_file->invalidHandle)
+ * It will be moved to either use the tcon->stat_lock or equivalent later.
+ * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
+ * the cifs_tcp_ses_lock must be grabbed first and released last.
+ */
+GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
 
 GLOBAL_EXTERN struct list_head GlobalOplock_Q;
 
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d5eac48fc41..6d51696dc76 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -190,10 +190,10 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 		/* need to prevent multiple threads trying to
 		simultaneously reconnect the same SMB session */
 			down(&tcon->ses->sesSem);
-			if (tcon->ses->status == CifsNeedReconnect)
+			if (tcon->ses->need_reconnect)
 				rc = cifs_setup_session(0, tcon->ses,
 							nls_codepage);
-			if (!rc && (tcon->tidStatus == CifsNeedReconnect)) {
+			if (!rc && (tcon->need_reconnect)) {
 				mark_open_files_invalid(tcon);
 				rc = CIFSTCon(0, tcon->ses, tcon->treeName,
 					      tcon, nls_codepage);
@@ -337,10 +337,10 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 		/* need to prevent multiple threads trying to
 		simultaneously reconnect the same SMB session */
 			down(&tcon->ses->sesSem);
-			if (tcon->ses->status == CifsNeedReconnect)
+			if (tcon->ses->need_reconnect)
 				rc = cifs_setup_session(0, tcon->ses,
 							nls_codepage);
-			if (!rc && (tcon->tidStatus == CifsNeedReconnect)) {
+			if (!rc && (tcon->need_reconnect)) {
 				mark_open_files_invalid(tcon);
 				rc = CIFSTCon(0, tcon->ses, tcon->treeName,
 					      tcon, nls_codepage);
@@ -664,8 +664,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			rc = -EIO;
 			goto neg_err_exit;
 		}
-
-		if (server->socketUseCount.counter > 1) {
+		read_lock(&cifs_tcp_ses_lock);
+		if (server->srv_count > 1) {
+			read_unlock(&cifs_tcp_ses_lock);
 			if (memcmp(server->server_GUID,
 				   pSMBr->u.extended_response.
 				   GUID, 16) != 0) {
@@ -674,9 +675,11 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 					pSMBr->u.extended_response.GUID,
 					16);
 			}
-		} else
+		} else {
+			read_unlock(&cifs_tcp_ses_lock);
 			memcpy(server->server_GUID,
 			       pSMBr->u.extended_response.GUID, 16);
+		}
 
 		if (count == 16) {
 			server->secType = RawNTLMSSP;
@@ -739,50 +742,31 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 	int rc = 0;
 
 	cFYI(1, ("In tree disconnect"));
-	/*
-	 *  If last user of the connection and
-	 *  connection alive - disconnect it
-	 *  If this is the last connection on the server session disconnect it
-	 *  (and inside session disconnect we should check if tcp socket needs
-	 *  to be freed and kernel thread woken up).
-	 */
-	if (tcon)
-		down(&tcon->tconSem);
-	else
-		return -EIO;
 
-	atomic_dec(&tcon->useCount);
-	if (atomic_read(&tcon->useCount) > 0) {
-		up(&tcon->tconSem);
-		return -EBUSY;
-	}
+	/* BB: do we need to check this? These should never be NULL. */
+	if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
+		return -EIO;
 
-	/* No need to return error on this operation if tid invalidated and
-	closed on server already e.g. due to tcp session crashing */
-	if (tcon->tidStatus == CifsNeedReconnect) {
-		up(&tcon->tconSem);
+	/*
+	 * No need to return error on this operation if tid invalidated and
+	 * closed on server already e.g. due to tcp session crashing. Also,
+	 * the tcon is no longer on the list, so no need to take lock before
+	 * checking this.
+	 */
+	if (tcon->need_reconnect)
 		return 0;
-	}
 
-	if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) {
-		up(&tcon->tconSem);
-		return -EIO;
-	}
 	rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon,
 			    (void **)&smb_buffer);
-	if (rc) {
-		up(&tcon->tconSem);
+	if (rc)
 		return rc;
-	}
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
 	if (rc)
 		cFYI(1, ("Tree disconnect failed %d", rc));
 
-	up(&tcon->tconSem);
-
 	/* No need to return error on this operation if tid invalidated and
-	closed on server already e.g. due to tcp session crashing */
+	   closed on server already e.g. due to tcp session crashing */
 	if (rc == -EAGAIN)
 		rc = 0;
 
@@ -796,43 +780,36 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	int rc = 0;
 
 	cFYI(1, ("In SMBLogoff for session disconnect"));
-	if (ses)
-		down(&ses->sesSem);
-	else
+
+	/*
+	 * BB: do we need to check validity of ses and server? They should
+	 * always be valid since we have an active reference. If not, that
+	 * should probably be a BUG()
+	 */
+	if (!ses || !ses->server)
 		return -EIO;
 
-	atomic_dec(&ses->inUse);
-	if (atomic_read(&ses->inUse) > 0) {
-		up(&ses->sesSem);
-		return -EBUSY;
-	}
+	down(&ses->sesSem);
+	if (ses->need_reconnect)
+		goto session_already_dead; /* no need to send SMBlogoff if uid
+					      already closed due to reconnect */
 	rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
 	if (rc) {
 		up(&ses->sesSem);
 		return rc;
 	}
 
-	if (ses->server) {
-		pSMB->hdr.Mid = GetNextMid(ses->server);
+	pSMB->hdr.Mid = GetNextMid(ses->server);
 
-		if (ses->server->secMode &
+	if (ses->server->secMode &
 		   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
 			pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-	}
 
 	pSMB->hdr.Uid = ses->Suid;
 
 	pSMB->AndXCommand = 0xFF;
 	rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
-	if (ses->server) {
-		atomic_dec(&ses->server->socketUseCount);
-		if (atomic_read(&ses->server->socketUseCount) == 0) {
-			spin_lock(&GlobalMid_Lock);
-			ses->server->tcpStatus = CifsExiting;
-			spin_unlock(&GlobalMid_Lock);
-			rc = -ESHUTDOWN;
-		}
-	}
+session_already_dead:
 	up(&ses->sesSem);
 
 	/* if session dead then we do not need to do ulogoff,
@@ -3922,6 +3899,27 @@ GetInodeNumOut:
 	return rc;
 }
 
+/* computes length of UCS string converted to host codepage
+ * @src:	UCS string
+ * @maxlen:	length of the input string in UCS characters
+ * 		(not in bytes)
+ *
+ * return:	size of input string in host codepage
+ */
+static int hostlen_fromUCS(const __le16 *src, const int maxlen,
+		const struct nls_table *nls_codepage) {
+	int i;
+	int hostlen = 0;
+	char to[4];
+	int charlen;
+	for (i = 0; (i < maxlen) && src[i]; ++i) {
+		charlen = nls_codepage->uni2char(le16_to_cpu(src[i]),
+				to, NLS_MAX_CHARSET_SIZE);
+		hostlen += charlen > 0 ? charlen : 1;
+	}
+	return hostlen;
+}
+
 /* parses DFS refferal V3 structure
  * caller is responsible for freeing target_nodes
  * returns:
@@ -3932,7 +3930,8 @@ static int
 parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 		unsigned int *num_of_nodes,
 		struct dfs_info3_param **target_nodes,
-		const struct nls_table *nls_codepage)
+		const struct nls_table *nls_codepage, int remap,
+		const char *searchName)
 {
 	int i, rc = 0;
 	char *data_end;
@@ -3983,7 +3982,18 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 		struct dfs_info3_param *node = (*target_nodes)+i;
 
 		node->flags = le16_to_cpu(pSMBr->DFSFlags);
-		node->path_consumed = le16_to_cpu(pSMBr->PathConsumed);
+		if (is_unicode) {
+			__le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
+						GFP_KERNEL);
+			cifsConvertToUCS((__le16 *) tmp, searchName,
+					PATH_MAX, nls_codepage, remap);
+			node->path_consumed = hostlen_fromUCS(tmp,
+					le16_to_cpu(pSMBr->PathConsumed)/2,
+					nls_codepage);
+			kfree(tmp);
+		} else
+			node->path_consumed = le16_to_cpu(pSMBr->PathConsumed);
+
 		node->server_type = le16_to_cpu(ref->ServerType);
 		node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags);
 
@@ -4116,7 +4126,8 @@ getDFSRetry:
 
 	/* parse returned result into more usable form */
 	rc = parse_DFS_referrals(pSMBr, num_of_nodes,
-				 target_nodes, nls_codepage);
+				 target_nodes, nls_codepage, remap,
+				 searchName);
 
 GetDFSRefExit:
 	cifs_buf_release(pSMB);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e9f9248cb3f..c7d34171458 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -124,7 +124,7 @@ static int
 cifs_reconnect(struct TCP_Server_Info *server)
 {
 	int rc = 0;
-	struct list_head *tmp;
+	struct list_head *tmp, *tmp2;
 	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
 	struct mid_q_entry *mid_entry;
@@ -144,23 +144,17 @@ cifs_reconnect(struct TCP_Server_Info *server)
 
 	/* before reconnecting the tcp session, mark the smb session (uid)
 		and the tid bad so they are not used until reconnected */
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalSMBSessionList) {
-		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
-		if (ses->server) {
-			if (ses->server == server) {
-				ses->status = CifsNeedReconnect;
-				ses->ipc_tid = 0;
-			}
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &server->smb_ses_list) {
+		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+		ses->need_reconnect = true;
+		ses->ipc_tid = 0;
+		list_for_each(tmp2, &ses->tcon_list) {
+			tcon = list_entry(tmp2, struct cifsTconInfo, tcon_list);
+			tcon->need_reconnect = true;
 		}
-		/* else tcp and smb sessions need reconnection */
-	}
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		if ((tcon->ses) && (tcon->ses->server == server))
-			tcon->tidStatus = CifsNeedReconnect;
 	}
-	read_unlock(&GlobalSMBSeslock);
+	read_unlock(&cifs_tcp_ses_lock);
 	/* do not want to be sending data on a socket we are freeing */
 	down(&server->tcpSem);
 	if (server->ssocket) {
@@ -193,7 +187,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	while ((server->tcpStatus != CifsExiting) &&
 	       (server->tcpStatus != CifsGood)) {
 		try_to_freeze();
-		if (server->protocolType == IPV6) {
+		if (server->addr.sockAddr6.sin6_family == AF_INET6) {
 			rc = ipv6_connect(&server->addr.sockAddr6,
 					  &server->ssocket, server->noautotune);
 		} else {
@@ -417,9 +411,14 @@ incomplete_rcv:
 			msleep(1); /* minimum sleep to prevent looping
 				allowing socket to clear and app threads to set
 				tcpStatus CifsNeedReconnect if server hung */
-			if (pdu_length < 4)
+			if (pdu_length < 4) {
+				iov.iov_base = (4 - pdu_length) +
+							(char *)smb_buffer;
+				iov.iov_len = pdu_length;
+				smb_msg.msg_control = NULL;
+				smb_msg.msg_controllen = 0;
 				goto incomplete_rcv;
-			else
+			} else
 				continue;
 		} else if (length <= 0) {
 			if (server->tcpStatus == CifsNew) {
@@ -654,6 +653,11 @@ multi_t2_fnd:
 		}
 	} /* end while !EXITING */
 
+	/* take it off the list, if it's not already */
+	write_lock(&cifs_tcp_ses_lock);
+	list_del_init(&server->tcp_ses_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
 	spin_lock(&GlobalMid_Lock);
 	server->tcpStatus = CifsExiting;
 	spin_unlock(&GlobalMid_Lock);
@@ -686,29 +690,29 @@ multi_t2_fnd:
 	if (smallbuf) /* no sense logging a debug message if NULL */
 		cifs_small_buf_release(smallbuf);
 
-	read_lock(&GlobalSMBSeslock);
+	/*
+	 * BB: we shouldn't have to do any of this. It shouldn't be
+	 * possible to exit from the thread with active SMB sessions
+	 */
+	read_lock(&cifs_tcp_ses_lock);
 	if (list_empty(&server->pending_mid_q)) {
 		/* loop through server session structures attached to this and
 		    mark them dead */
-		list_for_each(tmp, &GlobalSMBSessionList) {
-			ses =
-			    list_entry(tmp, struct cifsSesInfo,
-				       cifsSessionList);
-			if (ses->server == server) {
-				ses->status = CifsExiting;
-				ses->server = NULL;
-			}
+		list_for_each(tmp, &server->smb_ses_list) {
+			ses = list_entry(tmp, struct cifsSesInfo,
+					 smb_ses_list);
+			ses->status = CifsExiting;
+			ses->server = NULL;
 		}
-		read_unlock(&GlobalSMBSeslock);
+		read_unlock(&cifs_tcp_ses_lock);
 	} else {
 		/* although we can not zero the server struct pointer yet,
 		since there are active requests which may depnd on them,
 		mark the corresponding SMB sessions as exiting too */
-		list_for_each(tmp, &GlobalSMBSessionList) {
+		list_for_each(tmp, &server->smb_ses_list) {
 			ses = list_entry(tmp, struct cifsSesInfo,
-					 cifsSessionList);
-			if (ses->server == server)
-				ses->status = CifsExiting;
+					 smb_ses_list);
+			ses->status = CifsExiting;
 		}
 
 		spin_lock(&GlobalMid_Lock);
@@ -723,7 +727,7 @@ multi_t2_fnd:
 			}
 		}
 		spin_unlock(&GlobalMid_Lock);
-		read_unlock(&GlobalSMBSeslock);
+		read_unlock(&cifs_tcp_ses_lock);
 		/* 1/8th of sec is more than enough time for them to exit */
 		msleep(125);
 	}
@@ -745,14 +749,13 @@ multi_t2_fnd:
 	if there are any pointing to this (e.g
 	if a crazy root user tried to kill cifsd
 	kernel thread explicitly this might happen) */
-	write_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalSMBSessionList) {
-		ses = list_entry(tmp, struct cifsSesInfo,
-				cifsSessionList);
-		if (ses->server == server)
-			ses->server = NULL;
+	/* BB: This shouldn't be necessary, see above */
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &server->smb_ses_list) {
+		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+		ses->server = NULL;
 	}
-	write_unlock(&GlobalSMBSeslock);
+	read_unlock(&cifs_tcp_ses_lock);
 
 	kfree(server->hostname);
 	task_to_wake = xchg(&server->tsk, NULL);
@@ -1352,94 +1355,158 @@ cifs_parse_mount_options(char *options, const char *devname,
 	return 0;
 }
 
-static struct cifsSesInfo *
-cifs_find_tcp_session(struct in_addr *target_ip_addr,
-		      struct in6_addr *target_ip6_addr,
-		      char *userName, struct TCP_Server_Info **psrvTcp)
+static struct TCP_Server_Info *
+cifs_find_tcp_session(struct sockaddr *addr)
 {
 	struct list_head *tmp;
-	struct cifsSesInfo *ses;
-
-	*psrvTcp = NULL;
+	struct TCP_Server_Info *server;
+	struct sockaddr_in *addr4 = (struct sockaddr_in *) addr;
+	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr;
+
+	write_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &cifs_tcp_ses_list) {
+		server = list_entry(tmp, struct TCP_Server_Info,
+				    tcp_ses_list);
+		/*
+		 * the demux thread can exit on its own while still in CifsNew
+		 * so don't accept any sockets in that state. Since the
+		 * tcpStatus never changes back to CifsNew it's safe to check
+		 * for this without a lock.
+		 */
+		if (server->tcpStatus == CifsNew)
+			continue;
 
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalSMBSessionList) {
-		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
-		if (!ses->server)
+		if (addr->sa_family == AF_INET &&
+		    (addr4->sin_addr.s_addr !=
+		     server->addr.sockAddr.sin_addr.s_addr))
+			continue;
+		else if (addr->sa_family == AF_INET6 &&
+			 memcmp(&server->addr.sockAddr6.sin6_addr,
+				&addr6->sin6_addr, sizeof(addr6->sin6_addr)))
 			continue;
 
-		if (target_ip_addr &&
-		    ses->server->addr.sockAddr.sin_addr.s_addr != target_ip_addr->s_addr)
-				continue;
-		else if (target_ip6_addr &&
-			 memcmp(&ses->server->addr.sockAddr6.sin6_addr,
-				target_ip6_addr, sizeof(*target_ip6_addr)))
-				continue;
-		/* BB lock server and tcp session; increment use count here?? */
+		++server->srv_count;
+		write_unlock(&cifs_tcp_ses_lock);
+		cFYI(1, ("Existing tcp session with server found"));
+		return server;
+	}
+	write_unlock(&cifs_tcp_ses_lock);
+	return NULL;
+}
 
-		/* found a match on the TCP session */
-		*psrvTcp = ses->server;
+static void
+cifs_put_tcp_session(struct TCP_Server_Info *server)
+{
+	struct task_struct *task;
 
-		/* BB check if reconnection needed */
-		if (strncmp(ses->userName, userName, MAX_USERNAME_SIZE) == 0) {
-			read_unlock(&GlobalSMBSeslock);
-			/* Found exact match on both TCP and
-			   SMB sessions */
-			return ses;
-		}
-		/* else tcp and smb sessions need reconnection */
+	write_lock(&cifs_tcp_ses_lock);
+	if (--server->srv_count > 0) {
+		write_unlock(&cifs_tcp_ses_lock);
+		return;
 	}
-	read_unlock(&GlobalSMBSeslock);
 
-	return NULL;
+	list_del_init(&server->tcp_ses_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
+	spin_lock(&GlobalMid_Lock);
+	server->tcpStatus = CifsExiting;
+	spin_unlock(&GlobalMid_Lock);
+
+	task = xchg(&server->tsk, NULL);
+	if (task)
+		force_sig(SIGKILL, task);
 }
 
-static struct cifsTconInfo *
-find_unc(__be32 new_target_ip_addr, char *uncName, char *userName)
+static struct cifsSesInfo *
+cifs_find_smb_ses(struct TCP_Server_Info *server, char *username)
 {
 	struct list_head *tmp;
-	struct cifsTconInfo *tcon;
-	__be32 old_ip;
-
-	read_lock(&GlobalSMBSeslock);
+	struct cifsSesInfo *ses;
 
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		cFYI(1, ("Next tcon"));
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		if (!tcon->ses || !tcon->ses->server)
+	write_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &server->smb_ses_list) {
+		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+		if (strncmp(ses->userName, username, MAX_USERNAME_SIZE))
 			continue;
 
-		old_ip = tcon->ses->server->addr.sockAddr.sin_addr.s_addr;
-		cFYI(1, ("old ip addr: %x == new ip %x ?",
-			old_ip, new_target_ip_addr));
+		++ses->ses_count;
+		write_unlock(&cifs_tcp_ses_lock);
+		return ses;
+	}
+	write_unlock(&cifs_tcp_ses_lock);
+	return NULL;
+}
 
-		if (old_ip != new_target_ip_addr)
-			continue;
+static void
+cifs_put_smb_ses(struct cifsSesInfo *ses)
+{
+	int xid;
+	struct TCP_Server_Info *server = ses->server;
 
-		/* BB lock tcon, server, tcp session and increment use count? */
-		/* found a match on the TCP session */
-		/* BB check if reconnection needed */
-		cFYI(1, ("IP match, old UNC: %s new: %s",
-			tcon->treeName, uncName));
+	write_lock(&cifs_tcp_ses_lock);
+	if (--ses->ses_count > 0) {
+		write_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
 
-		if (strncmp(tcon->treeName, uncName, MAX_TREE_SIZE))
-			continue;
+	list_del_init(&ses->smb_ses_list);
+	write_unlock(&cifs_tcp_ses_lock);
 
-		cFYI(1, ("and old usr: %s new: %s",
-			tcon->treeName, uncName));
+	if (ses->status == CifsGood) {
+		xid = GetXid();
+		CIFSSMBLogoff(xid, ses);
+		_FreeXid(xid);
+	}
+	sesInfoFree(ses);
+	cifs_put_tcp_session(server);
+}
 
-		if (strncmp(tcon->ses->userName, userName, MAX_USERNAME_SIZE))
+static struct cifsTconInfo *
+cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
+{
+	struct list_head *tmp;
+	struct cifsTconInfo *tcon;
+
+	write_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &ses->tcon_list) {
+		tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
+		if (tcon->tidStatus == CifsExiting)
+			continue;
+		if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE))
 			continue;
 
-		/* matched smb session (user name) */
-		read_unlock(&GlobalSMBSeslock);
+		++tcon->tc_count;
+		write_unlock(&cifs_tcp_ses_lock);
 		return tcon;
 	}
-
-	read_unlock(&GlobalSMBSeslock);
+	write_unlock(&cifs_tcp_ses_lock);
 	return NULL;
 }
 
+static void
+cifs_put_tcon(struct cifsTconInfo *tcon)
+{
+	int xid;
+	struct cifsSesInfo *ses = tcon->ses;
+
+	write_lock(&cifs_tcp_ses_lock);
+	if (--tcon->tc_count > 0) {
+		write_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
+
+	list_del_init(&tcon->tcon_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
+	xid = GetXid();
+	CIFSSMBTDis(xid, tcon);
+	_FreeXid(xid);
+
+	DeleteTconOplockQEntries(tcon);
+	tconInfoFree(tcon);
+	cifs_put_smb_ses(ses);
+}
+
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 	     const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
@@ -1876,14 +1943,90 @@ convert_delimiter(char *path, char delim)
 	}
 }
 
-static void
-kill_cifsd(struct TCP_Server_Info *server)
+static void setup_cifs_sb(struct smb_vol *pvolume_info,
+			  struct cifs_sb_info *cifs_sb)
 {
-	struct task_struct *task;
-
-	task = xchg(&server->tsk, NULL);
-	if (task)
-		force_sig(SIGKILL, task);
+	if (pvolume_info->rsize > CIFSMaxBufSize) {
+		cERROR(1, ("rsize %d too large, using MaxBufSize",
+			pvolume_info->rsize));
+		cifs_sb->rsize = CIFSMaxBufSize;
+	} else if ((pvolume_info->rsize) &&
+			(pvolume_info->rsize <= CIFSMaxBufSize))
+		cifs_sb->rsize = pvolume_info->rsize;
+	else /* default */
+		cifs_sb->rsize = CIFSMaxBufSize;
+
+	if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
+		cERROR(1, ("wsize %d too large, using 4096 instead",
+			  pvolume_info->wsize));
+		cifs_sb->wsize = 4096;
+	} else if (pvolume_info->wsize)
+		cifs_sb->wsize = pvolume_info->wsize;
+	else
+		cifs_sb->wsize = min_t(const int,
+					PAGEVEC_SIZE * PAGE_CACHE_SIZE,
+					127*1024);
+		/* old default of CIFSMaxBufSize was too small now
+		   that SMB Write2 can send multiple pages in kvec.
+		   RFC1001 does not describe what happens when frame
+		   bigger than 128K is sent so use that as max in
+		   conjunction with 52K kvec constraint on arch with 4K
+		   page size  */
+
+	if (cifs_sb->rsize < 2048) {
+		cifs_sb->rsize = 2048;
+		/* Windows ME may prefer this */
+		cFYI(1, ("readsize set to minimum: 2048"));
+	}
+	/* calculate prepath */
+	cifs_sb->prepath = pvolume_info->prepath;
+	if (cifs_sb->prepath) {
+		cifs_sb->prepathlen = strlen(cifs_sb->prepath);
+		/* we can not convert the / to \ in the path
+		separators in the prefixpath yet because we do not
+		know (until reset_cifs_unix_caps is called later)
+		whether POSIX PATH CAP is available. We normalize
+		the / to \ after reset_cifs_unix_caps is called */
+		pvolume_info->prepath = NULL;
+	} else
+		cifs_sb->prepathlen = 0;
+	cifs_sb->mnt_uid = pvolume_info->linux_uid;
+	cifs_sb->mnt_gid = pvolume_info->linux_gid;
+	cifs_sb->mnt_file_mode = pvolume_info->file_mode;
+	cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
+	cFYI(1, ("file mode: 0x%x  dir mode: 0x%x",
+		cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode));
+
+	if (pvolume_info->noperm)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
+	if (pvolume_info->setuids)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
+	if (pvolume_info->server_ino)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
+	if (pvolume_info->remap)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
+	if (pvolume_info->no_xattr)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
+	if (pvolume_info->sfu_emul)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
+	if (pvolume_info->nobrl)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
+	if (pvolume_info->cifs_acl)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
+	if (pvolume_info->override_uid)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
+	if (pvolume_info->override_gid)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
+	if (pvolume_info->dynperm)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
+	if (pvolume_info->direct_io) {
+		cFYI(1, ("mounting share using direct i/o"));
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
+	}
+
+	if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
+		cERROR(1, ("mount option dynperm ignored if cifsacl "
+			   "mount option supported"));
 }
 
 int
@@ -1892,13 +2035,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 {
 	int rc = 0;
 	int xid;
-	int address_type = AF_INET;
 	struct socket *csocket = NULL;
-	struct sockaddr_in sin_server;
-	struct sockaddr_in6 sin_server6;
+	struct sockaddr addr;
+	struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
+	struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
 	struct smb_vol volume_info;
 	struct cifsSesInfo *pSesInfo = NULL;
-	struct cifsSesInfo *existingCifsSes = NULL;
 	struct cifsTconInfo *tcon = NULL;
 	struct TCP_Server_Info *srvTcp = NULL;
 
@@ -1906,6 +2048,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 /* cFYI(1, ("Entering cifs_mount. Xid: %d with: %s", xid, mount_data)); */
 
+	memset(&addr, 0, sizeof(struct sockaddr));
 	memset(&volume_info, 0, sizeof(struct smb_vol));
 	if (cifs_parse_mount_options(mount_data, devname, &volume_info)) {
 		rc = -EINVAL;
@@ -1928,16 +2071,16 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 	if (volume_info.UNCip && volume_info.UNC) {
 		rc = cifs_inet_pton(AF_INET, volume_info.UNCip,
-				    &sin_server.sin_addr.s_addr);
+				    &sin_server->sin_addr.s_addr);
 
 		if (rc <= 0) {
 			/* not ipv4 address, try ipv6 */
 			rc = cifs_inet_pton(AF_INET6, volume_info.UNCip,
-					    &sin_server6.sin6_addr.in6_u);
+					    &sin_server6->sin6_addr.in6_u);
 			if (rc > 0)
-				address_type = AF_INET6;
+				addr.sa_family = AF_INET6;
 		} else {
-			address_type = AF_INET;
+			addr.sa_family = AF_INET;
 		}
 
 		if (rc <= 0) {
@@ -1977,41 +2120,25 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		}
 	}
 
-	if (address_type == AF_INET)
-		existingCifsSes = cifs_find_tcp_session(&sin_server.sin_addr,
-			NULL /* no ipv6 addr */,
-			volume_info.username, &srvTcp);
-	else if (address_type == AF_INET6) {
-		cFYI(1, ("looking for ipv6 address"));
-		existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */,
-			&sin_server6.sin6_addr,
-			volume_info.username, &srvTcp);
-	} else {
-		rc = -EINVAL;
-		goto out;
-	}
-
-	if (srvTcp) {
-		cFYI(1, ("Existing tcp session with server found"));
-	} else {	/* create socket */
-		if (volume_info.port)
-			sin_server.sin_port = htons(volume_info.port);
-		else
-			sin_server.sin_port = 0;
-		if (address_type == AF_INET6) {
+	srvTcp = cifs_find_tcp_session(&addr);
+	if (!srvTcp) { /* create socket */
+		if (addr.sa_family == AF_INET6) {
 			cFYI(1, ("attempting ipv6 connect"));
 			/* BB should we allow ipv6 on port 139? */
 			/* other OS never observed in Wild doing 139 with v6 */
-			rc = ipv6_connect(&sin_server6, &csocket,
+			sin_server6->sin6_port = htons(volume_info.port);
+			rc = ipv6_connect(sin_server6, &csocket,
 					volume_info.noblocksnd);
-		} else
-			rc = ipv4_connect(&sin_server, &csocket,
+		} else {
+			sin_server->sin_port = htons(volume_info.port);
+			rc = ipv4_connect(sin_server, &csocket,
 				  volume_info.source_rfc1001_name,
 				  volume_info.target_rfc1001_name,
 				  volume_info.noblocksnd,
 				  volume_info.noautotune);
+		}
 		if (rc < 0) {
-			cERROR(1, ("Error connecting to IPv4 socket. "
+			cERROR(1, ("Error connecting to socket. "
 				   "Aborting operation"));
 			if (csocket != NULL)
 				sock_release(csocket);
@@ -2026,12 +2153,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		} else {
 			srvTcp->noblocksnd = volume_info.noblocksnd;
 			srvTcp->noautotune = volume_info.noautotune;
-			memcpy(&srvTcp->addr.sockAddr, &sin_server,
-				sizeof(struct sockaddr_in));
+			if (addr.sa_family == AF_INET6)
+				memcpy(&srvTcp->addr.sockAddr6, sin_server6,
+					sizeof(struct sockaddr_in6));
+			else
+				memcpy(&srvTcp->addr.sockAddr, sin_server,
+					sizeof(struct sockaddr_in));
 			atomic_set(&srvTcp->inFlight, 0);
 			/* BB Add code for ipv6 case too */
 			srvTcp->ssocket = csocket;
-			srvTcp->protocolType = IPV4;
 			srvTcp->hostname = extract_hostname(volume_info.UNC);
 			if (IS_ERR(srvTcp->hostname)) {
 				rc = PTR_ERR(srvTcp->hostname);
@@ -2061,15 +2191,28 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			memcpy(srvTcp->server_RFC1001_name,
 				volume_info.target_rfc1001_name, 16);
 			srvTcp->sequence_number = 0;
+			INIT_LIST_HEAD(&srvTcp->tcp_ses_list);
+			INIT_LIST_HEAD(&srvTcp->smb_ses_list);
+			++srvTcp->srv_count;
+			write_lock(&cifs_tcp_ses_lock);
+			list_add(&srvTcp->tcp_ses_list,
+				 &cifs_tcp_ses_list);
+			write_unlock(&cifs_tcp_ses_lock);
 		}
 	}
 
-	if (existingCifsSes) {
-		pSesInfo = existingCifsSes;
+	pSesInfo = cifs_find_smb_ses(srvTcp, volume_info.username);
+	if (pSesInfo) {
 		cFYI(1, ("Existing smb sess found (status=%d)",
 			pSesInfo->status));
+		/*
+		 * The existing SMB session already has a reference to srvTcp,
+		 * so we can put back the extra one we got before
+		 */
+		cifs_put_tcp_session(srvTcp);
+
 		down(&pSesInfo->sesSem);
-		if (pSesInfo->status == CifsNeedReconnect) {
+		if (pSesInfo->need_reconnect) {
 			cFYI(1, ("Session needs reconnect"));
 			rc = cifs_setup_session(xid, pSesInfo,
 						cifs_sb->local_nls);
@@ -2078,187 +2221,101 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	} else if (!rc) {
 		cFYI(1, ("Existing smb sess not found"));
 		pSesInfo = sesInfoAlloc();
-		if (pSesInfo == NULL)
+		if (pSesInfo == NULL) {
 			rc = -ENOMEM;
-		else {
-			pSesInfo->server = srvTcp;
-			sprintf(pSesInfo->serverName, "%u.%u.%u.%u",
-				NIPQUAD(sin_server.sin_addr.s_addr));
-		}
+			goto mount_fail_check;
+		}
+
+		/* new SMB session uses our srvTcp ref */
+		pSesInfo->server = srvTcp;
+		sprintf(pSesInfo->serverName, "%u.%u.%u.%u",
+			NIPQUAD(sin_server->sin_addr.s_addr));
+
+		write_lock(&cifs_tcp_ses_lock);
+		list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
+		write_unlock(&cifs_tcp_ses_lock);
+
+		/* volume_info.password freed at unmount */
+		if (volume_info.password) {
+			pSesInfo->password = volume_info.password;
+			/* set to NULL to prevent freeing on exit */
+			volume_info.password = NULL;
+		}
+		if (volume_info.username)
+			strncpy(pSesInfo->userName, volume_info.username,
+				MAX_USERNAME_SIZE);
+		if (volume_info.domainname) {
+			int len = strlen(volume_info.domainname);
+			pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
+			if (pSesInfo->domainName)
+				strcpy(pSesInfo->domainName,
+					volume_info.domainname);
+		}
+		pSesInfo->linux_uid = volume_info.linux_uid;
+		pSesInfo->overrideSecFlg = volume_info.secFlg;
+		down(&pSesInfo->sesSem);
 
-		if (!rc) {
-			/* volume_info.password freed at unmount */
-			if (volume_info.password) {
-				pSesInfo->password = volume_info.password;
-				/* set to NULL to prevent freeing on exit */
-				volume_info.password = NULL;
-			}
-			if (volume_info.username)
-				strncpy(pSesInfo->userName,
-					volume_info.username,
-					MAX_USERNAME_SIZE);
-			if (volume_info.domainname) {
-				int len = strlen(volume_info.domainname);
-				pSesInfo->domainName =
-					kmalloc(len + 1, GFP_KERNEL);
-				if (pSesInfo->domainName)
-					strcpy(pSesInfo->domainName,
-						volume_info.domainname);
-			}
-			pSesInfo->linux_uid = volume_info.linux_uid;
-			pSesInfo->overrideSecFlg = volume_info.secFlg;
-			down(&pSesInfo->sesSem);
-			/* BB FIXME need to pass vol->secFlgs BB */
-			rc = cifs_setup_session(xid, pSesInfo,
-						cifs_sb->local_nls);
-			up(&pSesInfo->sesSem);
-			if (!rc)
-				atomic_inc(&srvTcp->socketUseCount);
-		}
+		/* BB FIXME need to pass vol->secFlgs BB */
+		rc = cifs_setup_session(xid, pSesInfo,
+					cifs_sb->local_nls);
+		up(&pSesInfo->sesSem);
 	}
 
 	/* search for existing tcon to this server share */
 	if (!rc) {
-		if (volume_info.rsize > CIFSMaxBufSize) {
-			cERROR(1, ("rsize %d too large, using MaxBufSize",
-				volume_info.rsize));
-			cifs_sb->rsize = CIFSMaxBufSize;
-		} else if ((volume_info.rsize) &&
-				(volume_info.rsize <= CIFSMaxBufSize))
-			cifs_sb->rsize = volume_info.rsize;
-		else /* default */
-			cifs_sb->rsize = CIFSMaxBufSize;
-
-		if (volume_info.wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
-			cERROR(1, ("wsize %d too large, using 4096 instead",
-				  volume_info.wsize));
-			cifs_sb->wsize = 4096;
-		} else if (volume_info.wsize)
-			cifs_sb->wsize = volume_info.wsize;
-		else
-			cifs_sb->wsize =
-				min_t(const int, PAGEVEC_SIZE * PAGE_CACHE_SIZE,
-					127*1024);
-			/* old default of CIFSMaxBufSize was too small now
-			   that SMB Write2 can send multiple pages in kvec.
-			   RFC1001 does not describe what happens when frame
-			   bigger than 128K is sent so use that as max in
-			   conjunction with 52K kvec constraint on arch with 4K
-			   page size  */
-
-		if (cifs_sb->rsize < 2048) {
-			cifs_sb->rsize = 2048;
-			/* Windows ME may prefer this */
-			cFYI(1, ("readsize set to minimum: 2048"));
-		}
-		/* calculate prepath */
-		cifs_sb->prepath = volume_info.prepath;
-		if (cifs_sb->prepath) {
-			cifs_sb->prepathlen = strlen(cifs_sb->prepath);
-			/* we can not convert the / to \ in the path
-			separators in the prefixpath yet because we do not
-			know (until reset_cifs_unix_caps is called later)
-			whether POSIX PATH CAP is available. We normalize
-			the / to \ after reset_cifs_unix_caps is called */
-			volume_info.prepath = NULL;
-		} else
-			cifs_sb->prepathlen = 0;
-		cifs_sb->mnt_uid = volume_info.linux_uid;
-		cifs_sb->mnt_gid = volume_info.linux_gid;
-		cifs_sb->mnt_file_mode = volume_info.file_mode;
-		cifs_sb->mnt_dir_mode = volume_info.dir_mode;
-		cFYI(1, ("file mode: 0x%x  dir mode: 0x%x",
-			cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode));
-
-		if (volume_info.noperm)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
-		if (volume_info.setuids)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
-		if (volume_info.server_ino)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
-		if (volume_info.remap)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
-		if (volume_info.no_xattr)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
-		if (volume_info.sfu_emul)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
-		if (volume_info.nobrl)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
-		if (volume_info.cifs_acl)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
-		if (volume_info.override_uid)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
-		if (volume_info.override_gid)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
-		if (volume_info.dynperm)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
-		if (volume_info.direct_io) {
-			cFYI(1, ("mounting share using direct i/o"));
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
-		}
+		setup_cifs_sb(&volume_info, cifs_sb);
 
-		if ((volume_info.cifs_acl) && (volume_info.dynperm))
-			cERROR(1, ("mount option dynperm ignored if cifsacl "
-				   "mount option supported"));
-
-		tcon =
-		    find_unc(sin_server.sin_addr.s_addr, volume_info.UNC,
-			     volume_info.username);
+		tcon = cifs_find_tcon(pSesInfo, volume_info.UNC);
 		if (tcon) {
 			cFYI(1, ("Found match on UNC path"));
-			/* we can have only one retry value for a connection
-			   to a share so for resources mounted more than once
-			   to the same server share the last value passed in
-			   for the retry flag is used */
-			tcon->retry = volume_info.retry;
-			tcon->nocase = volume_info.nocase;
-			tcon->local_lease = volume_info.local_lease;
+			/* existing tcon already has a reference */
+			cifs_put_smb_ses(pSesInfo);
 			if (tcon->seal != volume_info.seal)
 				cERROR(1, ("transport encryption setting "
 					   "conflicts with existing tid"));
 		} else {
 			tcon = tconInfoAlloc();
-			if (tcon == NULL)
+			if (tcon == NULL) {
 				rc = -ENOMEM;
-			else {
-				/* check for null share name ie connecting to
-				 * dfs root */
-
-				/* BB check if this works for exactly length
-				 * three strings */
-				if ((strchr(volume_info.UNC + 3, '\\') == NULL)
-				    && (strchr(volume_info.UNC + 3, '/') ==
-					NULL)) {
-/*					rc = connect_to_dfs_path(xid, pSesInfo,
-						"", cifs_sb->local_nls,
-						cifs_sb->mnt_cifs_flags &
-						  CIFS_MOUNT_MAP_SPECIAL_CHR);*/
-					cFYI(1, ("DFS root not supported"));
-					rc = -ENODEV;
-					goto out;
-				} else {
-					/* BB Do we need to wrap sesSem around
-					 * this TCon call and Unix SetFS as
-					 * we do on SessSetup and reconnect? */
-					rc = CIFSTCon(xid, pSesInfo,
-						volume_info.UNC,
-						tcon, cifs_sb->local_nls);
-					cFYI(1, ("CIFS Tcon rc = %d", rc));
-					if (volume_info.nodfs) {
-						tcon->Flags &=
-							~SMB_SHARE_IS_IN_DFS;
-						cFYI(1, ("DFS disabled (%d)",
-							tcon->Flags));
-					}
-				}
-				if (!rc) {
-					atomic_inc(&pSesInfo->inUse);
-					tcon->retry = volume_info.retry;
-					tcon->nocase = volume_info.nocase;
-					tcon->seal = volume_info.seal;
+				goto mount_fail_check;
+			}
+			tcon->ses = pSesInfo;
+
+			/* check for null share name ie connect to dfs root */
+			if ((strchr(volume_info.UNC + 3, '\\') == NULL)
+			    && (strchr(volume_info.UNC + 3, '/') == NULL)) {
+				/* rc = connect_to_dfs_path(...) */
+				cFYI(1, ("DFS root not supported"));
+				rc = -ENODEV;
+				goto mount_fail_check;
+			} else {
+				/* BB Do we need to wrap sesSem around
+				 * this TCon call and Unix SetFS as
+				 * we do on SessSetup and reconnect? */
+				rc = CIFSTCon(xid, pSesInfo, volume_info.UNC,
+					      tcon, cifs_sb->local_nls);
+				cFYI(1, ("CIFS Tcon rc = %d", rc));
+				if (volume_info.nodfs) {
+					tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
+					cFYI(1, ("DFS disabled (%d)",
+						tcon->Flags));
 				}
 			}
-		}
+			if (rc)
+				goto mount_fail_check;
+			tcon->seal = volume_info.seal;
+			write_lock(&cifs_tcp_ses_lock);
+			list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
+			write_unlock(&cifs_tcp_ses_lock);
+		}
+
+		/* we can have only one retry value for a connection
+		   to a share so for resources mounted more than once
+		   to the same server share the last value passed in
+		   for the retry flag is used */
+		tcon->retry = volume_info.retry;
+		tcon->nocase = volume_info.nocase;
+		tcon->local_lease = volume_info.local_lease;
 	}
 	if (pSesInfo) {
 		if (pSesInfo->capabilities & CAP_LARGE_FILES) {
@@ -2270,80 +2327,49 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
 	sb->s_time_gran = 100;
 
-/* on error free sesinfo and tcon struct if needed */
+mount_fail_check:
+	/* on error free sesinfo and tcon struct if needed */
 	if (rc) {
-		/* if session setup failed, use count is zero but
-		we still need to free cifsd thread */
-		if (atomic_read(&srvTcp->socketUseCount) == 0) {
-			spin_lock(&GlobalMid_Lock);
-			srvTcp->tcpStatus = CifsExiting;
-			spin_unlock(&GlobalMid_Lock);
-			kill_cifsd(srvTcp);
-		}
-		 /* If find_unc succeeded then rc == 0 so we can not end */
-		if (tcon)  /* up accidently freeing someone elses tcon struct */
-			tconInfoFree(tcon);
-		if (existingCifsSes == NULL) {
-			if (pSesInfo) {
-				if ((pSesInfo->server) &&
-				    (pSesInfo->status == CifsGood)) {
-					int temp_rc;
-					temp_rc = CIFSSMBLogoff(xid, pSesInfo);
-					/* if the socketUseCount is now zero */
-					if ((temp_rc == -ESHUTDOWN) &&
-					    (pSesInfo->server))
-						kill_cifsd(pSesInfo->server);
-				} else {
-					cFYI(1, ("No session or bad tcon"));
-					if (pSesInfo->server) {
-						spin_lock(&GlobalMid_Lock);
-						srvTcp->tcpStatus = CifsExiting;
-						spin_unlock(&GlobalMid_Lock);
-						kill_cifsd(pSesInfo->server);
-					}
-				}
-				sesInfoFree(pSesInfo);
-				/* pSesInfo = NULL; */
-			}
-		}
-	} else {
-		atomic_inc(&tcon->useCount);
-		cifs_sb->tcon = tcon;
-		tcon->ses = pSesInfo;
-
-		/* do not care if following two calls succeed - informational */
-		if (!tcon->ipc) {
-			CIFSSMBQFSDeviceInfo(xid, tcon);
-			CIFSSMBQFSAttributeInfo(xid, tcon);
-		}
-
-		/* tell server which Unix caps we support */
-		if (tcon->ses->capabilities & CAP_UNIX)
-			/* reset of caps checks mount to see if unix extensions
-			   disabled for just this mount */
-			reset_cifs_unix_caps(xid, tcon, sb, &volume_info);
+		/* If find_unc succeeded then rc == 0 so we can not end */
+		/* up accidently freeing someone elses tcon struct */
+		if (tcon)
+			cifs_put_tcon(tcon);
+		else if (pSesInfo)
+			cifs_put_smb_ses(pSesInfo);
 		else
-			tcon->unix_ext = 0; /* server does not support them */
+			cifs_put_tcp_session(srvTcp);
+		goto out;
+	}
+	cifs_sb->tcon = tcon;
 
-		/* convert forward to back slashes in prepath here if needed */
-		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
-			convert_delimiter(cifs_sb->prepath,
-					  CIFS_DIR_SEP(cifs_sb));
+	/* do not care if following two calls succeed - informational */
+	if (!tcon->ipc) {
+		CIFSSMBQFSDeviceInfo(xid, tcon);
+		CIFSSMBQFSAttributeInfo(xid, tcon);
+	}
 
-		if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
-			cifs_sb->rsize = 1024 * 127;
-			cFYI(DBG2,
-				("no very large read support, rsize now 127K"));
-		}
-		if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
-			cifs_sb->wsize = min(cifs_sb->wsize,
-					     (tcon->ses->server->maxBuf -
-					      MAX_CIFS_HDR_SIZE));
-		if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
-			cifs_sb->rsize = min(cifs_sb->rsize,
-					     (tcon->ses->server->maxBuf -
-					      MAX_CIFS_HDR_SIZE));
+	/* tell server which Unix caps we support */
+	if (tcon->ses->capabilities & CAP_UNIX)
+		/* reset of caps checks mount to see if unix extensions
+		   disabled for just this mount */
+		reset_cifs_unix_caps(xid, tcon, sb, &volume_info);
+	else
+		tcon->unix_ext = 0; /* server does not support them */
+
+	/* convert forward to back slashes in prepath here if needed */
+	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
+		convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb));
+
+	if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
+		cifs_sb->rsize = 1024 * 127;
+		cFYI(DBG2, ("no very large read support, rsize now 127K"));
 	}
+	if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
+		cifs_sb->wsize = min(cifs_sb->wsize,
+			       (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
+	if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
+		cifs_sb->rsize = min(cifs_sb->rsize,
+			       (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
 
 	/* volume_info.password is freed above when existing session found
 	(in which case it is not needed anymore) but when new sesion is created
@@ -3513,6 +3539,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	/* above now done in SendReceive */
 	if ((rc == 0) && (tcon != NULL)) {
 		tcon->tidStatus = CifsGood;
+		tcon->need_reconnect = false;
 		tcon->tid = smb_buffer_response->Tid;
 		bcc_ptr = pByteArea(smb_buffer_response);
 		length = strnlen(bcc_ptr, BCC(smb_buffer_response) - 2);
@@ -3584,48 +3611,17 @@ int
 cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 {
 	int rc = 0;
-	int xid;
-	struct cifsSesInfo *ses = NULL;
 	char *tmp;
 
-	xid = GetXid();
-
-	if (cifs_sb->tcon) {
-		ses = cifs_sb->tcon->ses; /* save ptr to ses before delete tcon!*/
-		rc = CIFSSMBTDis(xid, cifs_sb->tcon);
-		if (rc == -EBUSY) {
-			FreeXid(xid);
-			return 0;
-		}
-		DeleteTconOplockQEntries(cifs_sb->tcon);
-		tconInfoFree(cifs_sb->tcon);
-		if ((ses) && (ses->server)) {
-			/* save off task so we do not refer to ses later */
-			cFYI(1, ("About to do SMBLogoff "));
-			rc = CIFSSMBLogoff(xid, ses);
-			if (rc == -EBUSY) {
-				FreeXid(xid);
-				return 0;
-			} else if (rc == -ESHUTDOWN) {
-				cFYI(1, ("Waking up socket by sending signal"));
-				if (ses->server)
-					kill_cifsd(ses->server);
-				rc = 0;
-			} /* else - we have an smb session
-				left on this socket do not kill cifsd */
-		} else
-			cFYI(1, ("No session or bad tcon"));
-	}
+	if (cifs_sb->tcon)
+		cifs_put_tcon(cifs_sb->tcon);
 
 	cifs_sb->tcon = NULL;
 	tmp = cifs_sb->prepath;
 	cifs_sb->prepathlen = 0;
 	cifs_sb->prepath = NULL;
 	kfree(tmp);
-	if (ses)
-		sesInfoFree(ses);
 
-	FreeXid(xid);
 	return rc;
 }
 
@@ -3741,6 +3737,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 		cFYI(1, ("CIFS Session Established successfully"));
 			spin_lock(&GlobalMid_Lock);
 			pSesInfo->status = CifsGood;
+			pSesInfo->need_reconnect = false;
 			spin_unlock(&GlobalMid_Lock);
 	}
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ead1a3bb025..f0a81e631ae 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -488,12 +488,13 @@ int cifs_close(struct inode *inode, struct file *file)
 	pTcon = cifs_sb->tcon;
 	if (pSMBFile) {
 		struct cifsLockInfo *li, *tmp;
-
+		write_lock(&GlobalSMBSeslock);
 		pSMBFile->closePend = true;
 		if (pTcon) {
 			/* no sense reconnecting to close a file that is
 			   already closed */
-			if (pTcon->tidStatus != CifsNeedReconnect) {
+			if (!pTcon->need_reconnect) {
+				write_unlock(&GlobalSMBSeslock);
 				timeout = 2;
 				while ((atomic_read(&pSMBFile->wrtPending) != 0)
 					&& (timeout <= 2048)) {
@@ -510,12 +511,15 @@ int cifs_close(struct inode *inode, struct file *file)
 					timeout *= 4;
 				}
 				if (atomic_read(&pSMBFile->wrtPending))
-					cERROR(1,
-						("close with pending writes"));
-				rc = CIFSSMBClose(xid, pTcon,
+					cERROR(1, ("close with pending write"));
+				if (!pTcon->need_reconnect &&
+				    !pSMBFile->invalidHandle)
+					rc = CIFSSMBClose(xid, pTcon,
 						  pSMBFile->netfid);
-			}
-		}
+			} else
+				write_unlock(&GlobalSMBSeslock);
+		} else
+			write_unlock(&GlobalSMBSeslock);
 
 		/* Delete any outstanding lock records.
 		   We'll lose them when the file is closed anyway. */
@@ -587,15 +591,18 @@ int cifs_closedir(struct inode *inode, struct file *file)
 		pTcon = cifs_sb->tcon;
 
 		cFYI(1, ("Freeing private data in close dir"));
+		write_lock(&GlobalSMBSeslock);
 		if (!pCFileStruct->srch_inf.endOfSearch &&
 		    !pCFileStruct->invalidHandle) {
 			pCFileStruct->invalidHandle = true;
+			write_unlock(&GlobalSMBSeslock);
 			rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
 			cFYI(1, ("Closing uncompleted readdir with rc %d",
 				 rc));
 			/* not much we can do if it fails anyway, ignore rc */
 			rc = 0;
-		}
+		} else
+			write_unlock(&GlobalSMBSeslock);
 		ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
 		if (ptmp) {
 			cFYI(1, ("closedir free smb buf in srch struct"));
@@ -1404,7 +1411,10 @@ retry:
 			if ((wbc->nr_to_write -= n_iov) <= 0)
 				done = 1;
 			index = next;
-		}
+		} else
+			/* Need to re-find the pages we skipped */
+			index = pvec.pages[0]->index + 1;
+
 		pagevec_release(&pvec);
 	}
 	if (!scanned && !done) {
@@ -1465,7 +1475,11 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
 		 page, pos, copied));
 
-	if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
+	if (PageChecked(page)) {
+		if (copied == len)
+			SetPageUptodate(page);
+		ClearPageChecked(page);
+	} else if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
 		SetPageUptodate(page);
 
 	if (!PageUptodate(page)) {
@@ -2052,39 +2066,70 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 {
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
+	loff_t page_start = pos & PAGE_MASK;
+	loff_t i_size;
+	struct page *page;
+	int rc = 0;
 
 	cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
 
-	*pagep = __grab_cache_page(mapping, index);
-	if (!*pagep)
-		return -ENOMEM;
-
-	if (PageUptodate(*pagep))
-		return 0;
+	page = __grab_cache_page(mapping, index);
+	if (!page) {
+		rc = -ENOMEM;
+		goto out;
+	}
 
-	/* If we are writing a full page it will be up to date,
-	   no need to read from the server */
-	if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE)
-		return 0;
+	if (PageUptodate(page))
+		goto out;
 
-	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
-		int rc;
+	/*
+	 * If we write a full page it will be up to date, no need to read from
+	 * the server. If the write is short, we'll end up doing a sync write
+	 * instead.
+	 */
+	if (len == PAGE_CACHE_SIZE)
+		goto out;
 
-		/* might as well read a page, it is fast enough */
-		rc = cifs_readpage_worker(file, *pagep, &offset);
+	/*
+	 * optimize away the read when we have an oplock, and we're not
+	 * expecting to use any of the data we'd be reading in. That
+	 * is, when the page lies beyond the EOF, or straddles the EOF
+	 * and the write will cover all of the existing data.
+	 */
+	if (CIFS_I(mapping->host)->clientCanCacheRead) {
+		i_size = i_size_read(mapping->host);
+		if (page_start >= i_size ||
+		    (offset == 0 && (pos + len) >= i_size)) {
+			zero_user_segments(page, 0, offset,
+					   offset + len,
+					   PAGE_CACHE_SIZE);
+			/*
+			 * PageChecked means that the parts of the page
+			 * to which we're not writing are considered up
+			 * to date. Once the data is copied to the
+			 * page, it can be set uptodate.
+			 */
+			SetPageChecked(page);
+			goto out;
+		}
+	}
 
-		/* we do not need to pass errors back
-		   e.g. if we do not have read access to the file
-		   because cifs_write_end will attempt synchronous writes
-		   -- shaggy */
+	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+		/*
+		 * might as well read a page, it is fast enough. If we get
+		 * an error, we don't need to return it. cifs_write_end will
+		 * do a sync write instead since PG_uptodate isn't set.
+		 */
+		cifs_readpage_worker(file, page, &page_start);
 	} else {
 		/* we could try using another file handle if there is one -
 		   but how would we lock it to prevent close of that handle
 		   racing with this read? In any case
 		   this will be written out by write_end so is fine */
 	}
-
-	return 0;
+out:
+	*pagep = page;
+	return rc;
 }
 
 const struct address_space_operations cifs_addr_ops = {
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 88786ba02d2..9ee3f689c2b 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -75,12 +75,12 @@ sesInfoAlloc(void)
 
 	ret_buf = kzalloc(sizeof(struct cifsSesInfo), GFP_KERNEL);
 	if (ret_buf) {
-		write_lock(&GlobalSMBSeslock);
 		atomic_inc(&sesInfoAllocCount);
 		ret_buf->status = CifsNew;
-		list_add(&ret_buf->cifsSessionList, &GlobalSMBSessionList);
+		++ret_buf->ses_count;
+		INIT_LIST_HEAD(&ret_buf->smb_ses_list);
+		INIT_LIST_HEAD(&ret_buf->tcon_list);
 		init_MUTEX(&ret_buf->sesSem);
-		write_unlock(&GlobalSMBSeslock);
 	}
 	return ret_buf;
 }
@@ -93,10 +93,7 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
 		return;
 	}
 
-	write_lock(&GlobalSMBSeslock);
 	atomic_dec(&sesInfoAllocCount);
-	list_del(&buf_to_free->cifsSessionList);
-	write_unlock(&GlobalSMBSeslock);
 	kfree(buf_to_free->serverOS);
 	kfree(buf_to_free->serverDomain);
 	kfree(buf_to_free->serverNOS);
@@ -111,17 +108,14 @@ tconInfoAlloc(void)
 	struct cifsTconInfo *ret_buf;
 	ret_buf = kzalloc(sizeof(struct cifsTconInfo), GFP_KERNEL);
 	if (ret_buf) {
-		write_lock(&GlobalSMBSeslock);
 		atomic_inc(&tconInfoAllocCount);
-		list_add(&ret_buf->cifsConnectionList,
-			 &GlobalTreeConnectionList);
 		ret_buf->tidStatus = CifsNew;
+		++ret_buf->tc_count;
 		INIT_LIST_HEAD(&ret_buf->openFileList);
-		init_MUTEX(&ret_buf->tconSem);
+		INIT_LIST_HEAD(&ret_buf->tcon_list);
 #ifdef CONFIG_CIFS_STATS
 		spin_lock_init(&ret_buf->stat_lock);
 #endif
-		write_unlock(&GlobalSMBSeslock);
 	}
 	return ret_buf;
 }
@@ -133,10 +127,7 @@ tconInfoFree(struct cifsTconInfo *buf_to_free)
 		cFYI(1, ("Null buffer passed to tconInfoFree"));
 		return;
 	}
-	write_lock(&GlobalSMBSeslock);
 	atomic_dec(&tconInfoAllocCount);
-	list_del(&buf_to_free->cifsConnectionList);
-	write_unlock(&GlobalSMBSeslock);
 	kfree(buf_to_free->nativeFileSystem);
 	kfree(buf_to_free);
 }
@@ -350,9 +341,9 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 				if (current->fsuid != treeCon->ses->linux_uid) {
 					cFYI(1, ("Multiuser mode and UID "
 						 "did not match tcon uid"));
-					read_lock(&GlobalSMBSeslock);
-					list_for_each(temp_item, &GlobalSMBSessionList) {
-						ses = list_entry(temp_item, struct cifsSesInfo, cifsSessionList);
+					read_lock(&cifs_tcp_ses_lock);
+					list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
+						ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
 						if (ses->linux_uid == current->fsuid) {
 							if (ses->server == treeCon->ses->server) {
 								cFYI(1, ("found matching uid substitute right smb_uid"));
@@ -364,7 +355,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 							}
 						}
 					}
-					read_unlock(&GlobalSMBSeslock);
+					read_unlock(&cifs_tcp_ses_lock);
 				}
 			}
 		}
@@ -497,9 +488,10 @@ bool
 is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 {
 	struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
-	struct list_head *tmp;
-	struct list_head *tmp1;
+	struct list_head *tmp, *tmp1, *tmp2;
+	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
+	struct cifsInodeInfo *pCifsInode;
 	struct cifsFileInfo *netfile;
 
 	cFYI(1, ("Checking for oplock break or dnotify response"));
@@ -554,42 +546,45 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 		return false;
 
 	/* look up tcon based on tid & uid */
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		if ((tcon->tid == buf->Tid) && (srv == tcon->ses->server)) {
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &srv->smb_ses_list) {
+		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+		list_for_each(tmp1, &ses->tcon_list) {
+			tcon = list_entry(tmp1, struct cifsTconInfo, tcon_list);
+			if (tcon->tid != buf->Tid)
+				continue;
+
 			cifs_stats_inc(&tcon->num_oplock_brks);
-			list_for_each(tmp1, &tcon->openFileList) {
-				netfile = list_entry(tmp1, struct cifsFileInfo,
+			write_lock(&GlobalSMBSeslock);
+			list_for_each(tmp2, &tcon->openFileList) {
+				netfile = list_entry(tmp2, struct cifsFileInfo,
 						     tlist);
-				if (pSMB->Fid == netfile->netfid) {
-					struct cifsInodeInfo *pCifsInode;
-					read_unlock(&GlobalSMBSeslock);
-					cFYI(1,
-					    ("file id match, oplock break"));
-					pCifsInode =
-						CIFS_I(netfile->pInode);
-					pCifsInode->clientCanCacheAll = false;
-					if (pSMB->OplockLevel == 0)
-						pCifsInode->clientCanCacheRead
-							= false;
-					pCifsInode->oplockPending = true;
-					AllocOplockQEntry(netfile->pInode,
-							  netfile->netfid,
-							  tcon);
-					cFYI(1,
-					    ("about to wake up oplock thread"));
-					if (oplockThread)
-					    wake_up_process(oplockThread);
-					return true;
-				}
+				if (pSMB->Fid != netfile->netfid)
+					continue;
+
+				write_unlock(&GlobalSMBSeslock);
+				read_unlock(&cifs_tcp_ses_lock);
+				cFYI(1, ("file id match, oplock break"));
+				pCifsInode = CIFS_I(netfile->pInode);
+				pCifsInode->clientCanCacheAll = false;
+				if (pSMB->OplockLevel == 0)
+					pCifsInode->clientCanCacheRead = false;
+				pCifsInode->oplockPending = true;
+				AllocOplockQEntry(netfile->pInode,
+						  netfile->netfid, tcon);
+				cFYI(1, ("about to wake up oplock thread"));
+				if (oplockThread)
+					wake_up_process(oplockThread);
+
+				return true;
 			}
-			read_unlock(&GlobalSMBSeslock);
+			write_unlock(&GlobalSMBSeslock);
+			read_unlock(&cifs_tcp_ses_lock);
 			cFYI(1, ("No matching file for oplock break"));
 			return true;
 		}
 	}
-	read_unlock(&GlobalSMBSeslock);
+	read_unlock(&cifs_tcp_ses_lock);
 	cFYI(1, ("Can not process oplock break for non-existent connection"));
 	return true;
 }
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 58d57299f2a..9f51f9bf029 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -741,11 +741,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	   (index_to_find < first_entry_in_buffer)) {
 		/* close and restart search */
 		cFYI(1, ("search backing up - close and restart search"));
+		write_lock(&GlobalSMBSeslock);
 		if (!cifsFile->srch_inf.endOfSearch &&
 		    !cifsFile->invalidHandle) {
 			cifsFile->invalidHandle = true;
+			write_unlock(&GlobalSMBSeslock);
 			CIFSFindClose(xid, pTcon, cifsFile->netfid);
-		}
+		} else
+			write_unlock(&GlobalSMBSeslock);
 		if (cifsFile->srch_inf.ntwrk_buf_start) {
 			cFYI(1, ("freeing SMB ff cache buf on search rewind"));
 			if (cifsFile->srch_inf.smallBuf)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d910501de6d..8d86b7960f0 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -812,7 +812,7 @@ int dlm_release_lockspace(void *lockspace, int force)
 	error = release_lockspace(ls, force);
 	if (!error)
 		ls_count--;
-	else if (!ls_count)
+	if (!ls_count)
 		threads_stop();
 	mutex_unlock(&ls_lock);
 
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e22bc396134..0d713b69194 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1037,17 +1037,14 @@ static int
 decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 					 struct ecryptfs_crypt_stat *crypt_stat)
 {
-	struct scatterlist dst_sg;
-	struct scatterlist src_sg;
+	struct scatterlist dst_sg[2];
+	struct scatterlist src_sg[2];
 	struct mutex *tfm_mutex;
 	struct blkcipher_desc desc = {
 		.flags = CRYPTO_TFM_REQ_MAY_SLEEP
 	};
 	int rc = 0;
 
-	sg_init_table(&dst_sg, 1);
-	sg_init_table(&src_sg, 1);
-
 	if (unlikely(ecryptfs_verbosity > 0)) {
 		ecryptfs_printk(
 			KERN_DEBUG, "Session key encryption key (size [%d]):\n",
@@ -1066,8 +1063,8 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 	}
 	rc = virt_to_scatterlist(auth_tok->session_key.encrypted_key,
 				 auth_tok->session_key.encrypted_key_size,
-				 &src_sg, 1);
-	if (rc != 1) {
+				 src_sg, 2);
+	if (rc < 1 || rc > 2) {
 		printk(KERN_ERR "Internal error whilst attempting to convert "
 			"auth_tok->session_key.encrypted_key to scatterlist; "
 			"expected rc = 1; got rc = [%d]. "
@@ -1079,8 +1076,8 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 		auth_tok->session_key.encrypted_key_size;
 	rc = virt_to_scatterlist(auth_tok->session_key.decrypted_key,
 				 auth_tok->session_key.decrypted_key_size,
-				 &dst_sg, 1);
-	if (rc != 1) {
+				 dst_sg, 2);
+	if (rc < 1 || rc > 2) {
 		printk(KERN_ERR "Internal error whilst attempting to convert "
 			"auth_tok->session_key.decrypted_key to scatterlist; "
 			"expected rc = 1; got rc = [%d]\n", rc);
@@ -1096,7 +1093,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 		rc = -EINVAL;
 		goto out;
 	}
-	rc = crypto_blkcipher_decrypt(&desc, &dst_sg, &src_sg,
+	rc = crypto_blkcipher_decrypt(&desc, dst_sg, src_sg,
 				      auth_tok->session_key.encrypted_key_size);
 	mutex_unlock(tfm_mutex);
 	if (unlikely(rc)) {
@@ -1539,8 +1536,8 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	size_t i;
 	size_t encrypted_session_key_valid = 0;
 	char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
-	struct scatterlist dst_sg;
-	struct scatterlist src_sg;
+	struct scatterlist dst_sg[2];
+	struct scatterlist src_sg[2];
 	struct mutex *tfm_mutex = NULL;
 	u8 cipher_code;
 	size_t packet_size_length;
@@ -1619,8 +1616,8 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		ecryptfs_dump_hex(session_key_encryption_key, 16);
 	}
 	rc = virt_to_scatterlist(crypt_stat->key, key_rec->enc_key_size,
-				 &src_sg, 1);
-	if (rc != 1) {
+				 src_sg, 2);
+	if (rc < 1 || rc > 2) {
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat session key; expected rc = 1; "
 				"got rc = [%d]. key_rec->enc_key_size = [%d]\n",
@@ -1629,8 +1626,8 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		goto out;
 	}
 	rc = virt_to_scatterlist(key_rec->enc_key, key_rec->enc_key_size,
-				 &dst_sg, 1);
-	if (rc != 1) {
+				 dst_sg, 2);
+	if (rc < 1 || rc > 2) {
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat encrypted session key; "
 				"expected rc = 1; got rc = [%d]. "
@@ -1651,7 +1648,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	rc = 0;
 	ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
 			crypt_stat->key_size);
-	rc = crypto_blkcipher_encrypt(&desc, &dst_sg, &src_sg,
+	rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
 				      (*key_rec).enc_key_size);
 	mutex_unlock(tfm_mutex);
 	if (rc) {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index aec5c13f634..96355d50534 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -102,6 +102,8 @@
 
 #define EP_UNACTIVE_PTR ((void *) -1L)
 
+#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
+
 struct epoll_filefd {
 	struct file *file;
 	int fd;
@@ -200,6 +202,9 @@ struct eventpoll {
 	 * holding ->lock.
 	 */
 	struct epitem *ovflist;
+
+	/* The user that created the eventpoll descriptor */
+	struct user_struct *user;
 };
 
 /* Wait structure used by the poll hooks */
@@ -227,9 +232,17 @@ struct ep_pqueue {
 };
 
 /*
+ * Configuration options available inside /proc/sys/fs/epoll/
+ */
+/* Maximum number of epoll devices, per user */
+static int max_user_instances __read_mostly;
+/* Maximum number of epoll watched descriptors, per user */
+static int max_user_watches __read_mostly;
+
+/*
  * This mutex is used to serialize ep_free() and eventpoll_release_file().
  */
-static struct mutex epmutex;
+static DEFINE_MUTEX(epmutex);
 
 /* Safe wake up implementation */
 static struct poll_safewake psw;
@@ -240,6 +253,33 @@ static struct kmem_cache *epi_cache __read_mostly;
 /* Slab cache used to allocate "struct eppoll_entry" */
 static struct kmem_cache *pwq_cache __read_mostly;
 
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+static int zero;
+
+ctl_table epoll_table[] = {
+	{
+		.procname	= "max_user_instances",
+		.data		= &max_user_instances,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{
+		.procname	= "max_user_watches",
+		.data		= &max_user_watches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{ .ctl_name = 0 }
+};
+#endif /* CONFIG_SYSCTL */
+
 
 /* Setup the structure that is used as key for the RB tree */
 static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -402,6 +442,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	/* At this point it is safe to free the eventpoll item */
 	kmem_cache_free(epi_cache, epi);
 
+	atomic_dec(&ep->user->epoll_watches);
+
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
 		     current, ep, file));
 
@@ -449,6 +491,8 @@ static void ep_free(struct eventpoll *ep)
 
 	mutex_unlock(&epmutex);
 	mutex_destroy(&ep->mtx);
+	atomic_dec(&ep->user->epoll_devs);
+	free_uid(ep->user);
 	kfree(ep);
 }
 
@@ -532,10 +576,19 @@ void eventpoll_release_file(struct file *file)
 
 static int ep_alloc(struct eventpoll **pep)
 {
-	struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+	int error;
+	struct user_struct *user;
+	struct eventpoll *ep;
 
-	if (!ep)
-		return -ENOMEM;
+	user = get_current_user();
+	error = -EMFILE;
+	if (unlikely(atomic_read(&user->epoll_devs) >=
+			max_user_instances))
+		goto free_uid;
+	error = -ENOMEM;
+	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+	if (unlikely(!ep))
+		goto free_uid;
 
 	spin_lock_init(&ep->lock);
 	mutex_init(&ep->mtx);
@@ -544,12 +597,17 @@ static int ep_alloc(struct eventpoll **pep)
 	INIT_LIST_HEAD(&ep->rdllist);
 	ep->rbr = RB_ROOT;
 	ep->ovflist = EP_UNACTIVE_PTR;
+	ep->user = user;
 
 	*pep = ep;
 
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
 		     current, ep));
 	return 0;
+
+free_uid:
+	free_uid(user);
+	return error;
 }
 
 /*
@@ -703,9 +761,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	struct epitem *epi;
 	struct ep_pqueue epq;
 
-	error = -ENOMEM;
+	if (unlikely(atomic_read(&ep->user->epoll_watches) >=
+		     max_user_watches))
+		return -ENOSPC;
 	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
-		goto error_return;
+		return -ENOMEM;
 
 	/* Item initialization follow here ... */
 	INIT_LIST_HEAD(&epi->rdllink);
@@ -735,6 +795,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	 * install process. Namely an allocation for a wait queue failed due
 	 * high memory pressure.
 	 */
+	error = -ENOMEM;
 	if (epi->nwait < 0)
 		goto error_unregister;
 
@@ -765,6 +826,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	spin_unlock_irqrestore(&ep->lock, flags);
 
+	atomic_inc(&ep->user->epoll_watches);
+
 	/* We have to call this outside the lock */
 	if (pwake)
 		ep_poll_safewake(&psw, &ep->poll_wait);
@@ -789,7 +852,7 @@ error_unregister:
 	spin_unlock_irqrestore(&ep->lock, flags);
 
 	kmem_cache_free(epi_cache, epi);
-error_return:
+
 	return error;
 }
 
@@ -1078,6 +1141,7 @@ asmlinkage long sys_epoll_create1(int flags)
 			      flags & O_CLOEXEC);
 	if (fd < 0)
 		ep_free(ep);
+	atomic_inc(&ep->user->epoll_devs);
 
 error_return:
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
@@ -1299,7 +1363,12 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
 
 static int __init eventpoll_init(void)
 {
-	mutex_init(&epmutex);
+	struct sysinfo si;
+
+	si_meminfo(&si);
+	max_user_instances = 128;
+	max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) /
+		EP_ITEM_COST;
 
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_poll_safewake_init(&psw);
diff --git a/fs/exec.c b/fs/exec.c
index 4e834f16d9d..ec5df9a3831 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1159,6 +1159,7 @@ EXPORT_SYMBOL(remove_arg_zero);
  */
 int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 {
+	unsigned int depth = bprm->recursion_depth;
 	int try,retval;
 	struct linux_binfmt *fmt;
 #ifdef __alpha__
@@ -1219,8 +1220,15 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 				continue;
 			read_unlock(&binfmt_lock);
 			retval = fn(bprm, regs);
+			/*
+			 * Restore the depth counter to its starting value
+			 * in this call, so we don't have to rely on every
+			 * load_binary function to restore it on return.
+			 */
+			bprm->recursion_depth = depth;
 			if (retval >= 0) {
-				tracehook_report_exec(fmt, bprm, regs);
+				if (depth == 0)
+					tracehook_report_exec(fmt, bprm, regs);
 				put_binfmt(fmt);
 				allow_write_access(bprm->file);
 				if (bprm->file)
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 80246bad1b7..890e0182881 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -367,6 +367,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 	 * Try to get any dentry for the given file handle from the filesystem.
 	 */
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
+	if (!result)
+		result = ERR_PTR(-ESTALE);
 	if (IS_ERR(result))
 		return result;
 
@@ -420,6 +422,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 
 		target_dir = nop->fh_to_parent(mnt->mnt_sb, fid,
 				fh_len, fileid_type);
+		if (!target_dir)
+			goto err_result;
 		err = PTR_ERR(target_dir);
 		if (IS_ERR(target_dir))
 			goto err_result;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5dec6d1356c..f6c94f232ec 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2375,12 +2375,9 @@ int ext3_force_commit(struct super_block *sb)
 /*
  * Ext3 always journals updates to the superblock itself, so we don't
  * have to propagate any other updates to the superblock on disk at this
- * point.  Just start an async writeback to get the buffers on their way
- * to the disk.
- *
- * This implicitly triggers the writebehind on sync().
+ * point.  (We can probably nuke this function altogether, and remove
+ * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...)
  */
-
 static void ext3_write_super (struct super_block * sb)
 {
 	if (mutex_trylock(&sb->s_lock) != 0)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2003cdc36a..db35cfdb3c8 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -609,8 +609,8 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 
 	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
 						EXT4_FREEBLOCKS_WATERMARK) {
-		free_blocks  = percpu_counter_sum(fbc);
-		dirty_blocks = percpu_counter_sum(dbc);
+		free_blocks  = percpu_counter_sum_positive(fbc);
+		dirty_blocks = percpu_counter_sum_positive(dbc);
 		if (dirty_blocks < 0) {
 			printk(KERN_CRIT "Dirty block accounting "
 					"went wrong %lld\n",
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ac4f7db9f13..549daf8005f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -19,6 +19,7 @@
 #include <linux/signal.h>
 #include <linux/rcupdate.h>
 #include <linux/pid_namespace.h>
+#include <linux/smp_lock.h>
 
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -175,6 +176,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 	if (error)
 		return error;
 
+	/*
+	 * We still need a lock here for now to keep multiple FASYNC calls
+	 * from racing with each other.
+	 */
+	lock_kernel();
 	if ((arg ^ filp->f_flags) & FASYNC) {
 		if (filp->f_op && filp->f_op->fasync) {
 			error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
@@ -185,6 +191,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 
 	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
  out:
+	unlock_kernel();
 	return error;
 }
 
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 6ae9011b95e..2f34f8f2134 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -81,7 +81,7 @@ extern int do_rmdir(const char *file);
 extern int do_mknod(const char *file, int mode, unsigned int major,
 		    unsigned int minor);
 extern int link_file(const char *from, const char *to);
-extern int do_readlink(char *file, char *buf, int size);
+extern int hostfs_do_readlink(char *file, char *buf, int size);
 extern int rename_file(char *from, char *to);
 extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 		     long long *bfree_out, long long *bavail_out,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 7f34f4385de..3a31451ac17 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -168,7 +168,7 @@ static char *follow_link(char *link)
 		if (name == NULL)
 			goto out;
 
-		n = do_readlink(link, name, len);
+		n = hostfs_do_readlink(link, name, len);
 		if (n < len)
 			break;
 		len *= 2;
@@ -943,7 +943,7 @@ int hostfs_link_readpage(struct file *file, struct page *page)
 	name = inode_name(page->mapping->host, 0);
 	if (name == NULL)
 		return -ENOMEM;
-	err = do_readlink(name, buffer, PAGE_CACHE_SIZE);
+	err = hostfs_do_readlink(name, buffer, PAGE_CACHE_SIZE);
 	kfree(name);
 	if (err == PAGE_CACHE_SIZE)
 		err = -E2BIG;
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 53fd0a67c11..b79424f9328 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -377,7 +377,7 @@ int link_file(const char *to, const char *from)
 	return 0;
 }
 
-int do_readlink(char *file, char *buf, int size)
+int hostfs_do_readlink(char *file, char *buf, int size)
 {
 	int n;
 
diff --git a/fs/inotify.c b/fs/inotify.c
index 690e72595e6..dae3f28f30d 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -106,6 +106,20 @@ void get_inotify_watch(struct inotify_watch *watch)
 }
 EXPORT_SYMBOL_GPL(get_inotify_watch);
 
+int pin_inotify_watch(struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	spin_lock(&sb_lock);
+	if (sb->s_count >= S_BIAS) {
+		atomic_inc(&sb->s_active);
+		spin_unlock(&sb_lock);
+		atomic_inc(&watch->count);
+		return 1;
+	}
+	spin_unlock(&sb_lock);
+	return 0;
+}
+
 /**
  * put_inotify_watch - decrements the ref count on a given watch.  cleans up
  * watch references if the count reaches zero.  inotify_watch is freed by
@@ -124,6 +138,13 @@ void put_inotify_watch(struct inotify_watch *watch)
 }
 EXPORT_SYMBOL_GPL(put_inotify_watch);
 
+void unpin_inotify_watch(struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	put_inotify_watch(watch);
+	deactivate_super(sb);
+}
+
 /*
  * inotify_handle_get_wd - returns the next WD for use by the given handle
  *
@@ -407,11 +428,13 @@ void inotify_unmount_inodes(struct list_head *list)
 		watches = &inode->inotify_watches;
 		list_for_each_entry_safe(watch, next_w, watches, i_list) {
 			struct inotify_handle *ih= watch->ih;
+			get_inotify_watch(watch);
 			mutex_lock(&ih->mutex);
 			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
 						 NULL, NULL);
 			inotify_remove_watch_locked(ih, watch);
 			mutex_unlock(&ih->mutex);
+			put_inotify_watch(watch);
 		}
 		mutex_unlock(&inode->inotify_mutex);
 		iput(inode);		
@@ -479,6 +502,112 @@ void inotify_init_watch(struct inotify_watch *watch)
 }
 EXPORT_SYMBOL_GPL(inotify_init_watch);
 
+/*
+ * Watch removals suck violently.  To kick the watch out we need (in this
+ * order) inode->inotify_mutex and ih->mutex.  That's fine if we have
+ * a hold on inode; however, for all other cases we need to make damn sure
+ * we don't race with umount.  We can *NOT* just grab a reference to a
+ * watch - inotify_unmount_inodes() will happily sail past it and we'll end
+ * with reference to inode potentially outliving its superblock.  Ideally
+ * we just want to grab an active reference to superblock if we can; that
+ * will make sure we won't go into inotify_umount_inodes() until we are
+ * done.  Cleanup is just deactivate_super().  However, that leaves a messy
+ * case - what if we *are* racing with umount() and active references to
+ * superblock can't be acquired anymore?  We can bump ->s_count, grab
+ * ->s_umount, which will almost certainly wait until the superblock is shut
+ * down and the watch in question is pining for fjords.  That's fine, but
+ * there is a problem - we might have hit the window between ->s_active
+ * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
+ * is past the point of no return and is heading for shutdown) and the
+ * moment when deactivate_super() acquires ->s_umount.  We could just do
+ * drop_super() yield() and retry, but that's rather antisocial and this
+ * stuff is luser-triggerable.  OTOH, having grabbed ->s_umount and having
+ * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
+ * that we won't race with inotify_umount_inodes().  So we could grab a
+ * reference to watch and do the rest as above, just with drop_super() instead
+ * of deactivate_super(), right?  Wrong.  We had to drop ih->mutex before we
+ * could grab ->s_umount.  So the watch could've been gone already.
+ *
+ * That still can be dealt with - we need to save watch->wd, do idr_find()
+ * and compare its result with our pointer.  If they match, we either have
+ * the damn thing still alive or we'd lost not one but two races at once,
+ * the watch had been killed and a new one got created with the same ->wd
+ * at the same address.  That couldn't have happened in inotify_destroy(),
+ * but inotify_rm_wd() could run into that.  Still, "new one got created"
+ * is not a problem - we have every right to kill it or leave it alone,
+ * whatever's more convenient.
+ *
+ * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
+ * "grab it and kill it" check.  If it's been our original watch, we are
+ * fine, if it's a newcomer - nevermind, just pretend that we'd won the
+ * race and kill the fscker anyway; we are safe since we know that its
+ * superblock won't be going away.
+ *
+ * And yes, this is far beyond mere "not very pretty"; so's the entire
+ * concept of inotify to start with.
+ */
+
+/**
+ * pin_to_kill - pin the watch down for removal
+ * @ih: inotify handle
+ * @watch: watch to kill
+ *
+ * Called with ih->mutex held, drops it.  Possible return values:
+ * 0 - nothing to do, it has died
+ * 1 - remove it, drop the reference and deactivate_super()
+ * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
+ * that variant, since it involved a lot of PITA, but that's the best that
+ * could've been done.
+ */
+static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	s32 wd = watch->wd;
+
+	spin_lock(&sb_lock);
+	if (sb->s_count >= S_BIAS) {
+		atomic_inc(&sb->s_active);
+		spin_unlock(&sb_lock);
+		get_inotify_watch(watch);
+		mutex_unlock(&ih->mutex);
+		return 1;	/* the best outcome */
+	}
+	sb->s_count++;
+	spin_unlock(&sb_lock);
+	mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
+	down_read(&sb->s_umount);
+	if (likely(!sb->s_root)) {
+		/* fs is already shut down; the watch is dead */
+		drop_super(sb);
+		return 0;
+	}
+	/* raced with the final deactivate_super() */
+	mutex_lock(&ih->mutex);
+	if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
+		/* the watch is dead */
+		mutex_unlock(&ih->mutex);
+		drop_super(sb);
+		return 0;
+	}
+	/* still alive or freed and reused with the same sb and wd; kill */
+	get_inotify_watch(watch);
+	mutex_unlock(&ih->mutex);
+	return 2;
+}
+
+static void unpin_and_kill(struct inotify_watch *watch, int how)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	put_inotify_watch(watch);
+	switch (how) {
+	case 1:
+		deactivate_super(sb);
+		break;
+	case 2:
+		drop_super(sb);
+	}
+}
+
 /**
  * inotify_destroy - clean up and destroy an inotify instance
  * @ih: inotify handle
@@ -490,11 +619,15 @@ void inotify_destroy(struct inotify_handle *ih)
 	 * pretty.  We cannot do a simple iteration over the list, because we
 	 * do not know the inode until we iterate to the watch.  But we need to
 	 * hold inode->inotify_mutex before ih->mutex.  The following works.
+	 *
+	 * AV: it had to become even uglier to start working ;-/
 	 */
 	while (1) {
 		struct inotify_watch *watch;
 		struct list_head *watches;
+		struct super_block *sb;
 		struct inode *inode;
+		int how;
 
 		mutex_lock(&ih->mutex);
 		watches = &ih->watches;
@@ -503,8 +636,10 @@ void inotify_destroy(struct inotify_handle *ih)
 			break;
 		}
 		watch = list_first_entry(watches, struct inotify_watch, h_list);
-		get_inotify_watch(watch);
-		mutex_unlock(&ih->mutex);
+		sb = watch->inode->i_sb;
+		how = pin_to_kill(ih, watch);
+		if (!how)
+			continue;
 
 		inode = watch->inode;
 		mutex_lock(&inode->inotify_mutex);
@@ -518,7 +653,7 @@ void inotify_destroy(struct inotify_handle *ih)
 
 		mutex_unlock(&ih->mutex);
 		mutex_unlock(&inode->inotify_mutex);
-		put_inotify_watch(watch);
+		unpin_and_kill(watch, how);
 	}
 
 	/* free this handle: the put matching the get in inotify_init() */
@@ -719,7 +854,9 @@ void inotify_evict_watch(struct inotify_watch *watch)
 int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 {
 	struct inotify_watch *watch;
+	struct super_block *sb;
 	struct inode *inode;
+	int how;
 
 	mutex_lock(&ih->mutex);
 	watch = idr_find(&ih->idr, wd);
@@ -727,9 +864,12 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 		mutex_unlock(&ih->mutex);
 		return -EINVAL;
 	}
-	get_inotify_watch(watch);
+	sb = watch->inode->i_sb;
+	how = pin_to_kill(ih, watch);
+	if (!how)
+		return 0;
+
 	inode = watch->inode;
-	mutex_unlock(&ih->mutex);
 
 	mutex_lock(&inode->inotify_mutex);
 	mutex_lock(&ih->mutex);
@@ -740,7 +880,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 
 	mutex_unlock(&ih->mutex);
 	mutex_unlock(&inode->inotify_mutex);
-	put_inotify_watch(watch);
+	unpin_and_kill(watch, how);
 
 	return 0;
 }
diff --git a/fs/ioctl.c b/fs/ioctl.c
index d152856c371..43e8b2c0664 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -400,11 +400,9 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
 
 	/* Did FASYNC state change ? */
 	if ((flag ^ filp->f_flags) & FASYNC) {
-		if (filp->f_op && filp->f_op->fasync) {
-			lock_kernel();
+		if (filp->f_op && filp->f_op->fasync)
 			error = filp->f_op->fasync(fd, filp, on);
-			unlock_kernel();
-		} else
+		else
 			error = -ENOTTY;
 	}
 	if (error)
@@ -440,11 +438,17 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		break;
 
 	case FIONBIO:
+		/* BKL needed to avoid races tweaking f_flags */
+		lock_kernel();
 		error = ioctl_fionbio(filp, argp);
+		unlock_kernel();
 		break;
 
 	case FIOASYNC:
+		/* BKL needed to avoid races tweaking f_flags */
+		lock_kernel();
 		error = ioctl_fioasync(fd, filp, argp);
+		unlock_kernel();
 		break;
 
 	case FIOQSIZE:
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 9fd8889097b..70fc63a1727 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -167,7 +167,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 			continue;
 		if (host->h_server != ni->server)
 			continue;
-		if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
+		if (ni->server &&
+		    !nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
 			continue;
 
 		/* Move to head of hash chain. */
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index c631a83931c..56b076736b5 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -181,6 +181,7 @@ lockd(void *vrqstp)
 	}
 	flush_signals(current);
 	cancel_delayed_work_sync(&grace_period_end);
+	locks_end_grace(&lockd_manager);
 	if (nlmsvc_ops)
 		nlmsvc_invalidate_all();
 	nlm_shutdown_hosts();
diff --git a/fs/namei.c b/fs/namei.c
index 09ce58e49e7..d34e0f9681c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1378,7 +1378,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 	if (IS_APPEND(dir))
 		return -EPERM;
 	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
-	    IS_IMMUTABLE(victim->d_inode))
+	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
 		return -EPERM;
 	if (isdir) {
 		if (!S_ISDIR(victim->d_inode->i_mode))
diff --git a/fs/namespace.c b/fs/namespace.c
index cce46702d33..65b3dc844c8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1815,8 +1815,8 @@ static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
 		while (!list_empty(&graveyard)) {
 			m = list_first_entry(&graveyard, struct vfsmount,
 						mnt_expire);
-			touch_mnt_namespace(mnt->mnt_ns);
-			umount_tree(mnt, 1, umounts);
+			touch_mnt_namespace(m->mnt_ns);
+			umount_tree(m, 1, umounts);
 		}
 	}
 }
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index bb93946ace2..b79ec930d9f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -225,12 +225,12 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 		return 0;
 
 	nfs4_save_user(&uid, &gid);
+	INIT_LIST_HEAD(dentries);
 
 	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY);
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
 		goto out;
-	INIT_LIST_HEAD(dentries);
 	status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
 	fput(filp);
 	while (!list_empty(dentries)) {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b0bebc552a1..1a052ac2bde 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3261,6 +3261,7 @@ nfs4_state_shutdown(void)
 {
 	cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work);
 	destroy_workqueue(laundry_wq);
+	locks_end_grace(&nfsd4_manager);
 	nfs4_lock_state();
 	nfs4_release_reclaim();
 	__nfs4_state_shutdown();
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 5e6724c1afd..2142b1c68b6 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -30,7 +30,8 @@
 
 extern int debug_msgs;
 
-#if 0 /* Fool kernel-doc since it doesn't do macros yet */
+extern void __ntfs_debug(const char *file, int line, const char *function,
+	const char *format, ...) __attribute__ ((format (printf, 4, 5)));
 /**
  * ntfs_debug - write a debug level message to syslog
  * @f:		a printf format string containing the message
@@ -39,11 +40,6 @@ extern int debug_msgs;
  * ntfs_debug() writes a DEBUG level message to the syslog but only if the
  * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP.
  */
-static void ntfs_debug(const char *f, ...);
-#endif
-
-extern void __ntfs_debug (const char *file, int line, const char *function,
-	const char *format, ...) __attribute__ ((format (printf, 4, 5)));
 #define ntfs_debug(f, a...)						\
 	__ntfs_debug(__FILE__, __LINE__, __func__, f, ##a)
 
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 7e947c67246..3a178ec48d7 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -112,7 +112,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 		bh = bhs[i];
 
 		if (buffer_jbd(bh)) {
-			mlog(ML_ERROR,
+			mlog(ML_BH_IO,
 			     "trying to sync read a jbd "
 			     "managed bh (blocknr = %llu), skipping\n",
 			     (unsigned long long)bh->b_blocknr);
@@ -147,15 +147,10 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 	for (i = nr; i > 0; i--) {
 		bh = bhs[i - 1];
 
-		if (buffer_jbd(bh)) {
-			mlog(ML_ERROR,
-			     "the journal got the buffer while it was "
-			     "locked for io! (blocknr = %llu)\n",
-			     (unsigned long long)bh->b_blocknr);
-			BUG();
-		}
+		/* No need to wait on the buffer if it's managed by JBD. */
+		if (!buffer_jbd(bh))
+			wait_on_buffer(bh);
 
-		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			/* Status won't be cleared from here on out,
 			 * so we can safely record this and loop back
@@ -251,8 +246,6 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 			ignore_cache = 1;
 		}
 
-		/* XXX: Can we ever get this and *not* have the cached
-		 * flag set? */
 		if (buffer_jbd(bh)) {
 			if (ignore_cache)
 				mlog(ML_BH_IO, "trying to sync read a jbd "
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 533a789c3ef..ba962d71b34 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -608,8 +608,10 @@ static int __init init_dlmfs_fs(void)
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 					SLAB_MEM_SPREAD),
 				dlmfs_init_once);
-	if (!dlmfs_inode_cache)
+	if (!dlmfs_inode_cache) {
+		status = -ENOMEM;
 		goto bail;
+	}
 	cleanup_inode = 1;
 
 	user_dlm_worker = create_singlethread_workqueue("user_dlm");
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h
index 39ec2773849..0c3cc03c61f 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlm/userdlm.h
@@ -33,7 +33,7 @@
 #include <linux/workqueue.h>
 
 /* user_lock_res->l_flags flags. */
-#define USER_LOCK_ATTACHED      (0x00000001) /* have we initialized
+#define USER_LOCK_ATTACHED      (0x00000001) /* we have initialized
 					       * the lvb */
 #define USER_LOCK_BUSY          (0x00000002) /* we are currently in
 					       * dlm_lock */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index ec684426034..6e6cc0a2e5f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2841,9 +2841,8 @@ static void ocfs2_unlock_ast(void *opaque, int error)
 
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
 	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
-	spin_unlock_irqrestore(&lockres->l_lock, flags);
-
 	wake_up(&lockres->l_event);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	mlog_exit_void();
 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7efe937a415..e2570a3bc2b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -247,8 +247,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
 	mlog_entry_void();
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -312,8 +312,8 @@ static int ocfs2_simple_size_update(struct inode *inode,
 	handle_t *handle = NULL;
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1055,8 +1055,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1259,8 +1259,8 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	}
 
 	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1352,8 +1352,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
 		goto out;
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1866,6 +1866,13 @@ relock:
 		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
 						    ppos, count, ocount);
 		if (written < 0) {
+			/*
+			 * direct write may have instantiated a few
+			 * blocks outside i_size. Trim these off again.
+			 * Don't need i_size_read because we hold i_mutex.
+			 */
+			if (*ppos + count > inode->i_size)
+				vmtruncate(inode, inode->i_size);
 			ret = written;
 			goto out_dio;
 		}
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4903688f72a..7aa00d51187 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1106,6 +1106,12 @@ void ocfs2_clear_inode(struct inode *inode)
 	oi->ip_last_trans = 0;
 	oi->ip_dir_start_lookup = 0;
 	oi->ip_blkno = 0ULL;
+
+	/*
+	 * ip_jinode is used to track txns against this inode. We ensure that
+	 * the journal is flushed before journal shutdown. Thus it is safe to
+	 * have inodes get cleaned up after journal shutdown.
+	 */
 	jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
 				       &oi->ip_jinode);
 
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 81e40677eec..99fe9d584f3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -690,6 +690,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 
 	/* Shutdown the kernel journal system */
 	jbd2_journal_destroy(journal->j_journal);
+	journal->j_journal = NULL;
 
 	OCFS2_I(inode)->ip_open_count--;
 
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 3dc18d67557..eea1d24713e 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -113,7 +113,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
 	 * ocfs2_write_begin_nolock().
 	 */
 	if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
-		ret = -EINVAL;
+		/*
+		 * the page has been umapped in ocfs2_data_downconvert_worker.
+		 * So return 0 here and let VFS retry.
+		 */
+		ret = 0;
 		goto out;
 	}
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 485a6aa0ad3..f4967e634ff 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -378,8 +378,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	}
 
 	inode = new_inode(dir->i_sb);
-	if (IS_ERR(inode)) {
-		status = PTR_ERR(inode);
+	if (!inode) {
+		status = -ENOMEM;
 		mlog(ML_ERROR, "new_inode failed!\n");
 		goto leave;
 	}
@@ -491,8 +491,10 @@ leave:
 			brelse(*new_fe_bh);
 			*new_fe_bh = NULL;
 		}
-		if (inode)
+		if (inode) {
+			clear_nlink(inode);
 			iput(inode);
+		}
 	}
 
 	mlog_exit(status);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a21a465490c..3fed9e3d899 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -85,7 +85,7 @@ enum ocfs2_unlock_action {
 };
 
 /* ocfs2_lock_res->l_flags flags. */
-#define OCFS2_LOCK_ATTACHED      (0x00000001) /* have we initialized
+#define OCFS2_LOCK_ATTACHED      (0x00000001) /* we have initialized
 					       * the lvb */
 #define OCFS2_LOCK_BUSY          (0x00000002) /* we are currently in
 					       * dlm_lock */
@@ -473,6 +473,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 		(____gd)->bg_signature);				\
 } while (0)
 
+#define OCFS2_IS_VALID_XATTR_BLOCK(ptr)					\
+	(!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
+
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 					   u64 blkno)
 {
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f24ce3d3f95..5e0c0d0aef7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -86,7 +86,8 @@
 #define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
 	OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
 
-#define OCFS2_FEATURE_COMPAT_SUPP	OCFS2_FEATURE_COMPAT_BACKUP_SB
+#define OCFS2_FEATURE_COMPAT_SUPP	(OCFS2_FEATURE_COMPAT_BACKUP_SB	\
+					 | OCFS2_FEATURE_COMPAT_JBD2_SB)
 #define OCFS2_FEATURE_INCOMPAT_SUPP	(OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
 					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
 					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
@@ -153,6 +154,11 @@
 #define OCFS2_FEATURE_COMPAT_BACKUP_SB		0x0001
 
 /*
+ * The filesystem will correctly handle journal feature bits.
+ */
+#define OCFS2_FEATURE_COMPAT_JBD2_SB		0x0002
+
+/*
  * Unwritten extents support.
  */
 #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN	0x0001
@@ -742,12 +748,12 @@ struct ocfs2_group_desc
  */
 struct ocfs2_xattr_entry {
 	__le32	xe_name_hash;    /* hash value of xattr prefix+suffix. */
-	__le16	xe_name_offset;  /* byte offset from the 1st etnry in the local
+	__le16	xe_name_offset;  /* byte offset from the 1st entry in the
 				    local xattr storage(inode, xattr block or
 				    xattr bucket). */
 	__u8	xe_name_len;	 /* xattr name len, does't include prefix. */
-	__u8	xe_type;         /* the low 7 bits indicates the name prefix's
-				  * type and the highest 1 bits indicate whether
+	__u8	xe_type;         /* the low 7 bits indicate the name prefix
+				  * type and the highest bit indicates whether
 				  * the EA is stored in the local storage. */
 	__le64	xe_value_size;	 /* real xattr value length. */
 };
@@ -766,9 +772,10 @@ struct ocfs2_xattr_header {
 						   xattr. */
 	__le16	xh_name_value_len;              /* total length of name/value
 						   length in this bucket. */
-	__le16	xh_num_buckets;                 /* bucket nums in one extent
-						   record, only valid in the
-						   first bucket. */
+	__le16	xh_num_buckets;                 /* Number of xattr buckets
+						   in this extent record,
+						   only valid in the first
+						   bucket. */
 	__le64  xh_csum;
 	struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
 };
@@ -776,8 +783,8 @@ struct ocfs2_xattr_header {
 /*
  * On disk structure for xattr value root.
  *
- * It is used when one extended attribute's size is larger, and we will save it
- * in an outside cluster. It will stored in a b-tree like file content.
+ * When an xattr's value is large enough, it is stored in an external
+ * b-tree like file data.  The xattr value root points to this structure.
  */
 struct ocfs2_xattr_value_root {
 /*00*/	__le32	xr_clusters;              /* clusters covered by xattr value. */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index faec2d87935..9b76d41a8ac 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -740,6 +740,9 @@ static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
 
 static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 {
+	if (!lksb->lksb_fsdlm.sb_lvbptr)
+		lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
+					     sizeof(struct dlm_lksb);
 	return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
 }
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 802c4149221..74d7367ade1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3,25 +3,20 @@
  *
  * xattr.c
  *
- * Copyright (C) 2008 Oracle.  All rights reserved.
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
  *
  * CREDITS:
- * Lots of code in this file is taken from ext3.
+ * Lots of code in this file is copy from linux/fs/ext3/xattr.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * License version 2 as published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/capability.h>
@@ -83,7 +78,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
 	NULL
 };
 
-static struct xattr_handler *ocfs2_xattr_handler_map[] = {
+static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
 	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,
 };
@@ -116,6 +111,10 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
 					     int *block_off,
 					     int *new_offset);
 
+static int ocfs2_xattr_block_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs);
 static int ocfs2_xattr_index_block_find(struct inode *inode,
 					struct buffer_head *root_bh,
 					int name_index,
@@ -137,6 +136,24 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 static int ocfs2_delete_xattr_index_block(struct inode *inode,
 					  struct buffer_head *xb_bh);
 
+static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+{
+	return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
+}
+
+static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
+{
+	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
+}
+
+static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
+{
+	u16 len = sb->s_blocksize -
+		 offsetof(struct ocfs2_xattr_header, xh_entries);
+
+	return len / sizeof(struct ocfs2_xattr_entry);
+}
+
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
 	struct xattr_handler *handler = NULL;
@@ -542,14 +559,12 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	/*Verify the signature of xattr block*/
-	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-		ret = -EFAULT;
-		goto cleanup;
-	}
 
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ret = -EIO;
+		goto cleanup;
+	}
 
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
@@ -749,47 +764,25 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 				 size_t buffer_size,
 				 struct ocfs2_xattr_search *xs)
 {
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	struct buffer_head *blk_bh = NULL;
 	struct ocfs2_xattr_block *xb;
 	struct ocfs2_xattr_value_root *xv;
 	size_t size;
 	int ret = -ENODATA, name_offset, name_len, block_off, i;
 
-	if (!di->i_xattr_loc)
-		return ret;
-
 	memset(&xs->bucket, 0, sizeof(xs->bucket));
 
-	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
-	if (ret < 0) {
+	ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
+	if (ret) {
 		mlog_errno(ret);
-		return ret;
-	}
-	/*Verify the signature of xattr block*/
-	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-		ret = -EFAULT;
 		goto cleanup;
 	}
 
-	xs->xattr_bh = blk_bh;
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-
-	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
-		xs->header = &xb->xb_attrs.xb_header;
-		xs->base = (void *)xs->header;
-		xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
-		xs->here = xs->header->xh_entries;
-
-		ret = ocfs2_xattr_find_entry(name_index, name, xs);
-	} else
-		ret = ocfs2_xattr_index_block_find(inode, blk_bh,
-						   name_index,
-						   name, xs);
-
-	if (ret)
+	if (xs->not_found) {
+		ret = -ENODATA;
 		goto cleanup;
+	}
+
+	xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 	size = le64_to_cpu(xs->here->xe_value_size);
 	if (buffer) {
 		ret = -ERANGE;
@@ -828,7 +821,8 @@ cleanup:
 		brelse(xs->bucket.bhs[i]);
 	memset(&xs->bucket, 0, sizeof(xs->bucket));
 
-	brelse(blk_bh);
+	brelse(xs->xattr_bh);
+	xs->xattr_bh = NULL;
 	return ret;
 }
 
@@ -837,11 +831,11 @@ cleanup:
  * Copy an extended attribute into the buffer provided.
  * Buffer is NULL to compute the size of buffer required.
  */
-int ocfs2_xattr_get(struct inode *inode,
-		    int name_index,
-		    const char *name,
-		    void *buffer,
-		    size_t buffer_size)
+static int ocfs2_xattr_get(struct inode *inode,
+			   int name_index,
+			   const char *name,
+			   void *buffer,
+			   size_t buffer_size)
 {
 	int ret;
 	struct ocfs2_dinode *di = NULL;
@@ -871,7 +865,7 @@ int ocfs2_xattr_get(struct inode *inode,
 	down_read(&oi->ip_xattr_sem);
 	ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
 				    buffer_size, &xis);
-	if (ret == -ENODATA)
+	if (ret == -ENODATA && di->i_xattr_loc)
 		ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
 					    buffer_size, &xbs);
 	up_read(&oi->ip_xattr_sem);
@@ -1229,7 +1223,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 
 	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
 	if (free < 0)
-		return -EFAULT;
+		return -EIO;
 
 	if (!xs->not_found) {
 		size_t size = 0;
@@ -1514,10 +1508,9 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 		goto out;
 	}
 
-	/*Verify the signature of xattr block*/
-	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-		ret = -EFAULT;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ret = -EIO;
 		goto out;
 	}
 
@@ -1527,7 +1520,6 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 		goto out;
 	}
 
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 	blk = le64_to_cpu(xb->xb_blkno);
 	bit = le16_to_cpu(xb->xb_suballoc_bit);
 	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1771,15 +1763,14 @@ static int ocfs2_xattr_block_find(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	/*Verify the signature of xattr block*/
-	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-			ret = -EFAULT;
-			goto cleanup;
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ret = -EIO;
+		goto cleanup;
 	}
 
 	xs->xattr_bh = blk_bh;
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		xs->header = &xb->xb_attrs.xb_header;
@@ -1806,52 +1797,6 @@ cleanup:
 }
 
 /*
- * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree
- * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header
- * re-initialized.
- */
-static int ocfs2_restore_xattr_block(struct inode *inode,
-				     struct ocfs2_xattr_search *xs)
-{
-	int ret;
-	handle_t *handle;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_xattr_block *xb =
-		(struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
-	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
-	u16 xb_flags = le16_to_cpu(xb->xb_flags);
-
-	BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
-		le16_to_cpu(el->l_next_free_rec) != 0);
-
-	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		handle = NULL;
-		goto out;
-	}
-
-	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
-	       offsetof(struct ocfs2_xattr_block, xb_attrs));
-
-	xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
-
-	ocfs2_journal_dirty(handle, xs->xattr_bh);
-
-out_commit:
-	ocfs2_commit_trans(osb, handle);
-out:
-	return ret;
-}
-
-/*
  * ocfs2_xattr_block_set()
  *
  * Set, replace or remove an extended attribute into external block.
@@ -1961,8 +1906,6 @@ out:
 	}
 
 	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
-	if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
-		ret = ocfs2_restore_xattr_block(inode, xs);
 
 end:
 
@@ -2398,7 +2341,8 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
 	BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
 
 	mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
-	     "in the rec is %u\n", num_clusters, p_blkno, first_hash);
+	     "in the rec is %u\n", num_clusters, (unsigned long long)p_blkno,
+	     first_hash);
 
 	ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
 				      p_blkno, first_hash, num_clusters, xs);
@@ -2422,7 +2366,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 	memset(&bucket, 0, sizeof(bucket));
 
 	mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
-	     clusters, blkno);
+	     clusters, (unsigned long long)blkno);
 
 	for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
 		ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
@@ -2440,7 +2384,8 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 		if (i == 0)
 			num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
 
-		mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno,
+		mlog(0, "iterating xattr bucket %llu, first hash %u\n",
+		     (unsigned long long)blkno,
 		     le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
 		if (func) {
 			ret = func(inode, &bucket, para);
@@ -2700,9 +2645,9 @@ static int ocfs2_xattr_update_xattr_search(struct inode *inode,
 				return ret;
 			}
 
-			i = xs->here - old_xh->xh_entries;
-			xs->here = &xs->header->xh_entries[i];
 		}
+		i = xs->here - old_xh->xh_entries;
+		xs->here = &xs->header->xh_entries[i];
 	}
 
 	return ret;
@@ -2776,7 +2721,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	 */
 	blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
 
-	mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno);
+	mlog(0, "allocate 1 cluster from %llu to xattr block\n",
+	     (unsigned long long)blkno);
 
 	xh_bh = sb_getblk(inode->i_sb, blkno);
 	if (!xh_bh) {
@@ -2818,7 +2764,11 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	if (data_bh)
 		ocfs2_journal_dirty(handle, data_bh);
 
-	ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+	ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
 
 	/* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
 	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
@@ -2941,8 +2891,8 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 
 	mlog(0, "adjust xattr bucket in %llu, count = %u, "
 	     "xh_free_start = %u, xh_name_value_len = %u.\n",
-	     blkno, le16_to_cpu(xh->xh_count), xh_free_start,
-	     le16_to_cpu(xh->xh_name_value_len));
+	     (unsigned long long)blkno, le16_to_cpu(xh->xh_count),
+	     xh_free_start, le16_to_cpu(xh->xh_name_value_len));
 
 	/*
 	 * sort all the entries by their offset.
@@ -3058,7 +3008,7 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 	prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
 
 	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
-	     prev_blkno, new_blkno);
+	     (unsigned long long)prev_blkno, (unsigned long long)new_blkno);
 
 	/*
 	 * We need to update the 1st half of the new cluster and
@@ -3168,26 +3118,74 @@ static int ocfs2_read_xattr_bucket(struct inode *inode,
 }
 
 /*
- * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk).
+ * Find the suitable pos when we divide a bucket into 2.
+ * We have to make sure the xattrs with the same hash value exist
+ * in the same bucket.
+ *
+ * If this ocfs2_xattr_header covers more than one hash value, find a
+ * place where the hash value changes.  Try to find the most even split.
+ * The most common case is that all entries have different hash values,
+ * and the first check we make will find a place to split.
+ */
+static int ocfs2_xattr_find_divide_pos(struct ocfs2_xattr_header *xh)
+{
+	struct ocfs2_xattr_entry *entries = xh->xh_entries;
+	int count = le16_to_cpu(xh->xh_count);
+	int delta, middle = count / 2;
+
+	/*
+	 * We start at the middle.  Each step gets farther away in both
+	 * directions.  We therefore hit the change in hash value
+	 * nearest to the middle.  Note that this loop does not execute for
+	 * count < 2.
+	 */
+	for (delta = 0; delta < middle; delta++) {
+		/* Let's check delta earlier than middle */
+		if (cmp_xe(&entries[middle - delta - 1],
+			   &entries[middle - delta]))
+			return middle - delta;
+
+		/* For even counts, don't walk off the end */
+		if ((middle + delta + 1) == count)
+			continue;
+
+		/* Now try delta past middle */
+		if (cmp_xe(&entries[middle + delta],
+			   &entries[middle + delta + 1]))
+			return middle + delta + 1;
+	}
+
+	/* Every entry had the same hash */
+	return count;
+}
+
+/*
+ * Move some xattrs in old bucket(blk) to new bucket(new_blk).
  * first_hash will record the 1st hash of the new bucket.
+ *
+ * Normally half of the xattrs will be moved.  But we have to make
+ * sure that the xattrs with the same hash value are stored in the
+ * same bucket. If all the xattrs in this bucket have the same hash
+ * value, the new bucket will be initialized as an empty one and the
+ * first_hash will be initialized as (hash_value+1).
  */
-static int ocfs2_half_xattr_bucket(struct inode *inode,
-				   handle_t *handle,
-				   u64 blk,
-				   u64 new_blk,
-				   u32 *first_hash,
-				   int new_bucket_head)
+static int ocfs2_divide_xattr_bucket(struct inode *inode,
+				    handle_t *handle,
+				    u64 blk,
+				    u64 new_blk,
+				    u32 *first_hash,
+				    int new_bucket_head)
 {
 	int ret, i;
-	u16 count, start, len, name_value_len, xe_len, name_offset;
+	int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	struct buffer_head **s_bhs, **t_bhs = NULL;
 	struct ocfs2_xattr_header *xh;
 	struct ocfs2_xattr_entry *xe;
 	int blocksize = inode->i_sb->s_blocksize;
 
-	mlog(0, "move half of xattrs from bucket %llu to %llu\n",
-	     blk, new_blk);
+	mlog(0, "move some of xattrs from bucket %llu to %llu\n",
+	     (unsigned long long)blk, (unsigned long long)new_blk);
 
 	s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
 	if (!s_bhs)
@@ -3220,21 +3218,44 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
 
 	for (i = 0; i < blk_per_bucket; i++) {
 		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
-					   OCFS2_JOURNAL_ACCESS_CREATE);
+					   new_bucket_head ?
+					   OCFS2_JOURNAL_ACCESS_CREATE :
+					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	}
 
+	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+	count = le16_to_cpu(xh->xh_count);
+	start = ocfs2_xattr_find_divide_pos(xh);
+
+	if (start == count) {
+		xe = &xh->xh_entries[start-1];
+
+		/*
+		 * initialized a new empty bucket here.
+		 * The hash value is set as one larger than
+		 * that of the last entry in the previous bucket.
+		 */
+		for (i = 0; i < blk_per_bucket; i++)
+			memset(t_bhs[i]->b_data, 0, blocksize);
+
+		xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+		xh->xh_free_start = cpu_to_le16(blocksize);
+		xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
+		le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
+
+		goto set_num_buckets;
+	}
+
 	/* copy the whole bucket to the new first. */
 	for (i = 0; i < blk_per_bucket; i++)
 		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
 
 	/* update the new bucket. */
 	xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
-	count = le16_to_cpu(xh->xh_count);
-	start = count / 2;
 
 	/*
 	 * Calculate the total name/value len and xh_free_start for
@@ -3291,6 +3312,7 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
 			xh->xh_free_start = xe->xe_name_offset;
 	}
 
+set_num_buckets:
 	/* set xh->xh_num_buckets for the new xh. */
 	if (new_bucket_head)
 		xh->xh_num_buckets = cpu_to_le16(1);
@@ -3308,9 +3330,13 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
 		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
 
 	/*
-	 * Now only update the 1st block of the old bucket.
-	 * Please note that the entry has been sorted already above.
+	 * Now only update the 1st block of the old bucket.  If we
+	 * just added a new empty bucket, there is no need to modify
+	 * it.
 	 */
+	if (start == count)
+		goto out;
+
 	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
 	memset(&xh->xh_entries[start], 0,
 	       sizeof(struct ocfs2_xattr_entry) * (count - start));
@@ -3358,7 +3384,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	BUG_ON(s_blkno == t_blkno);
 
 	mlog(0, "cp bucket %llu to %llu, target is %d\n",
-	     s_blkno, t_blkno, t_is_new);
+	     (unsigned long long)s_blkno, (unsigned long long)t_blkno,
+	     t_is_new);
 
 	s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
 			GFP_NOFS);
@@ -3382,6 +3409,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 
 	for (i = 0; i < blk_per_bucket; i++) {
 		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+					   t_is_new ?
+					   OCFS2_JOURNAL_ACCESS_CREATE :
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret)
 			goto out;
@@ -3428,7 +3457,8 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 	struct ocfs2_xattr_header *xh;
 	u64 to_blk_start = to_blk;
 
-	mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk);
+	mlog(0, "cp xattrs from cluster %llu to %llu\n",
+	     (unsigned long long)src_blk, (unsigned long long)to_blk);
 
 	/*
 	 * We need to update the new cluster and 1 more for the update of
@@ -3493,15 +3523,15 @@ out:
 }
 
 /*
- * Move half of the xattrs in this cluster to the new cluster.
+ * Move some xattrs in this cluster to the new cluster.
  * This function should only be called when bucket size == cluster size.
  * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
  */
-static int ocfs2_half_xattr_cluster(struct inode *inode,
-				    handle_t *handle,
-				    u64 prev_blk,
-				    u64 new_blk,
-				    u32 *first_hash)
+static int ocfs2_divide_xattr_cluster(struct inode *inode,
+				      handle_t *handle,
+				      u64 prev_blk,
+				      u64 new_blk,
+				      u32 *first_hash)
 {
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	int ret, credits = 2 * blk_per_bucket;
@@ -3515,8 +3545,8 @@ static int ocfs2_half_xattr_cluster(struct inode *inode,
 	}
 
 	/* Move half of the xattr in start_blk to the next bucket. */
-	return  ocfs2_half_xattr_bucket(inode, handle, prev_blk,
-					new_blk, first_hash, 1);
+	return  ocfs2_divide_xattr_bucket(inode, handle, prev_blk,
+					  new_blk, first_hash, 1);
 }
 
 /*
@@ -3559,7 +3589,8 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 
 	mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
-	     prev_blk, prev_clusters, new_blk);
+	     (unsigned long long)prev_blk, prev_clusters,
+	     (unsigned long long)new_blk);
 
 	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
 		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
@@ -3578,9 +3609,9 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 						     last_blk, new_blk,
 						     v_start);
 		else {
-			ret = ocfs2_half_xattr_cluster(inode, handle,
-						       last_blk, new_blk,
-						       v_start);
+			ret = ocfs2_divide_xattr_cluster(inode, handle,
+							 last_blk, new_blk,
+							 v_start);
 
 			if ((*header_bh)->b_blocknr == last_blk && extend)
 				*extend = 0;
@@ -3629,7 +3660,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
 	     "previous xattr blkno = %llu\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     prev_cpos, prev_blkno);
+	     prev_cpos, (unsigned long long)prev_blkno);
 
 	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
 
@@ -3716,7 +3747,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		}
 	}
 	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
-	     num_bits, block, v_start);
+	     num_bits, (unsigned long long)block, v_start);
 	ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
 				  num_bits, 0, meta_ac);
 	if (ret < 0) {
@@ -3761,7 +3792,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
 
 	mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
-	     "from %llu, len = %u\n", start_blk,
+	     "from %llu, len = %u\n", (unsigned long long)start_blk,
 	     (unsigned long long)first_bh->b_blocknr, num_clusters);
 
 	BUG_ON(bucket >= num_buckets);
@@ -3797,8 +3828,8 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	}
 
 	/* Move half of the xattr in start_blk to the next bucket. */
-	ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
-				      start_blk + blk_per_bucket, NULL, 0);
+	ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk,
+					start_blk + blk_per_bucket, NULL, 0);
 
 	le16_add_cpu(&first_xh->xh_num_buckets, 1);
 	ocfs2_journal_dirty(handle, first_bh);
@@ -4146,7 +4177,7 @@ static int ocfs2_xattr_value_update_size(struct inode *inode,
 	handle_t *handle = NULL;
 
 	handle = ocfs2_start_trans(osb, 1);
-	if (handle == NULL) {
+	if (IS_ERR(handle)) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
@@ -4313,7 +4344,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	}
 
 	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-	if (handle == NULL) {
+	if (IS_ERR(handle)) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
@@ -4489,11 +4520,21 @@ out:
 	return ret;
 }
 
-/* check whether the xattr bucket is filled up with the same hash value. */
+/*
+ * check whether the xattr bucket is filled up with the same hash value.
+ * If we want to insert the xattr with the same hash, return -ENOSPC.
+ * If we want to insert a xattr with different hash value, go ahead
+ * and ocfs2_divide_xattr_bucket will handle this.
+ */
 static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
-					      struct ocfs2_xattr_bucket *bucket)
+					      struct ocfs2_xattr_bucket *bucket,
+					      const char *name)
 {
 	struct ocfs2_xattr_header *xh = bucket->xh;
+	u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+
+	if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
+		return 0;
 
 	if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
 	    xh->xh_entries[0].xe_name_hash) {
@@ -4616,7 +4657,9 @@ try_again:
 		 * one bucket's worth, so check it here whether we need to
 		 * add a new bucket for the insert.
 		 */
-		ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket);
+		ret = ocfs2_check_xattr_bucket_collision(inode,
+							 &xs->bucket,
+							 xi->name);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4727,14 +4770,11 @@ out:
 /*
  * 'trusted' attributes support
  */
-
-#define XATTR_TRUSTED_PREFIX "trusted."
-
 static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
 				       size_t list_size, const char *name,
 				       size_t name_len)
 {
-	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
+	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
 	if (list && total_len <= list_size) {
@@ -4771,18 +4811,14 @@ struct xattr_handler ocfs2_xattr_trusted_handler = {
 	.set	= ocfs2_xattr_trusted_set,
 };
 
-
 /*
  * 'user' attributes support
  */
-
-#define XATTR_USER_PREFIX "user."
-
 static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
 				    size_t list_size, const char *name,
 				    size_t name_len)
 {
-	const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
+	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index c25c7c62a05..1d8314c7656 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -3,24 +3,16 @@
  *
  * xattr.h
  *
- * Function prototypes
- *
- * Copyright (C) 2008 Oracle.  All rights reserved.
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * License version 2 as published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_XATTR_H
@@ -40,29 +32,11 @@ enum ocfs2_xattr_type {
 
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
-
-extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
-extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
-extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
-			   size_t, int);
-extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 
-static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
-{
-	return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
-}
-
-static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
-{
-	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
-}
-
-static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
-{
-	u16 len = sb->s_blocksize -
-		 offsetof(struct ocfs2_xattr_header, xh_entries);
+ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
+		    size_t, int);
+int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
 
-	return len / sizeof(struct ocfs2_xattr_entry);
-}
 #endif /* OCFS2_XATTR_H */
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 633f7a0ebb2..6d5b213b8a9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -348,8 +348,8 @@ static ssize_t whole_disk_show(struct device *dev,
 static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
 		   whole_disk_show, NULL);
 
-int add_partition(struct gendisk *disk, int partno,
-		  sector_t start, sector_t len, int flags)
+struct hd_struct *add_partition(struct gendisk *disk, int partno,
+				sector_t start, sector_t len, int flags)
 {
 	struct hd_struct *p;
 	dev_t devt = MKDEV(0, 0);
@@ -361,15 +361,15 @@ int add_partition(struct gendisk *disk, int partno,
 
 	err = disk_expand_part_tbl(disk, partno);
 	if (err)
-		return err;
+		return ERR_PTR(err);
 	ptbl = disk->part_tbl;
 
 	if (ptbl->part[partno])
-		return -EBUSY;
+		return ERR_PTR(-EBUSY);
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
-		return -ENOMEM;
+		return ERR_PTR(-EBUSY);
 
 	if (!init_part_stats(p)) {
 		err = -ENOMEM;
@@ -395,7 +395,7 @@ int add_partition(struct gendisk *disk, int partno,
 
 	err = blk_alloc_devt(p, &devt);
 	if (err)
-		goto out_free;
+		goto out_free_stats;
 	pdev->devt = devt;
 
 	/* delay uevent until 'holders' subdir is created */
@@ -424,18 +424,20 @@ int add_partition(struct gendisk *disk, int partno,
 	if (!ddev->uevent_suppress)
 		kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
-	return 0;
+	return p;
 
+out_free_stats:
+	free_part_stats(p);
 out_free:
 	kfree(p);
-	return err;
+	return ERR_PTR(err);
 out_del:
 	kobject_put(p->holder_dir);
 	device_del(pdev);
 out_put:
 	put_device(pdev);
 	blk_free_devt(devt);
-	return err;
+	return ERR_PTR(err);
 }
 
 /* Not exported, helper to add_disk(). */
@@ -566,15 +568,16 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 			       disk->disk_name, p, (unsigned long long) size);
 			size = get_capacity(disk) - from;
 		}
-		res = add_partition(disk, p, from, size, state->parts[p].flags);
-		if (res) {
-			printk(KERN_ERR " %s: p%d could not be added: %d\n",
-				disk->disk_name, p, -res);
+		part = add_partition(disk, p, from, size,
+				     state->parts[p].flags);
+		if (IS_ERR(part)) {
+			printk(KERN_ERR " %s: p%d could not be added: %ld\n",
+			       disk->disk_name, p, -PTR_ERR(part));
 			continue;
 		}
 #ifdef CONFIG_BLK_DEV_MD
 		if (state->parts[p].flags & ADDPART_FLAG_RAID)
-			md_autodetect_dev(bdev->bd_dev+p);
+			md_autodetect_dev(part_to_dev(part)->devt);
 #endif
 	}
 	kfree(state);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 486cf3fe713..d4677603c88 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -371,7 +371,7 @@ static int lstats_show_proc(struct seq_file *m, void *v)
 				task->latency_record[i].time,
 				task->latency_record[i].max);
 			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
-				char sym[KSYM_NAME_LEN];
+				char sym[KSYM_SYMBOL_LEN];
 				char *c;
 				if (!task->latency_record[i].backtrace[q])
 					break;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 94fcfff6863..06ed10b7da9 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,6 +31,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
 	inode->i_mode = table->mode;
+	inode->i_uid = inode->i_gid = 0;
 	if (!table->child) {
 		inode->i_mode |= S_IFREG;
 		inode->i_op = &proc_sys_inode_operations;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b770c095e45..3a8bdd7f575 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -557,9 +557,9 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte)
 	return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
 }
 
-static unsigned long pte_to_pagemap_entry(pte_t pte)
+static u64 pte_to_pagemap_entry(pte_t pte)
 {
-	unsigned long pme = 0;
+	u64 pme = 0;
 	if (is_swap_pte(pte))
 		pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte))
 			| PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 0a6aa2cc78f..b49884c8c10 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -234,8 +234,8 @@ int ubifs_bg_thread(void *info)
 	int err;
 	struct ubifs_info *c = info;
 
-	ubifs_msg("background thread \"%s\" started, PID %d",
-		  c->bgt_name, current->pid);
+	dbg_msg("background thread \"%s\" started, PID %d",
+		c->bgt_name, current->pid);
 	set_freezable();
 
 	while (1) {
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 7186400750e..510ffa0bbda 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -101,21 +101,24 @@ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
 	if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
 		switch (type) {
 		case UBIFS_INO_KEY:
-			sprintf(p, "(%lu, %s)", key_inum(c, key),
+			sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key),
 			       get_key_type(type));
 			break;
 		case UBIFS_DENT_KEY:
 		case UBIFS_XENT_KEY:
-			sprintf(p, "(%lu, %s, %#08x)", key_inum(c, key),
+			sprintf(p, "(%lu, %s, %#08x)",
+				(unsigned long)key_inum(c, key),
 				get_key_type(type), key_hash(c, key));
 			break;
 		case UBIFS_DATA_KEY:
-			sprintf(p, "(%lu, %s, %u)", key_inum(c, key),
+			sprintf(p, "(%lu, %s, %u)",
+				(unsigned long)key_inum(c, key),
 				get_key_type(type), key_block(c, key));
 			break;
 		case UBIFS_TRUN_KEY:
 			sprintf(p, "(%lu, %s)",
-				key_inum(c, key), get_key_type(type));
+				(unsigned long)key_inum(c, key),
+				get_key_type(type));
 			break;
 		default:
 			sprintf(p, "(bad key type: %#08x, %#08x)",
@@ -364,8 +367,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 		       le32_to_cpu(mst->ihead_lnum));
 		printk(KERN_DEBUG "\tihead_offs     %u\n",
 		       le32_to_cpu(mst->ihead_offs));
-		printk(KERN_DEBUG "\tindex_size     %u\n",
-		       le32_to_cpu(mst->index_size));
+		printk(KERN_DEBUG "\tindex_size     %llu\n",
+		       (unsigned long long)le64_to_cpu(mst->index_size));
 		printk(KERN_DEBUG "\tlpt_lnum       %u\n",
 		       le32_to_cpu(mst->lpt_lnum));
 		printk(KERN_DEBUG "\tlpt_offs       %u\n",
@@ -1589,7 +1592,7 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
 
 	if (inum > c->highest_inum) {
 		ubifs_err("too high inode number, max. is %lu",
-			  c->highest_inum);
+			  (unsigned long)c->highest_inum);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -1668,16 +1671,18 @@ static struct fsck_inode *read_add_inode(struct ubifs_info *c,
 	ino_key_init(c, &key, inum);
 	err = ubifs_lookup_level0(c, &key, &znode, &n);
 	if (!err) {
-		ubifs_err("inode %lu not found in index", inum);
+		ubifs_err("inode %lu not found in index", (unsigned long)inum);
 		return ERR_PTR(-ENOENT);
 	} else if (err < 0) {
-		ubifs_err("error %d while looking up inode %lu", err, inum);
+		ubifs_err("error %d while looking up inode %lu",
+			  err, (unsigned long)inum);
 		return ERR_PTR(err);
 	}
 
 	zbr = &znode->zbranch[n];
 	if (zbr->len < UBIFS_INO_NODE_SZ) {
-		ubifs_err("bad node %lu node length %d", inum, zbr->len);
+		ubifs_err("bad node %lu node length %d",
+			  (unsigned long)inum, zbr->len);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -1697,7 +1702,7 @@ static struct fsck_inode *read_add_inode(struct ubifs_info *c,
 	kfree(ino);
 	if (IS_ERR(fscki)) {
 		ubifs_err("error %ld while adding inode %lu node",
-			  PTR_ERR(fscki), inum);
+			  PTR_ERR(fscki), (unsigned long)inum);
 		return fscki;
 	}
 
@@ -1786,7 +1791,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 		if (IS_ERR(fscki)) {
 			err = PTR_ERR(fscki);
 			ubifs_err("error %d while processing data node and "
-				  "trying to find inode node %lu", err, inum);
+				  "trying to find inode node %lu",
+				  err, (unsigned long)inum);
 			goto out_dump;
 		}
 
@@ -1819,7 +1825,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 		if (IS_ERR(fscki)) {
 			err = PTR_ERR(fscki);
 			ubifs_err("error %d while processing entry node and "
-				  "trying to find inode node %lu", err, inum);
+				  "trying to find inode node %lu",
+				  err, (unsigned long)inum);
 			goto out_dump;
 		}
 
@@ -1832,7 +1839,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 			err = PTR_ERR(fscki);
 			ubifs_err("error %d while processing entry node and "
 				  "trying to find parent inode node %lu",
-				  err, inum);
+				  err, (unsigned long)inum);
 			goto out_dump;
 		}
 
@@ -1923,7 +1930,8 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
 			    fscki->references != 1) {
 				ubifs_err("directory inode %lu has %d "
 					  "direntries which refer it, but "
-					  "should be 1", fscki->inum,
+					  "should be 1",
+					  (unsigned long)fscki->inum,
 					  fscki->references);
 				goto out_dump;
 			}
@@ -1931,27 +1939,29 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
 			    fscki->references != 0) {
 				ubifs_err("root inode %lu has non-zero (%d) "
 					  "direntries which refer it",
-					  fscki->inum, fscki->references);
+					  (unsigned long)fscki->inum,
+					  fscki->references);
 				goto out_dump;
 			}
 			if (fscki->calc_sz != fscki->size) {
 				ubifs_err("directory inode %lu size is %lld, "
 					  "but calculated size is %lld",
-					  fscki->inum, fscki->size,
-					  fscki->calc_sz);
+					  (unsigned long)fscki->inum,
+					  fscki->size, fscki->calc_sz);
 				goto out_dump;
 			}
 			if (fscki->calc_cnt != fscki->nlink) {
 				ubifs_err("directory inode %lu nlink is %d, "
 					  "but calculated nlink is %d",
-					  fscki->inum, fscki->nlink,
-					  fscki->calc_cnt);
+					  (unsigned long)fscki->inum,
+					  fscki->nlink, fscki->calc_cnt);
 				goto out_dump;
 			}
 		} else {
 			if (fscki->references != fscki->nlink) {
 				ubifs_err("inode %lu nlink is %d, but "
-					  "calculated nlink is %d", fscki->inum,
+					  "calculated nlink is %d",
+					  (unsigned long)fscki->inum,
 					  fscki->nlink, fscki->references);
 				goto out_dump;
 			}
@@ -1959,20 +1969,21 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
 		if (fscki->xattr_sz != fscki->calc_xsz) {
 			ubifs_err("inode %lu has xattr size %u, but "
 				  "calculated size is %lld",
-				  fscki->inum, fscki->xattr_sz,
+				  (unsigned long)fscki->inum, fscki->xattr_sz,
 				  fscki->calc_xsz);
 			goto out_dump;
 		}
 		if (fscki->xattr_cnt != fscki->calc_xcnt) {
 			ubifs_err("inode %lu has %u xattrs, but "
-				  "calculated count is %lld", fscki->inum,
+				  "calculated count is %lld",
+				  (unsigned long)fscki->inum,
 				  fscki->xattr_cnt, fscki->calc_xcnt);
 			goto out_dump;
 		}
 		if (fscki->xattr_nms != fscki->calc_xnms) {
 			ubifs_err("inode %lu has xattr names' size %u, but "
 				  "calculated names' size is %lld",
-				  fscki->inum, fscki->xattr_nms,
+				  (unsigned long)fscki->inum, fscki->xattr_nms,
 				  fscki->calc_xnms);
 			goto out_dump;
 		}
@@ -1985,11 +1996,12 @@ out_dump:
 	ino_key_init(c, &key, fscki->inum);
 	err = ubifs_lookup_level0(c, &key, &znode, &n);
 	if (!err) {
-		ubifs_err("inode %lu not found in index", fscki->inum);
+		ubifs_err("inode %lu not found in index",
+			  (unsigned long)fscki->inum);
 		return -ENOENT;
 	} else if (err < 0) {
 		ubifs_err("error %d while looking up inode %lu",
-			  err, fscki->inum);
+			  err, (unsigned long)fscki->inum);
 		return err;
 	}
 
@@ -2007,7 +2019,7 @@ out_dump:
 	}
 
 	ubifs_msg("dump of the inode %lu sitting in LEB %d:%d",
-		  fscki->inum, zbr->lnum, zbr->offs);
+		  (unsigned long)fscki->inum, zbr->lnum, zbr->offs);
 	dbg_dump_node(c, ino);
 	kfree(ino);
 	return -EINVAL;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 526c01ec800..0422c98e179 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -161,7 +161,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
 			return ERR_PTR(-EINVAL);
 		}
 		ubifs_warn("running out of inode numbers (current %lu, max %d)",
-			   c->highest_inum, INUM_WATERMARK);
+			   (unsigned long)c->highest_inum, INUM_WATERMARK);
 	}
 
 	inode->i_ino = ++c->highest_inum;
@@ -428,7 +428,8 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
 		dbg_gen("feed '%s', ino %llu, new f_pos %#x",
 			dent->name, (unsigned long long)le64_to_cpu(dent->inum),
 			key_hash_flash(c, &dent->key));
-		ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
+		ubifs_assert(le64_to_cpu(dent->ch.sqnum) >
+			     ubifs_inode(dir)->creat_sqnum);
 
 		nm.len = le16_to_cpu(dent->nlen);
 		over = filldir(dirent, dent->name, nm.len, file->f_pos,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 51cf511d44d..2624411d975 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -72,7 +72,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
 		return err;
 	}
 
-	ubifs_assert(dn->ch.sqnum > ubifs_inode(inode)->creat_sqnum);
+	ubifs_assert(le64_to_cpu(dn->ch.sqnum) > ubifs_inode(inode)->creat_sqnum);
 
 	len = le32_to_cpu(dn->size);
 	if (len <= 0 || len > UBIFS_BLOCK_SIZE)
@@ -626,7 +626,7 @@ static int populate_page(struct ubifs_info *c, struct page *page,
 
 			dn = bu->buf + (bu->zbranch[nn].offs - offs);
 
-			ubifs_assert(dn->ch.sqnum >
+			ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
 				     ubifs_inode(inode)->creat_sqnum);
 
 			len = le32_to_cpu(dn->size);
@@ -691,32 +691,22 @@ out_err:
 /**
  * ubifs_do_bulk_read - do bulk-read.
  * @c: UBIFS file-system description object
- * @page1: first page
+ * @bu: bulk-read information
+ * @page1: first page to read
  *
  * This function returns %1 if the bulk-read is done, otherwise %0 is returned.
  */
-static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1)
+static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
+			      struct page *page1)
 {
 	pgoff_t offset = page1->index, end_index;
 	struct address_space *mapping = page1->mapping;
 	struct inode *inode = mapping->host;
 	struct ubifs_inode *ui = ubifs_inode(inode);
-	struct bu_info *bu;
 	int err, page_idx, page_cnt, ret = 0, n = 0;
+	int allocate = bu->buf ? 0 : 1;
 	loff_t isize;
 
-	bu = kmalloc(sizeof(struct bu_info), GFP_NOFS);
-	if (!bu)
-		return 0;
-
-	bu->buf_len = c->bulk_read_buf_size;
-	bu->buf = kmalloc(bu->buf_len, GFP_NOFS);
-	if (!bu->buf)
-		goto out_free;
-
-	data_key_init(c, &bu->key, inode->i_ino,
-		      offset << UBIFS_BLOCKS_PER_PAGE_SHIFT);
-
 	err = ubifs_tnc_get_bu_keys(c, bu);
 	if (err)
 		goto out_warn;
@@ -735,12 +725,25 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1)
 		 * together. If all the pages were like this, bulk-read would
 		 * reduce performance, so we turn it off for a while.
 		 */
-		ui->read_in_a_row = 0;
-		ui->bulk_read = 0;
-		goto out_free;
+		goto out_bu_off;
 	}
 
 	if (bu->cnt) {
+		if (allocate) {
+			/*
+			 * Allocate bulk-read buffer depending on how many data
+			 * nodes we are going to read.
+			 */
+			bu->buf_len = bu->zbranch[bu->cnt - 1].offs +
+				      bu->zbranch[bu->cnt - 1].len -
+				      bu->zbranch[0].offs;
+			ubifs_assert(bu->buf_len > 0);
+			ubifs_assert(bu->buf_len <= c->leb_size);
+			bu->buf = kmalloc(bu->buf_len, GFP_NOFS | __GFP_NOWARN);
+			if (!bu->buf)
+				goto out_bu_off;
+		}
+
 		err = ubifs_tnc_bulk_read(c, bu);
 		if (err)
 			goto out_warn;
@@ -779,13 +782,17 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1)
 	ui->last_page_read = offset + page_idx - 1;
 
 out_free:
-	kfree(bu->buf);
-	kfree(bu);
+	if (allocate)
+		kfree(bu->buf);
 	return ret;
 
 out_warn:
 	ubifs_warn("ignoring error %d and skipping bulk-read", err);
 	goto out_free;
+
+out_bu_off:
+	ui->read_in_a_row = ui->bulk_read = 0;
+	goto out_free;
 }
 
 /**
@@ -803,18 +810,20 @@ static int ubifs_bulk_read(struct page *page)
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	pgoff_t index = page->index, last_page_read = ui->last_page_read;
-	int ret = 0;
+	struct bu_info *bu;
+	int err = 0, allocated = 0;
 
 	ui->last_page_read = index;
-
 	if (!c->bulk_read)
 		return 0;
+
 	/*
-	 * Bulk-read is protected by ui_mutex, but it is an optimization, so
-	 * don't bother if we cannot lock the mutex.
+	 * Bulk-read is protected by @ui->ui_mutex, but it is an optimization,
+	 * so don't bother if we cannot lock the mutex.
 	 */
 	if (!mutex_trylock(&ui->ui_mutex))
 		return 0;
+
 	if (index != last_page_read + 1) {
 		/* Turn off bulk-read if we stop reading sequentially */
 		ui->read_in_a_row = 1;
@@ -822,6 +831,7 @@ static int ubifs_bulk_read(struct page *page)
 			ui->bulk_read = 0;
 		goto out_unlock;
 	}
+
 	if (!ui->bulk_read) {
 		ui->read_in_a_row += 1;
 		if (ui->read_in_a_row < 3)
@@ -829,10 +839,35 @@ static int ubifs_bulk_read(struct page *page)
 		/* Three reads in a row, so switch on bulk-read */
 		ui->bulk_read = 1;
 	}
-	ret = ubifs_do_bulk_read(c, page);
+
+	/*
+	 * If possible, try to use pre-allocated bulk-read information, which
+	 * is protected by @c->bu_mutex.
+	 */
+	if (mutex_trylock(&c->bu_mutex))
+		bu = &c->bu;
+	else {
+		bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN);
+		if (!bu)
+			goto out_unlock;
+
+		bu->buf = NULL;
+		allocated = 1;
+	}
+
+	bu->buf_len = c->max_bu_buf_len;
+	data_key_init(c, &bu->key, inode->i_ino,
+		      page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);
+	err = ubifs_do_bulk_read(c, bu, page);
+
+	if (!allocated)
+		mutex_unlock(&c->bu_mutex);
+	else
+		kfree(bu);
+
 out_unlock:
 	mutex_unlock(&ui->ui_mutex);
-	return ret;
+	return err;
 }
 
 static int ubifs_readpage(struct file *file, struct page *page)
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 22993f867d1..f91b745908e 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -690,8 +690,9 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
 	int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
-	dbg_jnl("ino %lu, blk %u, len %d, key %s", key_inum(c, key),
-		key_block(c, key), len, DBGKEY(key));
+	dbg_jnl("ino %lu, blk %u, len %d, key %s",
+		(unsigned long)key_inum(c, key), key_block(c, key), len,
+		DBGKEY(key));
 	ubifs_assert(len <= UBIFS_BLOCK_SIZE);
 
 	data = kmalloc(dlen, GFP_NOFS);
@@ -1128,7 +1129,8 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
 	ino_t inum = inode->i_ino;
 	unsigned int blk;
 
-	dbg_jnl("ino %lu, size %lld -> %lld", inum, old_size, new_size);
+	dbg_jnl("ino %lu, size %lld -> %lld",
+		(unsigned long)inum, old_size, new_size);
 	ubifs_assert(!ui->data_len);
 	ubifs_assert(S_ISREG(inode->i_mode));
 	ubifs_assert(mutex_is_locked(&ui->ui_mutex));
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 9ee65086f62..3f1f16bc25c 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -345,7 +345,7 @@ static inline int key_type_flash(const struct ubifs_info *c, const void *k)
 {
 	const union ubifs_key *key = k;
 
-	return le32_to_cpu(key->u32[1]) >> UBIFS_S_KEY_BLOCK_BITS;
+	return le32_to_cpu(key->j32[1]) >> UBIFS_S_KEY_BLOCK_BITS;
 }
 
 /**
@@ -416,7 +416,7 @@ static inline unsigned int key_block_flash(const struct ubifs_info *c,
 {
 	const union ubifs_key *key = k;
 
-	return le32_to_cpu(key->u32[1]) & UBIFS_S_KEY_BLOCK_MASK;
+	return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_BLOCK_MASK;
 }
 
 /**
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index eed5a0025d6..a41434b4278 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -571,8 +571,6 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
 		/* We assume here that LEB zero is never an LPT LEB */
 		if (nnode->nbranch[iip].lnum)
 			return ubifs_get_pnode(c, nnode, iip);
-		else
-			return NULL;
 	}
 
 	/* Go up while can't go right */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 02d3462f4d3..9bd5a43d452 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -105,7 +105,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
 	list_add_tail(&orphan->list, &c->orph_list);
 	list_add_tail(&orphan->new_list, &c->orph_new);
 	spin_unlock(&c->orphan_lock);
-	dbg_gen("ino %lu", inum);
+	dbg_gen("ino %lu", (unsigned long)inum);
 	return 0;
 }
 
@@ -132,14 +132,16 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
 		else {
 			if (o->dnext) {
 				spin_unlock(&c->orphan_lock);
-				dbg_gen("deleted twice ino %lu", inum);
+				dbg_gen("deleted twice ino %lu",
+					(unsigned long)inum);
 				return;
 			}
 			if (o->cnext) {
 				o->dnext = c->orph_dnext;
 				c->orph_dnext = o;
 				spin_unlock(&c->orphan_lock);
-				dbg_gen("delete later ino %lu", inum);
+				dbg_gen("delete later ino %lu",
+					(unsigned long)inum);
 				return;
 			}
 			rb_erase(p, &c->orph_tree);
@@ -151,12 +153,12 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
 			}
 			spin_unlock(&c->orphan_lock);
 			kfree(o);
-			dbg_gen("inum %lu", inum);
+			dbg_gen("inum %lu", (unsigned long)inum);
 			return;
 		}
 	}
 	spin_unlock(&c->orphan_lock);
-	dbg_err("missing orphan ino %lu", inum);
+	dbg_err("missing orphan ino %lu", (unsigned long)inum);
 	dbg_dump_stack();
 }
 
@@ -448,7 +450,7 @@ static void erase_deleted(struct ubifs_info *c)
 		rb_erase(&orphan->rb, &c->orph_tree);
 		list_del(&orphan->list);
 		c->tot_orphans -= 1;
-		dbg_gen("deleting orphan ino %lu", orphan->inum);
+		dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum);
 		kfree(orphan);
 	}
 	c->orph_dnext = NULL;
@@ -536,8 +538,8 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
 	list_add_tail(&orphan->list, &c->orph_list);
 	orphan->dnext = c->orph_dnext;
 	c->orph_dnext = orphan;
-	dbg_mnt("ino %lu, new %d, tot %d",
-		inum, c->new_orphans, c->tot_orphans);
+	dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum,
+		c->new_orphans, c->tot_orphans);
 	return 0;
 }
 
@@ -609,7 +611,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 		n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
 		for (i = 0; i < n; i++) {
 			inum = le64_to_cpu(orph->inos[i]);
-			dbg_rcvry("deleting orphaned inode %lu", inum);
+			dbg_rcvry("deleting orphaned inode %lu",
+				  (unsigned long)inum);
 			err = ubifs_tnc_remove_ino(c, inum);
 			if (err)
 				return err;
@@ -840,8 +843,8 @@ static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 	if (inum != ci->last_ino) {
 		/* Lowest node type is the inode node, so it comes first */
 		if (key_type(c, &zbr->key) != UBIFS_INO_KEY)
-			ubifs_err("found orphan node ino %lu, type %d", inum,
-				  key_type(c, &zbr->key));
+			ubifs_err("found orphan node ino %lu, type %d",
+				  (unsigned long)inum, key_type(c, &zbr->key));
 		ci->last_ino = inum;
 		ci->tot_inos += 1;
 		err = ubifs_tnc_read_node(c, zbr, ci->node);
@@ -853,7 +856,8 @@ static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 			/* Must be recorded as an orphan */
 			if (!dbg_find_check_orphan(&ci->root, inum) &&
 			    !dbg_find_orphan(c, inum)) {
-				ubifs_err("missing orphan, ino %lu", inum);
+				ubifs_err("missing orphan, ino %lu",
+					  (unsigned long)inum);
 				ci->missing += 1;
 			}
 	}
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 77d26c141cf..90acac603e6 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -168,12 +168,12 @@ static int write_rcvrd_mst_node(struct ubifs_info *c,
 				struct ubifs_mst_node *mst)
 {
 	int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz;
-	uint32_t save_flags;
+	__le32 save_flags;
 
 	dbg_rcvry("recovery");
 
 	save_flags = mst->flags;
-	mst->flags = cpu_to_le32(le32_to_cpu(mst->flags) | UBIFS_MST_RCVRY);
+	mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY);
 
 	ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1);
 	err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM);
@@ -1435,13 +1435,13 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
 	err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
 	if (err)
 		goto out;
-	dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", e->inum, lnum, offs,
-		  i_size, e->d_size);
+	dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ",
+		  (unsigned long)e->inum, lnum, offs, i_size, e->d_size);
 	return 0;
 
 out:
 	ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d",
-		   e->inum, e->i_size, e->d_size, err);
+		   (unsigned long)e->inum, e->i_size, e->d_size, err);
 	return err;
 }
 
@@ -1472,7 +1472,8 @@ int ubifs_recover_size(struct ubifs_info *c)
 				return err;
 			if (err == -ENOENT) {
 				/* Remove data nodes that have no inode */
-				dbg_rcvry("removing ino %lu", e->inum);
+				dbg_rcvry("removing ino %lu",
+					  (unsigned long)e->inum);
 				err = ubifs_tnc_remove_ino(c, e->inum);
 				if (err)
 					return err;
@@ -1493,8 +1494,8 @@ int ubifs_recover_size(struct ubifs_info *c)
 					return PTR_ERR(inode);
 				if (inode->i_size < e->d_size) {
 					dbg_rcvry("ino %lu size %lld -> %lld",
-						  e->inum, e->d_size,
-						  inode->i_size);
+						  (unsigned long)e->inum,
+						  e->d_size, inode->i_size);
 					inode->i_size = e->d_size;
 					ubifs_inode(inode)->ui_size = e->d_size;
 					e->inode = inode;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 7399692af85..21f7d047c30 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1065,7 +1065,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
 	ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
 	dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
 		"highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
-		c->highest_inum);
+		(unsigned long)c->highest_inum);
 out:
 	destroy_replay_tree(c);
 	destroy_bud_list(c);
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 2bf753b3888..0f392351dc5 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -81,6 +81,7 @@ static int create_default_filesystem(struct ubifs_info *c)
 	int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
 	int min_leb_cnt = UBIFS_MIN_LEB_CNT;
 	uint64_t tmp64, main_bytes;
+	__le64 tmp_le64;
 
 	/* Some functions called from here depend on the @c->key_len filed */
 	c->key_len = UBIFS_SK_LEN;
@@ -295,10 +296,10 @@ static int create_default_filesystem(struct ubifs_info *c)
 	ino->ch.node_type = UBIFS_INO_NODE;
 	ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);
 	ino->nlink = cpu_to_le32(2);
-	tmp = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);
-	ino->atime_sec   = tmp;
-	ino->ctime_sec   = tmp;
-	ino->mtime_sec   = tmp;
+	tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);
+	ino->atime_sec   = tmp_le64;
+	ino->ctime_sec   = tmp_le64;
+	ino->mtime_sec   = tmp_le64;
 	ino->atime_nsec  = 0;
 	ino->ctime_nsec  = 0;
 	ino->mtime_nsec  = 0;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 8780efbf40a..d80b2aef42b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,6 +36,12 @@
 #include <linux/mount.h>
 #include "ubifs.h"
 
+/*
+ * Maximum amount of memory we may 'kmalloc()' without worrying that we are
+ * allocating too much.
+ */
+#define UBIFS_KMALLOC_OK (128*1024)
+
 /* Slab cache for UBIFS inodes */
 struct kmem_cache *ubifs_inode_slab;
 
@@ -561,18 +567,11 @@ static int init_constants_early(struct ubifs_info *c)
 	 * calculations when reporting free space.
 	 */
 	c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
-	/* Buffer size for bulk-reads */
-	c->bulk_read_buf_size = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ;
-	if (c->bulk_read_buf_size > c->leb_size)
-		c->bulk_read_buf_size = c->leb_size;
-	if (c->bulk_read_buf_size > 128 * 1024) {
-		/* Check if we can kmalloc more than 128KiB */
-		void *try = kmalloc(c->bulk_read_buf_size, GFP_KERNEL);
 
-		kfree(try);
-		if (!try)
-			c->bulk_read_buf_size = 128 * 1024;
-	}
+	/* Buffer size for bulk-reads */
+	c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ;
+	if (c->max_bu_buf_len > c->leb_size)
+		c->max_bu_buf_len = c->leb_size;
 	return 0;
 }
 
@@ -992,6 +991,34 @@ static void destroy_journal(struct ubifs_info *c)
 }
 
 /**
+ * bu_init - initialize bulk-read information.
+ * @c: UBIFS file-system description object
+ */
+static void bu_init(struct ubifs_info *c)
+{
+	ubifs_assert(c->bulk_read == 1);
+
+	if (c->bu.buf)
+		return; /* Already initialized */
+
+again:
+	c->bu.buf = kmalloc(c->max_bu_buf_len, GFP_KERNEL | __GFP_NOWARN);
+	if (!c->bu.buf) {
+		if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) {
+			c->max_bu_buf_len = UBIFS_KMALLOC_OK;
+			goto again;
+		}
+
+		/* Just disable bulk-read */
+		ubifs_warn("Cannot allocate %d bytes of memory for bulk-read, "
+			   "disabling it", c->max_bu_buf_len);
+		c->mount_opts.bulk_read = 1;
+		c->bulk_read = 0;
+		return;
+	}
+}
+
+/**
  * mount_ubifs - mount UBIFS file-system.
  * @c: UBIFS file-system description object
  *
@@ -1059,6 +1086,13 @@ static int mount_ubifs(struct ubifs_info *c)
 			goto out_free;
 	}
 
+	if (c->bulk_read == 1)
+		bu_init(c);
+
+	/*
+	 * We have to check all CRCs, even for data nodes, when we mount the FS
+	 * (specifically, when we are replaying).
+	 */
 	c->always_chk_crc = 1;
 
 	err = ubifs_read_superblock(c);
@@ -1289,6 +1323,7 @@ out_cbuf:
 out_dereg:
 	dbg_failure_mode_deregistration(c);
 out_free:
+	kfree(c->bu.buf);
 	vfree(c->ileb_buf);
 	vfree(c->sbuf);
 	kfree(c->bottom_up_buf);
@@ -1325,10 +1360,11 @@ static void ubifs_umount(struct ubifs_info *c)
 	kfree(c->cbuf);
 	kfree(c->rcvrd_mst_node);
 	kfree(c->mst_node);
+	kfree(c->bu.buf);
+	vfree(c->ileb_buf);
 	vfree(c->sbuf);
 	kfree(c->bottom_up_buf);
 	UBIFS_DBG(vfree(c->dbg_buf));
-	vfree(c->ileb_buf);
 	dbg_failure_mode_deregistration(c);
 }
 
@@ -1626,6 +1662,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		ubifs_err("invalid or unknown remount parameter");
 		return err;
 	}
+
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
 		err = ubifs_remount_rw(c);
 		if (err)
@@ -1633,6 +1670,14 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
 		ubifs_remount_ro(c);
 
+	if (c->bulk_read == 1)
+		bu_init(c);
+	else {
+		dbg_gen("disable bulk-read");
+		kfree(c->bu.buf);
+		c->bu.buf = NULL;
+	}
+
 	return 0;
 }
 
@@ -1723,6 +1768,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	mutex_init(&c->log_mutex);
 	mutex_init(&c->mst_mutex);
 	mutex_init(&c->umount_mutex);
+	mutex_init(&c->bu_mutex);
 	init_waitqueue_head(&c->cmt_wq);
 	c->buds = RB_ROOT;
 	c->old_idx = RB_ROOT;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index d27fd918b9c..6eef5344a14 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1501,7 +1501,12 @@ out:
  * @bu: bulk-read parameters and results
  *
  * Lookup consecutive data node keys for the same inode that reside
- * consecutively in the same LEB.
+ * consecutively in the same LEB. This function returns zero in case of success
+ * and a negative error code in case of failure.
+ *
+ * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function
+ * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares
+ * maxumum possible amount of nodes for bulk-read.
  */
 int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
 {
@@ -2677,7 +2682,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
 	struct ubifs_dent_node *xent, *pxent = NULL;
 	struct qstr nm = { .name = NULL };
 
-	dbg_tnc("ino %lu", inum);
+	dbg_tnc("ino %lu", (unsigned long)inum);
 
 	/*
 	 * Walk all extended attribute entries and remove them together with
@@ -2697,7 +2702,8 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
 		}
 
 		xattr_inum = le64_to_cpu(xent->inum);
-		dbg_tnc("xent '%s', ino %lu", xent->name, xattr_inum);
+		dbg_tnc("xent '%s', ino %lu", xent->name,
+			(unsigned long)xattr_inum);
 
 		nm.name = xent->name;
 		nm.len = le16_to_cpu(xent->nlen);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a7bd32fa15b..46b172560a0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -753,7 +753,7 @@ struct ubifs_znode {
 };
 
 /**
- * struct bu_info - bulk-read information
+ * struct bu_info - bulk-read information.
  * @key: first data node key
  * @zbranch: zbranches of data nodes to bulk read
  * @buf: buffer to read into
@@ -969,7 +969,10 @@ struct ubifs_mount_opts {
  * @mst_node: master node
  * @mst_offs: offset of valid master node
  * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
- * @bulk_read_buf_size: buffer size for bulk-reads
+ *
+ * @max_bu_buf_len: maximum bulk-read buffer length
+ * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
+ * @bu: pre-allocated bulk-read information
  *
  * @log_lebs: number of logical eraseblocks in the log
  * @log_bytes: log size in bytes
@@ -1217,7 +1220,10 @@ struct ubifs_info {
 	struct ubifs_mst_node *mst_node;
 	int mst_offs;
 	struct mutex mst_mutex;
-	int bulk_read_buf_size;
+
+	int max_bu_buf_len;
+	struct mutex bu_mutex;
+	struct bu_info bu;
 
 	int log_lebs;
 	long long log_bytes;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 6e74b117aaf..30ebde490f7 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -106,6 +106,7 @@ void udf_clear_inode(struct inode *inode)
 		udf_truncate_tail_extent(inode);
 		unlock_kernel();
 		write_inode_now(inode, 0);
+		invalidate_inode_buffers(inode);
 	}
 	iinfo = UDF_I(inode);
 	kfree(iinfo->i_ext.i_data);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9e561a9cefc..a11a8390bf6 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1566,11 +1566,14 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	int nmap, error, w, count, c, got, i, mapi;
 	xfs_trans_t *tp;
 	xfs_mount_t *mp;
+	xfs_drfsbno_t	nblks;
 
 	dp = args->dp;
 	mp = dp->i_mount;
 	w = args->whichfork;
 	tp = args->trans;
+	nblks = dp->i_d.di_nblocks;
+
 	/*
 	 * For new directories adjust the file offset and block count.
 	 */
@@ -1647,6 +1650,8 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	}
 	if (mapp != &map)
 		kmem_free(mapp);
+	/* account for newly allocated blocks in reserved blocks total */
+	args->total -= dp->i_d.di_nblocks - nblks;
 	*new_blkno = (xfs_dablk_t)bno;
 	return 0;
 }
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 80e0dc51361..1afb12278b8 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -525,11 +525,13 @@ xfs_dir2_grow_inode(
 	xfs_mount_t	*mp;
 	int		nmap;		/* number of bmap entries */
 	xfs_trans_t	*tp;
+	xfs_drfsbno_t	nblks;
 
 	xfs_dir2_trace_args_s("grow_inode", args, space);
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
+	nblks = dp->i_d.di_nblocks;
 	/*
 	 * Set lowest possible block in the space requested.
 	 */
@@ -622,7 +624,11 @@ xfs_dir2_grow_inode(
 	 */
 	if (mapp != &map)
 		kmem_free(mapp);
+
+	/* account for newly allocated blocks in reserved blocks total */
+	args->total -= dp->i_d.di_nblocks - nblks;
 	*dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno);
+
 	/*
 	 * Update file's size if this is the data space and it grew.
 	 */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index dbd9cef852e..a391b955df0 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1414,7 +1414,7 @@ xfs_itruncate_start(
 	mp = ip->i_mount;
 
 	/* wait for the completion of any pending DIOs */
-	if (new_size < ip->i_size)
+	if (new_size == 0 || new_size < ip->i_size)
 		vn_iowait(ip);
 
 	/*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 0b02c644355..3608a0f0a5f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -563,6 +563,11 @@ xfs_log_mount(
 	}
 
 	mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
+	if (!mp->m_log) {
+		cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!");
+		error = ENOMEM;
+		goto out;
+	}
 
 	/*
 	 * Initialize the AIL now we have a log.
@@ -601,6 +606,7 @@ xfs_log_mount(
 	return 0;
 error:
 	xfs_log_unmount_dealloc(mp);
+out:
 	return error;
 }	/* xfs_log_mount */
 
@@ -1217,7 +1223,9 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	int			i;
 	int			iclogsize;
 
-	log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP);
+	log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
+	if (!log)
+		return NULL;
 
 	log->l_mp	   = mp;
 	log->l_targ	   = log_target;
@@ -1249,6 +1257,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	xlog_get_iclog_buffer_size(mp, log);
 
 	bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
+	if (!bp)
+		goto out_free_log;
 	XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
 	XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
 	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
@@ -1275,13 +1285,17 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	iclogsize = log->l_iclog_size;
 	ASSERT(log->l_iclog_size >= 4096);
 	for (i=0; i < log->l_iclog_bufs; i++) {
-		*iclogp = (xlog_in_core_t *)
-			  kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
+		*iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
+		if (!*iclogp)
+			goto out_free_iclog;
+
 		iclog = *iclogp;
 		iclog->ic_prev = prev_iclog;
 		prev_iclog = iclog;
 
 		bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp);
+		if (!bp)
+			goto out_free_iclog;
 		if (!XFS_BUF_CPSEMA(bp))
 			ASSERT(0);
 		XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
@@ -1323,6 +1337,25 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */
 
 	return log;
+
+out_free_iclog:
+	for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
+		prev_iclog = iclog->ic_next;
+		if (iclog->ic_bp) {
+			sv_destroy(&iclog->ic_force_wait);
+			sv_destroy(&iclog->ic_write_wait);
+			xfs_buf_free(iclog->ic_bp);
+			xlog_trace_iclog_dealloc(iclog);
+		}
+		kmem_free(iclog);
+	}
+	spinlock_destroy(&log->l_icloglock);
+	spinlock_destroy(&log->l_grant_lock);
+	xlog_trace_loggrant_dealloc(log);
+	xfs_buf_free(log->l_xbuf);
+out_free_log:
+	kmem_free(log);
+	return NULL;
 }	/* xlog_alloc_log */
 
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 82d46ce69d5..70e3ba32e6b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1419,7 +1419,13 @@ xlog_recover_add_to_trans(
 		return 0;
 	item = trans->r_itemq;
 	if (item == NULL) {
-		ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
+		/* we need to catch log corruptions here */
+		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
+			xlog_warn("XFS: xlog_recover_add_to_trans: "
+				  "bad header magic number");
+			ASSERT(0);
+			return XFS_ERROR(EIO);
+		}
 		if (len == sizeof(xfs_trans_header_t))
 			xlog_recover_add_item(&trans->r_itemq);
 		memcpy(&trans->r_theader, dp, len); /* d, s, l */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index a4503f5e949..15f5dd22fbb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1245,6 +1245,9 @@ xfs_unmountfs(
 
 	XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
 
+	if (mp->m_quotainfo)
+		XFS_QM_DONE(mp);
+
 	/*
 	 * Flush out the log synchronously so that we know for sure
 	 * that nothing is pinned.  This is important because bflush()
@@ -1297,8 +1300,6 @@ xfs_unmountfs(
 	xfs_errortag_clearall(mp, 0);
 #endif
 	xfs_free_perag(mp);
-	if (mp->m_quotainfo)
-		XFS_QM_DONE(mp);
 }
 
 STATIC void
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d700dacdb10..c903130be7f 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -212,7 +212,7 @@ xfs_rename(
 	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
 		     (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
 		error = XFS_ERROR(EXDEV);
-		xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
+		xfs_rename_unlock4(inodes, XFS_ILOCK_EXCL);
 		xfs_trans_cancel(tp, cancel_flags);
 		goto std_return;
 	}