367 files changed, 11826 insertions, 9723 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 51c94e26a34..e777961939f 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -343,18 +343,7 @@ int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
 
 	BUG_ON(!vcookie->fscache);
 
-	if (PageFsCache(page)) {
-		if (fscache_check_page_write(vcookie->fscache, page)) {
-			if (!(gfp & __GFP_WAIT))
-				return 0;
-			fscache_wait_on_page_write(vcookie->fscache, page);
-		}
-
-		fscache_uncache_page(vcookie->fscache, page);
-		ClearPageFsCache(page);
-	}
-
-	return 1;
+	return fscache_maybe_release_page(vcookie->fscache, page, gfp);
 }
 
 void __v9fs_fscache_invalidate_page(struct page *page)
@@ -368,7 +357,6 @@ void __v9fs_fscache_invalidate_page(struct page *page)
 		fscache_wait_on_page_write(vcookie->fscache, page);
 		BUG_ON(!PageLocked(page));
 		fscache_uncache_page(vcookie->fscache, page);
-		ClearPageFsCache(page);
 	}
 }
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d44efad7a..f8fccaaad62 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -6,6 +6,10 @@ menu "File systems"
 
 if BLOCK
 
+config FS_JOURNAL_INFO
+	bool
+	default n
+
 source "fs/ext2/Kconfig"
 source "fs/ext3/Kconfig"
 source "fs/ext4/Kconfig"
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 681c2a7b013..39b301662f2 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -315,7 +315,6 @@ static void afs_invalidatepage(struct page *page, unsigned long offset)
 			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
 			fscache_wait_on_page_write(vnode->cache, page);
 			fscache_uncache_page(vnode->cache, page);
-			ClearPageFsCache(page);
 		}
 #endif
 
@@ -349,17 +348,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 	/* deny if page is being written to the cache and the caller hasn't
 	 * elected to wait */
 #ifdef CONFIG_AFS_FSCACHE
-	if (PageFsCache(page)) {
-		if (fscache_check_page_write(vnode->cache, page)) {
-			if (!(gfp_flags & __GFP_WAIT)) {
-				_leave(" = F [cache busy]");
-				return 0;
-			}
-			fscache_wait_on_page_write(vnode->cache, page);
-		}
-
-		fscache_uncache_page(vnode->cache, page);
-		ClearPageFsCache(page);
+	if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) {
+		_leave(" = F [cache busy]");
+		return 0;
 	}
 #endif
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c63a3c8beb7..5e15a21dbf9 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -671,7 +671,6 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
 	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
 	ssize_t result;
 	size_t count = iov_length(iov, nr_segs);
-	int ret;
 
 	_enter("{%x.%u},{%zu},%lu,",
 	       vnode->fid.vid, vnode->fid.vnode, count, nr_segs);
@@ -691,13 +690,6 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
 		return result;
 	}
 
-	/* return error values for O_SYNC and IS_SYNC() */
-	if (IS_SYNC(&vnode->vfs_inode) || iocb->ki_filp->f_flags & O_SYNC) {
-		ret = afs_fsync(iocb->ki_filp, dentry, 1);
-		if (ret < 0)
-			result = ret;
-	}
-
 	_leave(" = %zd", result);
 	return result;
 }
diff --git a/fs/aio.c b/fs/aio.c
index 02a2c934057..1cf12b3dd83 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -15,6 +15,7 @@
 #include <linux/aio_abi.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/backing-dev.h>
 #include <linux/uio.h>
 
 #define DEBUG 0
@@ -32,6 +33,9 @@
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/eventfd.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/hash.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -60,6 +64,14 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
 
+#define AIO_BATCH_HASH_BITS	3 /* allocated on-stack, so don't go crazy */
+#define AIO_BATCH_HASH_SIZE	(1 << AIO_BATCH_HASH_BITS)
+struct aio_batch_entry {
+	struct hlist_node list;
+	struct address_space *mapping;
+};
+mempool_t *abe_pool;
+
 static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);
 
@@ -73,6 +85,8 @@ static int __init aio_setup(void)
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
 	aio_wq = create_workqueue("aio");
+	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
+	BUG_ON(!abe_pool);
 
 	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 
@@ -697,10 +711,8 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
 	 */
 	ret = retry(iocb);
 
-	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
-		BUG_ON(!list_empty(&iocb->ki_wait.task_list));
+	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED)
 		aio_complete(iocb, ret, 0);
-	}
 out:
 	spin_lock_irq(&ctx->ctx_lock);
 
@@ -852,13 +864,6 @@ static void try_queue_kicked_iocb(struct kiocb *iocb)
 	unsigned long flags;
 	int run = 0;
 
-	/* We're supposed to be the only path putting the iocb back on the run
-	 * list.  If we find that the iocb is *back* on a wait queue already
-	 * than retry has happened before we could queue the iocb.  This also
-	 * means that the retry could have completed and freed our iocb, no
-	 * good. */
-	BUG_ON((!list_empty(&iocb->ki_wait.task_list)));
-
 	spin_lock_irqsave(&ctx->ctx_lock, flags);
 	/* set this inside the lock so that we can't race with aio_run_iocb()
 	 * testing it and putting the iocb on the run list under the lock */
@@ -872,7 +877,7 @@ static void try_queue_kicked_iocb(struct kiocb *iocb)
 /*
  * kick_iocb:
  *      Called typically from a wait queue callback context
- *      (aio_wake_function) to trigger a retry of the iocb.
+ *      to trigger a retry of the iocb.
  *      The retry is usually executed by aio workqueue
  *      threads (See aio_kick_handler).
  */
@@ -1506,33 +1511,44 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
 	return 0;
 }
 
-/*
- * aio_wake_function:
- * 	wait queue callback function for aio notification,
- * 	Simply triggers a retry of the operation via kick_iocb.
- *
- * 	This callback is specified in the wait queue entry in
- *	a kiocb.
- *
- * Note:
- * This routine is executed with the wait queue lock held.
- * Since kick_iocb acquires iocb->ctx->ctx_lock, it nests
- * the ioctx lock inside the wait queue lock. This is safe
- * because this callback isn't used for wait queues which
- * are nested inside ioctx lock (i.e. ctx->wait)
- */
-static int aio_wake_function(wait_queue_t *wait, unsigned mode,
-			     int sync, void *key)
+static void aio_batch_add(struct address_space *mapping,
+			  struct hlist_head *batch_hash)
+{
+	struct aio_batch_entry *abe;
+	struct hlist_node *pos;
+	unsigned bucket;
+
+	bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
+	hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
+		if (abe->mapping == mapping)
+			return;
+	}
+
+	abe = mempool_alloc(abe_pool, GFP_KERNEL);
+	BUG_ON(!igrab(mapping->host));
+	abe->mapping = mapping;
+	hlist_add_head(&abe->list, &batch_hash[bucket]);
+	return;
+}
+
+static void aio_batch_free(struct hlist_head *batch_hash)
 {
-	struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait);
+	struct aio_batch_entry *abe;
+	struct hlist_node *pos, *n;
+	int i;
 
-	list_del_init(&wait->task_list);
-	kick_iocb(iocb);
-	return 1;
+	for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
+		hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
+			blk_run_address_space(abe->mapping);
+			iput(abe->mapping->host);
+			hlist_del(&abe->list);
+			mempool_free(abe, abe_pool);
+		}
+	}
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb)
+			 struct iocb *iocb, struct hlist_head *batch_hash)
 {
 	struct kiocb *req;
 	struct file *file;
@@ -1592,8 +1608,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
 	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
 	req->ki_opcode = iocb->aio_lio_opcode;
-	init_waitqueue_func_entry(&req->ki_wait, aio_wake_function);
-	INIT_LIST_HEAD(&req->ki_wait.task_list);
 
 	ret = aio_setup_iocb(req);
 
@@ -1608,6 +1622,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			;
 	}
 	spin_unlock_irq(&ctx->ctx_lock);
+	if (req->ki_opcode == IOCB_CMD_PREAD ||
+	    req->ki_opcode == IOCB_CMD_PREADV ||
+	    req->ki_opcode == IOCB_CMD_PWRITE ||
+	    req->ki_opcode == IOCB_CMD_PWRITEV)
+		aio_batch_add(file->f_mapping, batch_hash);
+
 	aio_put_req(req);	/* drop extra ref to req */
 	return 0;
 
@@ -1635,6 +1655,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 	struct kioctx *ctx;
 	long ret = 0;
 	int i;
+	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
 
 	if (unlikely(nr < 0))
 		return -EINVAL;
@@ -1666,10 +1687,11 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp);
+		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
 		if (ret)
 			break;
 	}
+	aio_batch_free(batch_hash);
 
 	put_ioctx(ctx);
 	return i ? i : ret;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 8f7cdde4173..0118d67221b 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -60,6 +60,11 @@ do {							\
 		current->pid, __func__, ##args);	\
 } while (0)
 
+struct rehash_entry {
+	struct task_struct *task;
+	struct list_head list;
+};
+
 /* Unified info structure.  This is pointed to by both the dentry and
    inode structures.  Each file in the filesystem has an instance of this
    structure.  It holds a reference to the dentry, so dentries are never
@@ -75,6 +80,9 @@ struct autofs_info {
 	struct completion expire_complete;
 
 	struct list_head active;
+	int active_count;
+	struct list_head rehash_list;
+
 	struct list_head expiring;
 
 	struct autofs_sb_info *sbi;
@@ -95,6 +103,8 @@ struct autofs_info {
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
 #define AUTOFS_INF_MOUNTPOINT	(1<<1) /* mountpoint status for direct expire */
+#define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
+#define AUTOFS_INF_REHASH	(1<<3) /* dentry in transit to ->lookup() */
 
 struct autofs_wait_queue {
 	wait_queue_head_t queue;
@@ -161,7 +171,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
 {
 	struct autofs_info *inf = autofs4_dentry_ino(dentry);
 
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
+	if (inf->flags & AUTOFS_INF_PENDING)
 		return 1;
 
 	if (inf->flags & AUTOFS_INF_EXPIRING)
@@ -264,5 +274,31 @@ out:
 	return ret;
 }
 
+static inline void autofs4_add_expiring(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		if (list_empty(&ino->expiring))
+			list_add(&ino->expiring, &sbi->expiring_list);
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static inline void autofs4_del_expiring(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		if (!list_empty(&ino->expiring))
+			list_del_init(&ino->expiring);
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
 void autofs4_dentry_release(struct dentry *);
 extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3da18d45348..74bc9aa6df3 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -27,7 +27,7 @@ static inline int autofs4_can_expire(struct dentry *dentry,
 		return 0;
 
 	/* No point expiring a pending mount */
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
+	if (ino->flags & AUTOFS_INF_PENDING)
 		return 0;
 
 	if (!do_now) {
@@ -279,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 			root->d_mounted--;
 		}
 		ino->flags |= AUTOFS_INF_EXPIRING;
+		autofs4_add_expiring(root);
 		init_completion(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		return root;
@@ -406,6 +407,7 @@ found:
 		expired, (int)expired->d_name.len, expired->d_name.name);
 	ino = autofs4_dentry_ino(expired);
 	ino->flags |= AUTOFS_INF_EXPIRING;
+	autofs4_add_expiring(expired);
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 	spin_lock(&dcache_lock);
@@ -433,7 +435,7 @@ int autofs4_expire_wait(struct dentry *dentry)
 
 		DPRINTK("expire done status=%d", status);
 
-		if (d_unhashed(dentry))
+		if (d_unhashed(dentry) && IS_DEADDIR(dentry->d_inode))
 			return -EAGAIN;
 
 		return status;
@@ -473,6 +475,7 @@ int autofs4_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
+	autofs4_del_expiring(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -503,6 +506,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
 		}
 		ino->flags &= ~AUTOFS_INF_EXPIRING;
+		autofs4_del_expiring(dentry);
 		complete_all(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 69c8142da83..d0a3de24745 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -49,6 +49,8 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
 		ino->dentry = NULL;
 		ino->size = 0;
 		INIT_LIST_HEAD(&ino->active);
+		INIT_LIST_HEAD(&ino->rehash_list);
+		ino->active_count = 0;
 		INIT_LIST_HEAD(&ino->expiring);
 		atomic_set(&ino->count, 0);
 	}
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index b96a3c57359..30cc9ddf4b7 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -72,6 +72,139 @@ const struct inode_operations autofs4_dir_inode_operations = {
 	.rmdir		= autofs4_dir_rmdir,
 };
 
+static void autofs4_add_active(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		if (!ino->active_count) {
+			if (list_empty(&ino->active))
+				list_add(&ino->active, &sbi->active_list);
+		}
+		ino->active_count++;
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static void autofs4_del_active(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		ino->active_count--;
+		if (!ino->active_count) {
+			if (!list_empty(&ino->active))
+				list_del_init(&ino->active);
+		}
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static void autofs4_add_rehash_entry(struct autofs_info *ino,
+				     struct rehash_entry *entry)
+{
+	entry->task = current;
+	INIT_LIST_HEAD(&entry->list);
+	list_add(&entry->list, &ino->rehash_list);
+	return;
+}
+
+static void autofs4_remove_rehash_entry(struct autofs_info *ino)
+{
+	struct list_head *head = &ino->rehash_list;
+	struct rehash_entry *entry;
+	list_for_each_entry(entry, head, list) {
+		if (entry->task == current) {
+			list_del(&entry->list);
+			kfree(entry);
+			break;
+		}
+	}
+	return;
+}
+
+static void autofs4_remove_rehash_entrys(struct autofs_info *ino)
+{
+	struct autofs_sb_info *sbi = ino->sbi;
+	struct rehash_entry *entry, *next;
+	struct list_head *head;
+
+	spin_lock(&sbi->fs_lock);
+	spin_lock(&sbi->lookup_lock);
+	if (!(ino->flags & AUTOFS_INF_REHASH)) {
+		spin_unlock(&sbi->lookup_lock);
+		spin_unlock(&sbi->fs_lock);
+		return;
+	}
+	ino->flags &= ~AUTOFS_INF_REHASH;
+	head = &ino->rehash_list;
+	list_for_each_entry_safe(entry, next, head, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+	spin_unlock(&sbi->lookup_lock);
+	spin_unlock(&sbi->fs_lock);
+	dput(ino->dentry);
+
+	return;
+}
+
+static void autofs4_revalidate_drop(struct dentry *dentry,
+				    struct rehash_entry *entry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	/*
+	 * Add to the active list so we can pick this up in
+	 * ->lookup(). Also add an entry to a rehash list so
+	 * we know when there are no dentrys in flight so we
+	 * know when we can rehash the dentry.
+	 */
+	spin_lock(&sbi->lookup_lock);
+	if (list_empty(&ino->active))
+		list_add(&ino->active, &sbi->active_list);
+	autofs4_add_rehash_entry(ino, entry);
+	spin_unlock(&sbi->lookup_lock);
+	if (!(ino->flags & AUTOFS_INF_REHASH)) {
+		ino->flags |= AUTOFS_INF_REHASH;
+		dget(dentry);
+		spin_lock(&dentry->d_lock);
+		__d_drop(dentry);
+		spin_unlock(&dentry->d_lock);
+	}
+	return;
+}
+
+static void autofs4_revalidate_rehash(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino->flags & AUTOFS_INF_REHASH) {
+		spin_lock(&sbi->lookup_lock);
+		autofs4_remove_rehash_entry(ino);
+		if (list_empty(&ino->rehash_list)) {
+			spin_unlock(&sbi->lookup_lock);
+			ino->flags &= ~AUTOFS_INF_REHASH;
+			d_rehash(dentry);
+			dput(ino->dentry);
+		} else
+			spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static unsigned int autofs4_need_mount(unsigned int flags)
+{
+	unsigned int res = 0;
+	if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS))
+		res = 1;
+	return res;
+}
+
 static int autofs4_dir_open(struct inode *inode, struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
@@ -93,7 +226,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	 * it.
 	 */
 	spin_lock(&dcache_lock);
-	if (!d_mountpoint(dentry) && __simple_empty(dentry)) {
+	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		spin_unlock(&dcache_lock);
 		return -ENOENT;
 	}
@@ -103,7 +236,7 @@ out:
 	return dcache_dir_open(inode, file);
 }
 
-static int try_to_fill_dentry(struct dentry *dentry, int flags)
+static int try_to_fill_dentry(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -116,55 +249,17 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
 	 * Wait for a pending mount, triggering one if there
 	 * isn't one already
 	 */
-	if (dentry->d_inode == NULL) {
-		DPRINTK("waiting for mount name=%.*s",
-			 dentry->d_name.len, dentry->d_name.name);
-
-		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-
-		DPRINTK("mount done status=%d", status);
-
-		/* Turn this into a real negative dentry? */
-		if (status == -ENOENT) {
-			spin_lock(&dentry->d_lock);
-			dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-			spin_unlock(&dentry->d_lock);
-			return status;
-		} else if (status) {
-			/* Return a negative dentry, but leave it "pending" */
-			return status;
-		}
-	/* Trigger mount for path component or follow link */
-	} else if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
-			flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) ||
-			current->link_count) {
-		DPRINTK("waiting for mount name=%.*s",
-			dentry->d_name.len, dentry->d_name.name);
-
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags |= DCACHE_AUTOFS_PENDING;
-		spin_unlock(&dentry->d_lock);
-		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+	DPRINTK("waiting for mount name=%.*s",
+		 dentry->d_name.len, dentry->d_name.name);
 
-		DPRINTK("mount done status=%d", status);
+	status = autofs4_wait(sbi, dentry, NFY_MOUNT);
 
-		if (status) {
-			spin_lock(&dentry->d_lock);
-			dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-			spin_unlock(&dentry->d_lock);
-			return status;
-		}
-	}
-
-	/* Initialize expiry counter after successful mount */
-	if (ino)
-		ino->last_used = jiffies;
+	DPRINTK("mount done status=%d", status);
 
-	spin_lock(&dentry->d_lock);
-	dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-	spin_unlock(&dentry->d_lock);
+	/* Update expiry counter */
+	ino->last_used = jiffies;
 
-	return 0;
+	return status;
 }
 
 /* For autofs direct mounts the follow link triggers the mount */
@@ -202,27 +297,39 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	autofs4_expire_wait(dentry);
 
 	/* We trigger a mount for almost all flags */
-	lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS);
-	if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING))
+	lookup_type = autofs4_need_mount(nd->flags);
+	spin_lock(&sbi->fs_lock);
+	spin_lock(&dcache_lock);
+	if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
+		spin_unlock(&dcache_lock);
+		spin_unlock(&sbi->fs_lock);
 		goto follow;
+	}
 
 	/*
 	 * If the dentry contains directories then it is an autofs
 	 * multi-mount with no root mount offset. So don't try to
 	 * mount it again.
 	 */
-	spin_lock(&dcache_lock);
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
-	    (!d_mountpoint(dentry) && __simple_empty(dentry))) {
+	if (ino->flags & AUTOFS_INF_PENDING ||
+	    (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
+		ino->flags |= AUTOFS_INF_PENDING;
 		spin_unlock(&dcache_lock);
+		spin_unlock(&sbi->fs_lock);
+
+		status = try_to_fill_dentry(dentry);
+
+		spin_lock(&sbi->fs_lock);
+		ino->flags &= ~AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
 
-		status = try_to_fill_dentry(dentry, 0);
 		if (status)
 			goto out_error;
 
 		goto follow;
 	}
 	spin_unlock(&dcache_lock);
+	spin_unlock(&sbi->fs_lock);
 follow:
 	/*
 	 * If there is no root mount it must be an autofs
@@ -254,18 +361,47 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
-	int oz_mode = autofs4_oz_mode(sbi);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct rehash_entry *entry;
 	int flags = nd ? nd->flags : 0;
-	int status = 1;
+	unsigned int mutex_aquired;
+
+	DPRINTK("name = %.*s oz_mode = %d",
+		dentry->d_name.len, dentry->d_name.name, oz_mode);
+
+	/* Daemon never causes a mount to trigger */
+	if (autofs4_oz_mode(sbi))
+		return 1;
+
+	entry = kmalloc(sizeof(struct rehash_entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	mutex_aquired = mutex_trylock(&dir->i_mutex);
 
-	/* Pending dentry */
 	spin_lock(&sbi->fs_lock);
+	spin_lock(&dcache_lock);
+	/* Pending dentry */
 	if (autofs4_ispending(dentry)) {
-		/* The daemon never causes a mount to trigger */
-		spin_unlock(&sbi->fs_lock);
+		int status;
 
-		if (oz_mode)
-			return 1;
+		/*
+		 * We can only unhash and send this to ->lookup() if
+		 * the directory mutex is held over d_revalidate() and
+		 * ->lookup(). This prevents the VFS from incorrectly
+		 * seeing the dentry as non-existent.
+		 */
+		ino->flags |= AUTOFS_INF_PENDING;
+		if (!mutex_aquired) {
+			autofs4_revalidate_drop(dentry, entry);
+			spin_unlock(&dcache_lock);
+			spin_unlock(&sbi->fs_lock);
+			return 0;
+		}
+		spin_unlock(&dcache_lock);
+		spin_unlock(&sbi->fs_lock);
+		mutex_unlock(&dir->i_mutex);
+		kfree(entry);
 
 		/*
 		 * If the directory has gone away due to an expire
@@ -279,46 +415,82 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 		 * A zero status is success otherwise we have a
 		 * negative error code.
 		 */
-		status = try_to_fill_dentry(dentry, flags);
+		status = try_to_fill_dentry(dentry);
+
+		spin_lock(&sbi->fs_lock);
+		ino->flags &= ~AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
+
 		if (status == 0)
 			return 1;
 
 		return status;
 	}
-	spin_unlock(&sbi->fs_lock);
-
-	/* Negative dentry.. invalidate if "old" */
-	if (dentry->d_inode == NULL)
-		return 0;
 
 	/* Check for a non-mountpoint directory with no contents */
-	spin_lock(&dcache_lock);
 	if (S_ISDIR(dentry->d_inode->i_mode) &&
-	    !d_mountpoint(dentry) && 
-	    __simple_empty(dentry)) {
+	    !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		DPRINTK("dentry=%p %.*s, emptydir",
 			 dentry, dentry->d_name.len, dentry->d_name.name);
-		spin_unlock(&dcache_lock);
 
-		/* The daemon never causes a mount to trigger */
-		if (oz_mode)
-			return 1;
+		if (autofs4_need_mount(flags) || current->link_count) {
+			int status;
 
-		/*
-		 * A zero status is success otherwise we have a
-		 * negative error code.
-		 */
-		status = try_to_fill_dentry(dentry, flags);
-		if (status == 0)
-			return 1;
+			/*
+			 * We can only unhash and send this to ->lookup() if
+			 * the directory mutex is held over d_revalidate() and
+			 * ->lookup(). This prevents the VFS from incorrectly
+			 * seeing the dentry as non-existent.
+			 */
+			ino->flags |= AUTOFS_INF_PENDING;
+			if (!mutex_aquired) {
+				autofs4_revalidate_drop(dentry, entry);
+				spin_unlock(&dcache_lock);
+				spin_unlock(&sbi->fs_lock);
+				return 0;
+			}
+			spin_unlock(&dcache_lock);
+			spin_unlock(&sbi->fs_lock);
+			mutex_unlock(&dir->i_mutex);
+			kfree(entry);
 
-		return status;
+			/*
+			 * A zero status is success otherwise we have a
+			 * negative error code.
+			 */
+			status = try_to_fill_dentry(dentry);
+
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
+
+			if (status == 0)
+				return 1;
+
+			return status;
+		}
 	}
 	spin_unlock(&dcache_lock);
+	spin_unlock(&sbi->fs_lock);
+
+	if (mutex_aquired)
+		mutex_unlock(&dir->i_mutex);
+
+	kfree(entry);
 
 	return 1;
 }
 
+static void autofs4_free_rehash_entrys(struct autofs_info *inf)
+{
+	struct list_head *head = &inf->rehash_list;
+	struct rehash_entry *entry, *next;
+	list_for_each_entry_safe(entry, next, head, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
 void autofs4_dentry_release(struct dentry *de)
 {
 	struct autofs_info *inf;
@@ -337,6 +509,8 @@ void autofs4_dentry_release(struct dentry *de)
 				list_del(&inf->active);
 			if (!list_empty(&inf->expiring))
 				list_del(&inf->expiring);
+			if (!list_empty(&inf->rehash_list))
+				autofs4_free_rehash_entrys(inf);
 			spin_unlock(&sbi->lookup_lock);
 		}
 
@@ -359,35 +533,52 @@ static const struct dentry_operations autofs4_dentry_operations = {
 	.d_release	= autofs4_dentry_release,
 };
 
-static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct dentry *parent = dentry->d_parent;
+	struct qstr *name = &dentry->d_name;
 	unsigned int len = name->len;
 	unsigned int hash = name->hash;
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
+restart:
 	spin_lock(&dcache_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->active_list;
 	list_for_each(p, head) {
 		struct autofs_info *ino;
-		struct dentry *dentry;
+		struct dentry *active;
 		struct qstr *qstr;
 
 		ino = list_entry(p, struct autofs_info, active);
-		dentry = ino->dentry;
+		active = ino->dentry;
 
-		spin_lock(&dentry->d_lock);
+		spin_lock(&active->d_lock);
 
 		/* Already gone? */
-		if (atomic_read(&dentry->d_count) == 0)
+		if (atomic_read(&active->d_count) == 0)
 			goto next;
 
-		qstr = &dentry->d_name;
+		if (active->d_inode && IS_DEADDIR(active->d_inode)) {
+			if (!list_empty(&ino->rehash_list)) {
+				dget(active);
+				spin_unlock(&active->d_lock);
+				spin_unlock(&sbi->lookup_lock);
+				spin_unlock(&dcache_lock);
+				autofs4_remove_rehash_entrys(ino);
+				dput(active);
+				goto restart;
+			}
+			goto next;
+		}
+
+		qstr = &active->d_name;
 
-		if (dentry->d_name.hash != hash)
+		if (active->d_name.hash != hash)
 			goto next;
-		if (dentry->d_parent != parent)
+		if (active->d_parent != parent)
 			goto next;
 
 		if (qstr->len != len)
@@ -395,15 +586,13 @@ static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct d
 		if (memcmp(qstr->name, str, len))
 			goto next;
 
-		if (d_unhashed(dentry)) {
-			dget(dentry);
-			spin_unlock(&dentry->d_lock);
-			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&dcache_lock);
-			return dentry;
-		}
+		dget(active);
+		spin_unlock(&active->d_lock);
+		spin_unlock(&sbi->lookup_lock);
+		spin_unlock(&dcache_lock);
+		return active;
 next:
-		spin_unlock(&dentry->d_lock);
+		spin_unlock(&active->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
 	spin_unlock(&dcache_lock);
@@ -411,8 +600,11 @@ next:
 	return NULL;
 }
 
-static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct dentry *parent = dentry->d_parent;
+	struct qstr *name = &dentry->d_name;
 	unsigned int len = name->len;
 	unsigned int hash = name->hash;
 	const unsigned char *str = name->name;
@@ -423,23 +615,23 @@ static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct
 	head = &sbi->expiring_list;
 	list_for_each(p, head) {
 		struct autofs_info *ino;
-		struct dentry *dentry;
+		struct dentry *expiring;
 		struct qstr *qstr;
 
 		ino = list_entry(p, struct autofs_info, expiring);
-		dentry = ino->dentry;
+		expiring = ino->dentry;
 
-		spin_lock(&dentry->d_lock);
+		spin_lock(&expiring->d_lock);
 
 		/* Bad luck, we've already been dentry_iput */
-		if (!dentry->d_inode)
+		if (!expiring->d_inode)
 			goto next;
 
-		qstr = &dentry->d_name;
+		qstr = &expiring->d_name;
 
-		if (dentry->d_name.hash != hash)
+		if (expiring->d_name.hash != hash)
 			goto next;
-		if (dentry->d_parent != parent)
+		if (expiring->d_parent != parent)
 			goto next;
 
 		if (qstr->len != len)
@@ -447,15 +639,13 @@ static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct
 		if (memcmp(qstr->name, str, len))
 			goto next;
 
-		if (d_unhashed(dentry)) {
-			dget(dentry);
-			spin_unlock(&dentry->d_lock);
-			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&dcache_lock);
-			return dentry;
-		}
+		dget(expiring);
+		spin_unlock(&expiring->d_lock);
+		spin_unlock(&sbi->lookup_lock);
+		spin_unlock(&dcache_lock);
+		return expiring;
 next:
-		spin_unlock(&dentry->d_lock);
+		spin_unlock(&expiring->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
 	spin_unlock(&dcache_lock);
@@ -463,13 +653,56 @@ next:
 	return NULL;
 }
 
+static struct autofs_info *init_new_dentry(struct autofs_sb_info *sbi,
+					   struct dentry *dentry, int oz_mode)
+{
+	struct autofs_info *ino;
+
+	/*
+	 * Mark the dentry incomplete but don't hash it. We do this
+	 * to serialize our inode creation operations (symlink and
+	 * mkdir) which prevents deadlock during the callback to
+	 * the daemon. Subsequent user space lookups for the same
+	 * dentry are placed on the wait queue while the daemon
+	 * itself is allowed passage unresticted so the create
+	 * operation itself can then hash the dentry. Finally,
+	 * we check for the hashed dentry and return the newly
+	 * hashed dentry.
+	 */
+	dentry->d_op = &autofs4_root_dentry_operations;
+
+	/*
+	 * And we need to ensure that the same dentry is used for
+	 * all following lookup calls until it is hashed so that
+	 * the dentry flags are persistent throughout the request.
+	 */
+	ino = autofs4_init_ino(NULL, sbi, 0555);
+	if (!ino)
+		return ERR_PTR(-ENOMEM);
+
+	dentry->d_fsdata = ino;
+	ino->dentry = dentry;
+
+	/*
+	 * Only set the mount pending flag for new dentrys not created
+	 * by the daemon.
+	 */
+	if (!oz_mode)
+		ino->flags |= AUTOFS_INF_PENDING;
+
+	d_instantiate(dentry, NULL);
+
+	return ino;
+}
+
 /* Lookups in the root directory */
 static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
-	struct dentry *expiring, *unhashed;
+	struct dentry *expiring, *active;
 	int oz_mode;
+	int status = 0;
 
 	DPRINTK("name = %.*s",
 		dentry->d_name.len, dentry->d_name.name);
@@ -484,123 +717,100 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
 		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
-	unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
-	if (unhashed)
-		dentry = unhashed;
-	else {
-		/*
-		 * Mark the dentry incomplete but don't hash it. We do this
-		 * to serialize our inode creation operations (symlink and
-		 * mkdir) which prevents deadlock during the callback to
-		 * the daemon. Subsequent user space lookups for the same
-		 * dentry are placed on the wait queue while the daemon
-		 * itself is allowed passage unresticted so the create
-		 * operation itself can then hash the dentry. Finally,
-		 * we check for the hashed dentry and return the newly
-		 * hashed dentry.
-		 */
-		dentry->d_op = &autofs4_root_dentry_operations;
-
-		/*
-		 * And we need to ensure that the same dentry is used for
-		 * all following lookup calls until it is hashed so that
-		 * the dentry flags are persistent throughout the request.
-		 */
-		ino = autofs4_init_ino(NULL, sbi, 0555);
-		if (!ino)
-			return ERR_PTR(-ENOMEM);
-
-		dentry->d_fsdata = ino;
-		ino->dentry = dentry;
-
-		spin_lock(&sbi->lookup_lock);
-		list_add(&ino->active, &sbi->active_list);
-		spin_unlock(&sbi->lookup_lock);
-
-		d_instantiate(dentry, NULL);
+	spin_lock(&sbi->fs_lock);
+	active = autofs4_lookup_active(dentry);
+	if (active) {
+		dentry = active;
+		ino = autofs4_dentry_ino(dentry);
+		/* If this came from revalidate, rehash it */
+		autofs4_revalidate_rehash(dentry);
+		spin_unlock(&sbi->fs_lock);
+	} else {
+		spin_unlock(&sbi->fs_lock);
+		ino = init_new_dentry(sbi, dentry, oz_mode);
+		if (IS_ERR(ino))
+			return (struct dentry *) ino;
 	}
 
+	autofs4_add_active(dentry);
+
 	if (!oz_mode) {
+		expiring = autofs4_lookup_expiring(dentry);
 		mutex_unlock(&dir->i_mutex);
-		expiring = autofs4_lookup_expiring(sbi,
-						   dentry->d_parent,
-						   &dentry->d_name);
 		if (expiring) {
 			/*
 			 * If we are racing with expire the request might not
 			 * be quite complete but the directory has been removed
 			 * so it must have been successful, so just wait for it.
 			 */
-			ino = autofs4_dentry_ino(expiring);
 			autofs4_expire_wait(expiring);
-			spin_lock(&sbi->lookup_lock);
-			if (!list_empty(&ino->expiring))
-				list_del_init(&ino->expiring);
-			spin_unlock(&sbi->lookup_lock);
 			dput(expiring);
 		}
-
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags |= DCACHE_AUTOFS_PENDING;
-		spin_unlock(&dentry->d_lock);
-		if (dentry->d_op && dentry->d_op->d_revalidate)
-			(dentry->d_op->d_revalidate)(dentry, nd);
+		status = try_to_fill_dentry(dentry);
 		mutex_lock(&dir->i_mutex);
+		spin_lock(&sbi->fs_lock);
+		ino->flags &= ~AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
 	}
 
+	autofs4_del_active(dentry);
+
 	/*
-	 * If we are still pending, check if we had to handle
+	 * If we had a mount fail, check if we had to handle
 	 * a signal. If so we can force a restart..
 	 */
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
+	if (status) {
 		/* See if we were interrupted */
 		if (signal_pending(current)) {
 			sigset_t *sigset = &current->pending.signal;
 			if (sigismember (sigset, SIGKILL) ||
 			    sigismember (sigset, SIGQUIT) ||
 			    sigismember (sigset, SIGINT)) {
-			    if (unhashed)
-				dput(unhashed);
+			    if (active)
+				dput(active);
 			    return ERR_PTR(-ERESTARTNOINTR);
 			}
 		}
-		if (!oz_mode) {
-			spin_lock(&dentry->d_lock);
-			dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-			spin_unlock(&dentry->d_lock);
+	}
+
+	/*
+	 * User space can (and has done in the past) remove and re-create
+	 * this directory during the callback. This can leave us with an
+	 * unhashed dentry, but a successful mount!  So we need to
+	 * perform another cached lookup in case the dentry now exists.
+	 */
+	if (!oz_mode && !have_submounts(dentry)) {
+		struct dentry *new;
+		new = d_lookup(dentry->d_parent, &dentry->d_name);
+		if (new) {
+			if (active)
+				dput(active);
+			return new;
+		} else {
+			if (!status)
+				status = -ENOENT;
 		}
 	}
 
 	/*
-	 * If this dentry is unhashed, then we shouldn't honour this
-	 * lookup.  Returning ENOENT here doesn't do the right thing
-	 * for all system calls, but it should be OK for the operations
-	 * we permit from an autofs.
+	 * If we had a mount failure, return status to user space.
+	 * If the mount succeeded and we used a dentry from the active queue
+	 * return it.
 	 */
-	if (!oz_mode && d_unhashed(dentry)) {
+	if (status) {
+		dentry = ERR_PTR(status);
+		if (active)
+			dput(active);
+		return dentry;
+	} else {
 		/*
-		 * A user space application can (and has done in the past)
-		 * remove and re-create this directory during the callback.
-		 * This can leave us with an unhashed dentry, but a
-		 * successful mount!  So we need to perform another
-		 * cached lookup in case the dentry now exists.
+		 * Valid successful mount, return active dentry or NULL
+		 * for a new dentry.
 		 */
-		struct dentry *parent = dentry->d_parent;
-		struct dentry *new = d_lookup(parent, &dentry->d_name);
-		if (new != NULL)
-			dentry = new;
-		else
-			dentry = ERR_PTR(-ENOENT);
-
-		if (unhashed)
-			dput(unhashed);
-
-		return dentry;
+		if (active)
+			return active;
 	}
 
-	if (unhashed)
-		return unhashed;
-
 	return NULL;
 }
 
@@ -624,11 +834,6 @@ static int autofs4_dir_symlink(struct inode *dir,
 	if (!ino)
 		return -ENOMEM;
 
-	spin_lock(&sbi->lookup_lock);
-	if (!list_empty(&ino->active))
-		list_del_init(&ino->active);
-	spin_unlock(&sbi->lookup_lock);
-
 	ino->size = strlen(symname);
 	cp = kmalloc(ino->size + 1, GFP_KERNEL);
 	if (!cp) {
@@ -705,10 +910,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	dir->i_mtime = CURRENT_TIME;
 
 	spin_lock(&dcache_lock);
-	spin_lock(&sbi->lookup_lock);
-	if (list_empty(&ino->expiring))
-		list_add(&ino->expiring, &sbi->expiring_list);
-	spin_unlock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -734,10 +935,6 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 		spin_unlock(&dcache_lock);
 		return -ENOTEMPTY;
 	}
-	spin_lock(&sbi->lookup_lock);
-	if (list_empty(&ino->expiring))
-		list_add(&ino->expiring, &sbi->expiring_list);
-	spin_unlock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -775,11 +972,6 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (!ino)
 		return -ENOMEM;
 
-	spin_lock(&sbi->lookup_lock);
-	if (!list_empty(&ino->active))
-		list_del_init(&ino->active);
-	spin_unlock(&sbi->lookup_lock);
-
 	inode = autofs4_get_inode(dir->i_sb, ino);
 	if (!inode) {
 		if (!dentry->d_fsdata)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b9b3bb51b1e..97b6e9efeb7 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -44,7 +44,7 @@ static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
  * If we don't support core dumping, then supply a NULL so we
  * don't even try.
  */
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+#ifdef CONFIG_ELF_CORE
 static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
 #else
 #define elf_core_dump	NULL
@@ -767,7 +767,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	
 	current->mm->start_stack = bprm->p;
 
-	/* Now we do a little grungy work by mmaping the ELF image into
+	/* Now we do a little grungy work by mmapping the ELF image into
 	   the correct location in memory. */
 	for(i = 0, elf_ppnt = elf_phdata;
 	    i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
@@ -1101,12 +1101,7 @@ out:
 	return error;
 }
 
-/*
- * Note that some platforms still use traditional core dumps and not
- * the ELF core dump.  Each platform can select it as appropriate.
- */
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
-
+#ifdef CONFIG_ELF_CORE
 /*
  * ELF core dumper
  *
@@ -2063,7 +2058,7 @@ out:
 	return has_dumped;
 }
 
-#endif		/* USE_ELF_CORE_DUMP */
+#endif		/* CONFIG_ELF_CORE */
 
 static int __init init_elf_binfmt(void)
 {
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 38502c67987..7b055385db8 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -75,14 +75,14 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *,
 static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *,
 					     struct file *, struct mm_struct *);
 
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+#ifdef CONFIG_ELF_CORE
 static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *, unsigned long limit);
 #endif
 
 static struct linux_binfmt elf_fdpic_format = {
 	.module		= THIS_MODULE,
 	.load_binary	= load_elf_fdpic_binary,
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+#ifdef CONFIG_ELF_CORE
 	.core_dump	= elf_fdpic_core_dump,
 #endif
 	.min_coredump	= ELF_EXEC_PAGESIZE,
@@ -380,7 +380,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 	down_write(&current->mm->mmap_sem);
 	current->mm->start_brk = do_mmap(NULL, 0, stack_size,
 					 PROT_READ | PROT_WRITE | PROT_EXEC,
-					 MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN,
+					 MAP_PRIVATE | MAP_ANONYMOUS |
+					 MAP_UNINITIALIZED | MAP_GROWSDOWN,
 					 0);
 
 	if (IS_ERR_VALUE(current->mm->start_brk)) {
@@ -1200,7 +1201,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
  *
  * Modelled on fs/binfmt_elf.c core dumper
  */
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+#ifdef CONFIG_ELF_CORE
 
 /*
  * These are the only things you should do on a core-file: use only these
@@ -1825,4 +1826,4 @@ cleanup:
 #undef NUM_NOTES
 }
 
-#endif		/* USE_ELF_CORE_DUMP */
+#endif		/* CONFIG_ELF_CORE */
diff --git a/fs/bio.c b/fs/bio.c
index 12da5db8682..76e6713abf9 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -272,7 +272,7 @@ EXPORT_SYMBOL(bio_init);
  *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
  *   fall back to just using @kmalloc to allocate the required memory.
  *
- *   Note that the caller must set ->bi_destructor on succesful return
+ *   Note that the caller must set ->bi_destructor on successful return
  *   of a bio, to do the appropriate freeing of the bio once the reference
  *   count drops to zero.
  **/
@@ -1393,6 +1393,18 @@ void bio_check_pages_dirty(struct bio *bio)
 	}
 }
 
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+void bio_flush_dcache_pages(struct bio *bi)
+{
+	int i;
+	struct bio_vec *bvec;
+
+	bio_for_each_segment(bvec, bi, i)
+		flush_dcache_page(bvec->bv_page);
+}
+EXPORT_SYMBOL(bio_flush_dcache_pages);
+#endif
+
 /**
  * bio_endio - end I/O on a bio
  * @bio:	bio
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8bed0557d88..73d6a735b8f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -405,7 +405,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
  
 static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
 {
-	return sync_blockdev(I_BDEV(filp->f_mapping->host));
+	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+	int error;
+
+	error = sync_blockdev(bdev);
+	if (error)
+		return error;
+	
+	error = blkdev_issue_flush(bdev, NULL);
+	if (error == -EOPNOTSUPP)
+		error = 0;
+	return error;
 }
 
 /*
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e57..402afe0a0bf 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,7 @@ config BTRFS_FS
 	select LIBCRC32C
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
+	select FS_JOURNAL_INFO
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ccbdcb54ec5..46bea0f4dc7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -256,7 +256,7 @@ out:
  * Insert @em into @tree or perform a simple forward/backward merge with
  * existing mappings.  The extent_map struct passed in will be inserted
  * into the tree directly, with an additional reference taken, or a
- * reference dropped if the merge attempt was sucessfull.
+ * reference dropped if the merge attempt was successfull.
  */
 int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 06550affbd2..77f759302e1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -909,7 +909,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	unsigned long last_index;
 	int will_write;
 
-	will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
+	will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
 		      (file->f_flags & O_DIRECT));
 
 	nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
@@ -1076,7 +1076,7 @@ out_nolock:
 		if (err)
 			num_written = err;
 
-		if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+		if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
 			trans = btrfs_start_transaction(root, 1);
 			ret = btrfs_log_dentry_safe(trans, root,
 						    file->f_dentry);
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 4618516dd99..c2413561ea7 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -21,6 +21,7 @@
 #include <linux/mount.h>
 #include <linux/statfs.h>
 #include <linux/ctype.h>
+#include <linux/string.h>
 #include <linux/fs_struct.h>
 #include "internal.h"
 
@@ -257,8 +258,7 @@ static ssize_t cachefiles_daemon_write(struct file *file,
 		if (args == data)
 			goto error;
 		*args = '\0';
-		for (args++; isspace(*args); args++)
-			continue;
+		args = skip_spaces(++args);
 	}
 
 	/* run the appropriate command handler */
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 431accd475a..27089311fbe 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -114,8 +114,9 @@ nomem_lookup_data:
 
 /*
  * attempt to look up the nominated node in this cache
+ * - return -ETIMEDOUT to be scheduled again
  */
-static void cachefiles_lookup_object(struct fscache_object *_object)
+static int cachefiles_lookup_object(struct fscache_object *_object)
 {
 	struct cachefiles_lookup_data *lookup_data;
 	struct cachefiles_object *parent, *object;
@@ -145,13 +146,15 @@ static void cachefiles_lookup_object(struct fscache_object *_object)
 	    object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
 		cachefiles_attr_changed(&object->fscache);
 
-	if (ret < 0) {
-		printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
-		       ret);
+	if (ret < 0 && ret != -ETIMEDOUT) {
+		if (ret != -ENOBUFS)
+			printk(KERN_WARNING
+			       "CacheFiles: Lookup failed error %d\n", ret);
 		fscache_object_lookup_error(&object->fscache);
 	}
 
 	_leave(" [%d]", ret);
+	return ret;
 }
 
 /*
@@ -331,6 +334,7 @@ static void cachefiles_put_object(struct fscache_object *_object)
 		}
 
 		cache = object->fscache.cache;
+		fscache_object_destroy(&object->fscache);
 		kmem_cache_free(cachefiles_object_jar, object);
 		fscache_object_destroyed(cache);
 	}
@@ -403,12 +407,26 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
 	if (oi_size == ni_size)
 		return 0;
 
-	newattrs.ia_size = ni_size;
-	newattrs.ia_valid = ATTR_SIZE;
-
 	cachefiles_begin_secure(cache, &saved_cred);
 	mutex_lock(&object->backer->d_inode->i_mutex);
+
+	/* if there's an extension to a partial page at the end of the backing
+	 * file, we need to discard the partial page so that we pick up new
+	 * data after it */
+	if (oi_size & ~PAGE_MASK && ni_size > oi_size) {
+		_debug("discard tail %llx", oi_size);
+		newattrs.ia_valid = ATTR_SIZE;
+		newattrs.ia_size = oi_size & PAGE_MASK;
+		ret = notify_change(object->backer, &newattrs);
+		if (ret < 0)
+			goto truncate_failed;
+	}
+
+	newattrs.ia_valid = ATTR_SIZE;
+	newattrs.ia_size = ni_size;
 	ret = notify_change(object->backer, &newattrs);
+
+truncate_failed:
 	mutex_unlock(&object->backer->d_inode->i_mutex);
 	cachefiles_end_secure(cache, saved_cred);
 
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 4ce818ae39e..14ac4806e29 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -21,17 +21,81 @@
 #include <linux/security.h>
 #include "internal.h"
 
-static int cachefiles_wait_bit(void *flags)
+#define CACHEFILES_KEYBUF_SIZE 512
+
+/*
+ * dump debugging info about an object
+ */
+static noinline
+void __cachefiles_printk_object(struct cachefiles_object *object,
+				const char *prefix,
+				u8 *keybuf)
 {
-	schedule();
-	return 0;
+	struct fscache_cookie *cookie;
+	unsigned keylen, loop;
+
+	printk(KERN_ERR "%sobject: OBJ%x\n",
+	       prefix, object->fscache.debug_id);
+	printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n",
+	       prefix, fscache_object_states[object->fscache.state],
+	       object->fscache.flags, object->fscache.work.flags,
+	       object->fscache.events,
+	       object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
+	printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
+	       prefix, object->fscache.n_ops, object->fscache.n_in_progress,
+	       object->fscache.n_exclusive);
+	printk(KERN_ERR "%sparent=%p\n",
+	       prefix, object->fscache.parent);
+
+	spin_lock(&object->fscache.lock);
+	cookie = object->fscache.cookie;
+	if (cookie) {
+		printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n",
+		       prefix,
+		       object->fscache.cookie,
+		       object->fscache.cookie->parent,
+		       object->fscache.cookie->netfs_data,
+		       object->fscache.cookie->flags);
+		if (keybuf)
+			keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
+						      CACHEFILES_KEYBUF_SIZE);
+		else
+			keylen = 0;
+	} else {
+		printk(KERN_ERR "%scookie=NULL\n", prefix);
+		keylen = 0;
+	}
+	spin_unlock(&object->fscache.lock);
+
+	if (keylen) {
+		printk(KERN_ERR "%skey=[%u] '", prefix, keylen);
+		for (loop = 0; loop < keylen; loop++)
+			printk("%02x", keybuf[loop]);
+		printk("'\n");
+	}
+}
+
+/*
+ * dump debugging info about a pair of objects
+ */
+static noinline void cachefiles_printk_object(struct cachefiles_object *object,
+					      struct cachefiles_object *xobject)
+{
+	u8 *keybuf;
+
+	keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO);
+	if (object)
+		__cachefiles_printk_object(object, "", keybuf);
+	if (xobject)
+		__cachefiles_printk_object(xobject, "x", keybuf);
+	kfree(keybuf);
 }
 
 /*
  * record the fact that an object is now active
  */
-static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
-					  struct cachefiles_object *object)
+static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
+					 struct cachefiles_object *object)
 {
 	struct cachefiles_object *xobject;
 	struct rb_node **_p, *_parent = NULL;
@@ -42,8 +106,11 @@ static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
 try_again:
 	write_lock(&cache->active_lock);
 
-	if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
+	if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
+		printk(KERN_ERR "CacheFiles: Error: Object already active\n");
+		cachefiles_printk_object(object, NULL);
 		BUG();
+	}
 
 	dentry = object->dentry;
 	_p = &cache->active_nodes.rb_node;
@@ -66,8 +133,8 @@ try_again:
 	rb_insert_color(&object->active_node, &cache->active_nodes);
 
 	write_unlock(&cache->active_lock);
-	_leave("");
-	return;
+	_leave(" = 0");
+	return 0;
 
 	/* an old object from a previous incarnation is hogging the slot - we
 	 * need to wait for it to be destroyed */
@@ -76,44 +143,70 @@ wait_for_old_object:
 		printk(KERN_ERR "\n");
 		printk(KERN_ERR "CacheFiles: Error:"
 		       " Unexpected object collision\n");
-		printk(KERN_ERR "xobject: OBJ%x\n",
-		       xobject->fscache.debug_id);
-		printk(KERN_ERR "xobjstate=%s\n",
-		       fscache_object_states[xobject->fscache.state]);
-		printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
-		printk(KERN_ERR "xobjevent=%lx [%lx]\n",
-		       xobject->fscache.events, xobject->fscache.event_mask);
-		printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
-		       xobject->fscache.n_ops, xobject->fscache.n_in_progress,
-		       xobject->fscache.n_exclusive);
-		printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
-		       xobject->fscache.cookie,
-		       xobject->fscache.cookie->parent,
-		       xobject->fscache.cookie->netfs_data,
-		       xobject->fscache.cookie->flags);
-		printk(KERN_ERR "xparent=%p\n",
-		       xobject->fscache.parent);
-		printk(KERN_ERR "object: OBJ%x\n",
-		       object->fscache.debug_id);
-		printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
-		       object->fscache.cookie,
-		       object->fscache.cookie->parent,
-		       object->fscache.cookie->netfs_data,
-		       object->fscache.cookie->flags);
-		printk(KERN_ERR "parent=%p\n",
-		       object->fscache.parent);
+		cachefiles_printk_object(object, xobject);
 		BUG();
 	}
 	atomic_inc(&xobject->usage);
 	write_unlock(&cache->active_lock);
 
-	_debug(">>> wait");
-	wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
-		    cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
-	_debug("<<< waited");
+	if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
+		wait_queue_head_t *wq;
+
+		signed long timeout = 60 * HZ;
+		wait_queue_t wait;
+		bool requeue;
+
+		/* if the object we're waiting for is queued for processing,
+		 * then just put ourselves on the queue behind it */
+		if (slow_work_is_queued(&xobject->fscache.work)) {
+			_debug("queue OBJ%x behind OBJ%x immediately",
+			       object->fscache.debug_id,
+			       xobject->fscache.debug_id);
+			goto requeue;
+		}
+
+		/* otherwise we sleep until either the object we're waiting for
+		 * is done, or the slow-work facility wants the thread back to
+		 * do other work */
+		wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
+		init_wait(&wait);
+		requeue = false;
+		do {
+			prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+			if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
+				break;
+			requeue = slow_work_sleep_till_thread_needed(
+				&object->fscache.work, &timeout);
+		} while (timeout > 0 && !requeue);
+		finish_wait(wq, &wait);
+
+		if (requeue &&
+		    test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
+			_debug("queue OBJ%x behind OBJ%x after wait",
+			       object->fscache.debug_id,
+			       xobject->fscache.debug_id);
+			goto requeue;
+		}
+
+		if (timeout <= 0) {
+			printk(KERN_ERR "\n");
+			printk(KERN_ERR "CacheFiles: Error: Overlong"
+			       " wait for old active object to go away\n");
+			cachefiles_printk_object(object, xobject);
+			goto requeue;
+		}
+	}
+
+	ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
 
 	cache->cache.ops->put_object(&xobject->fscache);
 	goto try_again;
+
+requeue:
+	clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
+	cache->cache.ops->put_object(&xobject->fscache);
+	_leave(" = -ETIMEDOUT");
+	return -ETIMEDOUT;
 }
 
 /*
@@ -254,7 +347,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
 
 	dir = dget_parent(object->dentry);
 
-	mutex_lock(&dir->d_inode->i_mutex);
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
 	ret = cachefiles_bury_object(cache, dir, object->dentry);
 
 	dput(dir);
@@ -307,7 +400,7 @@ lookup_again:
 	/* search the current directory for the element name */
 	_debug("lookup '%s'", name);
 
-	mutex_lock(&dir->d_inode->i_mutex);
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
 
 	start = jiffies;
 	next = lookup_one_len(name, dir, nlen);
@@ -418,12 +511,15 @@ lookup_again:
 	}
 
 	/* note that we're now using this object */
-	cachefiles_mark_object_active(cache, object);
+	ret = cachefiles_mark_object_active(cache, object);
 
 	mutex_unlock(&dir->d_inode->i_mutex);
 	dput(dir);
 	dir = NULL;
 
+	if (ret == -ETIMEDOUT)
+		goto mark_active_timed_out;
+
 	_debug("=== OBTAINED_OBJECT ===");
 
 	if (object->new) {
@@ -467,6 +563,10 @@ create_error:
 		cachefiles_io_error(cache, "Create/mkdir failed");
 	goto error;
 
+mark_active_timed_out:
+	_debug("mark active timed out");
+	goto release_dentry;
+
 check_error:
 	_debug("check error %d", ret);
 	write_lock(&cache->active_lock);
@@ -474,7 +574,7 @@ check_error:
 	clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
 	wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
 	write_unlock(&cache->active_lock);
-
+release_dentry:
 	dput(object->dentry);
 	object->dentry = NULL;
 	goto error_out;
@@ -495,9 +595,6 @@ error:
 error_out2:
 	dput(dir);
 error_out:
-	if (ret == -ENOSPC)
-		ret = -ENOBUFS;
-
 	_leave(" = error %d", -ret);
 	return ret;
 }
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index a69787e7dd9..a6c8c6fe8df 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -11,6 +11,7 @@
 
 #include <linux/mount.h>
 #include <linux/file.h>
+#include <linux/ima.h>
 #include "internal.h"
 
 /*
@@ -40,8 +41,10 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
 
 	_debug("--- monitor %p %lx ---", page, page->flags);
 
-	if (!PageUptodate(page) && !PageError(page))
-		dump_stack();
+	if (!PageUptodate(page) && !PageError(page)) {
+		/* unlocked, not uptodate and not erronous? */
+		_debug("page probably truncated");
+	}
 
 	/* remove from the waitqueue */
 	list_del(&wait->task_list);
@@ -61,6 +64,84 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
 }
 
 /*
+ * handle a probably truncated page
+ * - check to see if the page is still relevant and reissue the read if
+ *   possible
+ * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we
+ *   must wait again and 0 if successful
+ */
+static int cachefiles_read_reissue(struct cachefiles_object *object,
+				   struct cachefiles_one_read *monitor)
+{
+	struct address_space *bmapping = object->backer->d_inode->i_mapping;
+	struct page *backpage = monitor->back_page, *backpage2;
+	int ret;
+
+	kenter("{ino=%lx},{%lx,%lx}",
+	       object->backer->d_inode->i_ino,
+	       backpage->index, backpage->flags);
+
+	/* skip if the page was truncated away completely */
+	if (backpage->mapping != bmapping) {
+		kleave(" = -ENODATA [mapping]");
+		return -ENODATA;
+	}
+
+	backpage2 = find_get_page(bmapping, backpage->index);
+	if (!backpage2) {
+		kleave(" = -ENODATA [gone]");
+		return -ENODATA;
+	}
+
+	if (backpage != backpage2) {
+		put_page(backpage2);
+		kleave(" = -ENODATA [different]");
+		return -ENODATA;
+	}
+
+	/* the page is still there and we already have a ref on it, so we don't
+	 * need a second */
+	put_page(backpage2);
+
+	INIT_LIST_HEAD(&monitor->op_link);
+	add_page_wait_queue(backpage, &monitor->monitor);
+
+	if (trylock_page(backpage)) {
+		ret = -EIO;
+		if (PageError(backpage))
+			goto unlock_discard;
+		ret = 0;
+		if (PageUptodate(backpage))
+			goto unlock_discard;
+
+		kdebug("reissue read");
+		ret = bmapping->a_ops->readpage(NULL, backpage);
+		if (ret < 0)
+			goto unlock_discard;
+	}
+
+	/* but the page may have been read before the monitor was installed, so
+	 * the monitor may miss the event - so we have to ensure that we do get
+	 * one in such a case */
+	if (trylock_page(backpage)) {
+		_debug("jumpstart %p {%lx}", backpage, backpage->flags);
+		unlock_page(backpage);
+	}
+
+	/* it'll reappear on the todo list */
+	kleave(" = -EINPROGRESS");
+	return -EINPROGRESS;
+
+unlock_discard:
+	unlock_page(backpage);
+	spin_lock_irq(&object->work_lock);
+	list_del(&monitor->op_link);
+	spin_unlock_irq(&object->work_lock);
+	kleave(" = %d", ret);
+	return ret;
+}
+
+/*
  * copy data from backing pages to netfs pages to complete a read operation
  * - driven by FS-Cache's thread pool
  */
@@ -92,20 +173,26 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
 
 		_debug("- copy {%lu}", monitor->back_page->index);
 
-		error = -EIO;
+	recheck:
 		if (PageUptodate(monitor->back_page)) {
 			copy_highpage(monitor->netfs_page, monitor->back_page);
 
 			pagevec_add(&pagevec, monitor->netfs_page);
 			fscache_mark_pages_cached(monitor->op, &pagevec);
 			error = 0;
-		}
-
-		if (error)
+		} else if (!PageError(monitor->back_page)) {
+			/* the page has probably been truncated */
+			error = cachefiles_read_reissue(object, monitor);
+			if (error == -EINPROGRESS)
+				goto next;
+			goto recheck;
+		} else {
 			cachefiles_io_error_obj(
 				object,
 				"Readpage failed on backing file %lx",
 				(unsigned long) monitor->back_page->flags);
+			error = -EIO;
+		}
 
 		page_cache_release(monitor->back_page);
 
@@ -114,6 +201,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
 		fscache_put_retrieval(op);
 		kfree(monitor);
 
+	next:
 		/* let the thread pool have some air occasionally */
 		max--;
 		if (max < 0 || need_resched()) {
@@ -333,7 +421,8 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 
 	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
 
-	op->op.flags = FSCACHE_OP_FAST;
+	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
+	op->op.flags |= FSCACHE_OP_FAST;
 	op->op.processor = cachefiles_read_copier;
 
 	pagevec_init(&pagevec, 0);
@@ -639,7 +728,8 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
 
 	pagevec_init(&pagevec, 0);
 
-	op->op.flags = FSCACHE_OP_FAST;
+	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
+	op->op.flags |= FSCACHE_OP_FAST;
 	op->op.processor = cachefiles_read_copier;
 
 	INIT_LIST_HEAD(&backpages);
@@ -801,7 +891,8 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
 	struct cachefiles_cache *cache;
 	mm_segment_t old_fs;
 	struct file *file;
-	loff_t pos;
+	loff_t pos, eof;
+	size_t len;
 	void *data;
 	int ret;
 
@@ -832,18 +923,33 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
 	if (IS_ERR(file)) {
 		ret = PTR_ERR(file);
 	} else {
+		ima_counts_get(file);
 		ret = -EIO;
 		if (file->f_op->write) {
 			pos = (loff_t) page->index << PAGE_SHIFT;
+
+			/* we mustn't write more data than we have, so we have
+			 * to beware of a partial page at EOF */
+			eof = object->fscache.store_limit_l;
+			len = PAGE_SIZE;
+			if (eof & ~PAGE_MASK) {
+				ASSERTCMP(pos, <, eof);
+				if (eof - pos < PAGE_SIZE) {
+					_debug("cut short %llx to %llx",
+					       pos, eof);
+					len = eof - pos;
+					ASSERTCMP(pos + len, ==, eof);
+				}
+			}
+
 			data = kmap(page);
 			old_fs = get_fs();
 			set_fs(KERNEL_DS);
 			ret = file->f_op->write(
-				file, (const void __user *) data, PAGE_SIZE,
-				&pos);
+				file, (const void __user *) data, len, &pos);
 			set_fs(old_fs);
 			kunmap(page);
-			if (ret != PAGE_SIZE)
+			if (ret != len)
 				ret = -EIO;
 		}
 		fput(file);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 145540a316a..094ea65afc8 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,12 @@
+Version 1.61
+------------
+Fix append problem to Samba servers (files opened with O_APPEND could
+have duplicated data). Fix oops in cifs_lookup. Workaround problem
+mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
+Disable use of server inode numbers when server only
+partially supports them (e.g. for one server querying inode numbers on
+FindFirst fails but QPathInfo queries works).
+
 Version 1.60
 -------------
 Fix memory leak in reconnect.  Fix oops in DFS mount error path.
diff --git a/fs/cifs/README b/fs/cifs/README
index 79c1a93400b..a727b7cb075 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -423,7 +423,7 @@ A partial list of the supported mount options follows:
 		source name to use to represent the client netbios machine 
 		name when doing the RFC1001 netbios session initialize.
   direct        Do not do inode data caching on files opened on this mount.
-		This precludes mmaping files on this mount. In some cases
+		This precludes mmapping files on this mount. In some cases
 		with fast networks and little or no caching benefits on the
 		client (e.g. when the application is doing large sequential
 		reads bigger than page size without rereading the same data) 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 9a5e4f5f312..29f1da761bb 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1037,7 +1037,7 @@ init_cifs(void)
 	if (rc)
 		goto out_unregister_key_type;
 #endif
-	rc = slow_work_register_user();
+	rc = slow_work_register_user(THIS_MODULE);
 	if (rc)
 		goto out_unregister_resolver_key;
 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5d0fde18039..4b35f7ec058 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -39,7 +39,7 @@
 
 /*
  * MAX_REQ is the maximum number of requests that WE will send
- * on one socket concurently. It also matches the most common
+ * on one socket concurrently. It also matches the most common
  * value of max multiplex returned by servers.  We may
  * eventually want to use the negotiated value (in case
  * future servers can handle more) when we are more confident that
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 2d07f890a84..3877737f96a 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1227,7 +1227,7 @@ typedef struct smb_com_setattr_rsp {
 /* empty wct response to setattr */
 
 /*******************************************************/
-/* NT Transact structure defintions follow             */
+/* NT Transact structure definitions follow            */
 /* Currently only ioctl, acl (get security descriptor) */
 /* and notify are implemented                          */
 /*******************************************************/
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 627a60a6c1b..6ccf7262d1b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -214,9 +214,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 		posix_flags |= SMB_O_EXCL;
 	if (oflags & O_TRUNC)
 		posix_flags |= SMB_O_TRUNC;
-	if (oflags & O_APPEND)
-		posix_flags |= SMB_O_APPEND;
-	if (oflags & O_SYNC)
+	/* be safe and imply O_SYNC for O_DSYNC */
+	if (oflags & O_DSYNC)
 		posix_flags |= SMB_O_SYNC;
 	if (oflags & O_DIRECTORY)
 		posix_flags |= SMB_O_DIRECTORY;
@@ -643,9 +642,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	 * O_EXCL: optimize away the lookup, but don't hash the dentry. Let
 	 * the VFS handle the create.
 	 */
-	if (nd->flags & LOOKUP_EXCL) {
+	if (nd && (nd->flags & LOOKUP_EXCL)) {
 		d_instantiate(direntry, NULL);
-		return 0;
+		return NULL;
 	}
 
 	/* can not grab the rename sem here since it would
@@ -675,7 +674,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	 * reduction in network traffic in the other paths.
 	 */
 	if (pTcon->unix_ext) {
-		if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
+		if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
 		     (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
 		     (nd->intent.open.flags & O_CREAT)) {
 			rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 429337eb7af..057e1dae12a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -76,8 +76,10 @@ static inline fmode_t cifs_posix_convert_flags(unsigned int flags)
 	   reopening a file.  They had their effect on the original open */
 	if (flags & O_APPEND)
 		posix_flags |= (fmode_t)O_APPEND;
-	if (flags & O_SYNC)
-		posix_flags |= (fmode_t)O_SYNC;
+	if (flags & O_DSYNC)
+		posix_flags |= (fmode_t)O_DSYNC;
+	if (flags & __O_SYNC)
+		posix_flags |= (fmode_t)__O_SYNC;
 	if (flags & O_DIRECTORY)
 		posix_flags |= (fmode_t)O_DIRECTORY;
 	if (flags & O_NOFOLLOW)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index cababd8a52d..cf18ee76559 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -914,8 +914,8 @@ undo_setattr:
 /*
  * If dentry->d_inode is null (usually meaning the cached dentry
  * is a negative dentry) then we would attempt a standard SMB delete, but
- * if that fails we can not attempt the fall back mechanisms on EACESS
- * but will return the EACESS to the caller.  Note that the VFS does not call
+ * if that fails we can not attempt the fall back mechanisms on EACCESS
+ * but will return the EACCESS to the caller. Note that the VFS does not call
  * unlink on negative dentries currently.
  */
 int cifs_unlink(struct inode *dir, struct dentry *dentry)
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index 224a1f47896..b6b6dcb500b 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -371,7 +371,7 @@ E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
 	smbhash(p24 + 16, c8, p21 + 14, 1);
 }
 
-#if 0 /* currently unsued */
+#if 0 /* currently unused */
 static void
 D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
 {
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 43c96ce2961..c6405ce3c50 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -17,28 +17,25 @@ static struct ctl_table_header *fs_table_header;
 
 static ctl_table coda_table[] = {
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "timeout",
 		.data		= &coda_timeout,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= proc_dointvec
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "hard",
 		.data		= &coda_hard,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= proc_dointvec
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "fake_statfs",
 		.data		= &coda_fake_statfs,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= proc_dointvec
 	},
 	{}
 };
@@ -46,7 +43,6 @@ static ctl_table coda_table[] = {
 #ifdef CONFIG_SYSCTL
 static ctl_table fs_table[] = {
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "coda",
 		.mode		= 0555,
 		.child		= coda_table
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d84e7058c29..14cbc831422 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -111,43 +111,40 @@
 #include <linux/dvb/frontend.h>
 #include <linux/dvb/video.h>
 
+#include <linux/sort.h>
+
 #ifdef CONFIG_SPARC
 #include <asm/fbio.h>
 #endif
 
-static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd,
-			      unsigned long arg, struct file *f)
-{
-	return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
-}
-
-static int w_long(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int w_long(unsigned int fd, unsigned int cmd,
+		compat_ulong_t __user *argp)
 {
 	mm_segment_t old_fs = get_fs();
 	int err;
 	unsigned long val;
-	
+
 	set_fs (KERNEL_DS);
 	err = sys_ioctl(fd, cmd, (unsigned long)&val);
 	set_fs (old_fs);
-	if (!err && put_user(val, (u32 __user *)compat_ptr(arg)))
+	if (!err && put_user(val, argp))
 		return -EFAULT;
 	return err;
 }
- 
-static int rw_long(unsigned int fd, unsigned int cmd, unsigned long arg)
+
+static int rw_long(unsigned int fd, unsigned int cmd,
+		compat_ulong_t __user *argp)
 {
 	mm_segment_t old_fs = get_fs();
-	u32 __user *argptr = compat_ptr(arg);
 	int err;
 	unsigned long val;
-	
-	if(get_user(val, argptr))
+
+	if(get_user(val, argp))
 		return -EFAULT;
 	set_fs (KERNEL_DS);
 	err = sys_ioctl(fd, cmd, (unsigned long)&val);
 	set_fs (old_fs);
-	if (!err && put_user(val, argptr))
+	if (!err && put_user(val, argp))
 		return -EFAULT;
 	return err;
 }
@@ -161,7 +158,8 @@ struct compat_video_event {
 	} u;
 };
 
-static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int do_video_get_event(unsigned int fd, unsigned int cmd,
+		struct compat_video_event __user *up)
 {
 	struct video_event kevent;
 	mm_segment_t old_fs = get_fs();
@@ -172,8 +170,6 @@ static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long a
 	set_fs(old_fs);
 
 	if (!err) {
-		struct compat_video_event __user *up = compat_ptr(arg);
-
 		err  = put_user(kevent.type, &up->type);
 		err |= put_user(kevent.timestamp, &up->timestamp);
 		err |= put_user(kevent.u.size.w, &up->u.size.w);
@@ -192,15 +188,14 @@ struct compat_video_still_picture {
         int32_t size;
 };
 
-static int do_video_stillpicture(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
+	struct compat_video_still_picture __user *up)
 {
-	struct compat_video_still_picture __user *up;
 	struct video_still_picture __user *up_native;
 	compat_uptr_t fp;
 	int32_t size;
 	int err;
 
-	up = (struct compat_video_still_picture __user *) arg;
 	err  = get_user(fp, &up->iFrame);
 	err |= get_user(size, &up->size);
 	if (err)
@@ -224,14 +219,13 @@ struct compat_video_spu_palette {
 	compat_uptr_t palette;
 };
 
-static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
+		struct compat_video_spu_palette __user *up)
 {
-	struct compat_video_spu_palette __user *up;
 	struct video_spu_palette __user *up_native;
 	compat_uptr_t palp;
 	int length, err;
 
-	up = (struct compat_video_spu_palette __user *) arg;
 	err  = get_user(palp, &up->palette);
 	err |= get_user(length, &up->length);
 
@@ -246,428 +240,6 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned
 	return err;
 }
 
-#ifdef CONFIG_NET
-static int do_siocgstamp(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct compat_timeval __user *up = compat_ptr(arg);
-	struct timeval ktv;
-	mm_segment_t old_fs = get_fs();
-	int err;
-
-	set_fs(KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long)&ktv);
-	set_fs(old_fs);
-	if(!err) {
-		err = put_user(ktv.tv_sec, &up->tv_sec);
-		err |= __put_user(ktv.tv_usec, &up->tv_usec);
-	}
-	return err;
-}
-
-static int do_siocgstampns(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct compat_timespec __user *up = compat_ptr(arg);
-	struct timespec kts;
-	mm_segment_t old_fs = get_fs();
-	int err;
-
-	set_fs(KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long)&kts);
-	set_fs(old_fs);
-	if (!err) {
-		err = put_user(kts.tv_sec, &up->tv_sec);
-		err |= __put_user(kts.tv_nsec, &up->tv_nsec);
-	}
-	return err;
-}
-
-struct ifmap32 {
-	compat_ulong_t mem_start;
-	compat_ulong_t mem_end;
-	unsigned short base_addr;
-	unsigned char irq;
-	unsigned char dma;
-	unsigned char port;
-};
-
-struct ifreq32 {
-#define IFHWADDRLEN     6
-#define IFNAMSIZ        16
-        union {
-                char    ifrn_name[IFNAMSIZ];            /* if name, e.g. "en0" */
-        } ifr_ifrn;
-        union {
-                struct  sockaddr ifru_addr;
-                struct  sockaddr ifru_dstaddr;
-                struct  sockaddr ifru_broadaddr;
-                struct  sockaddr ifru_netmask;
-                struct  sockaddr ifru_hwaddr;
-                short   ifru_flags;
-                compat_int_t     ifru_ivalue;
-                compat_int_t     ifru_mtu;
-                struct  ifmap32 ifru_map;
-                char    ifru_slave[IFNAMSIZ];   /* Just fits the size */
-		char	ifru_newname[IFNAMSIZ];
-                compat_caddr_t ifru_data;
-	    /* XXXX? ifru_settings should be here */
-        } ifr_ifru;
-};
-
-struct ifconf32 {
-        compat_int_t	ifc_len;                        /* size of buffer       */
-        compat_caddr_t  ifcbuf;
-};
-
-static int dev_ifname32(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct ifreq __user *uifr;
-	int err;
-
-	uifr = compat_alloc_user_space(sizeof(struct ifreq));
-	if (copy_in_user(uifr, compat_ptr(arg), sizeof(struct ifreq32)))
-		return -EFAULT;
-
-	err = sys_ioctl(fd, SIOCGIFNAME, (unsigned long)uifr);
-	if (err)
-		return err;
-
-	if (copy_in_user(compat_ptr(arg), uifr, sizeof(struct ifreq32)))
-		return -EFAULT;
-
-	return 0;
-}
-
-static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct ifconf32 ifc32;
-	struct ifconf ifc;
-	struct ifconf __user *uifc;
-	struct ifreq32 __user *ifr32;
-	struct ifreq __user *ifr;
-	unsigned int i, j;
-	int err;
-
-	if (copy_from_user(&ifc32, compat_ptr(arg), sizeof(struct ifconf32)))
-		return -EFAULT;
-
-	if (ifc32.ifcbuf == 0) {
-		ifc32.ifc_len = 0;
-		ifc.ifc_len = 0;
-		ifc.ifc_req = NULL;
-		uifc = compat_alloc_user_space(sizeof(struct ifconf));
-	} else {
-		size_t len =((ifc32.ifc_len / sizeof (struct ifreq32)) + 1) *
-			sizeof (struct ifreq);
-		uifc = compat_alloc_user_space(sizeof(struct ifconf) + len);
-		ifc.ifc_len = len;
-		ifr = ifc.ifc_req = (void __user *)(uifc + 1);
-		ifr32 = compat_ptr(ifc32.ifcbuf);
-		for (i = 0; i < ifc32.ifc_len; i += sizeof (struct ifreq32)) {
-			if (copy_in_user(ifr, ifr32, sizeof(struct ifreq32)))
-				return -EFAULT;
-			ifr++;
-			ifr32++; 
-		}
-	}
-	if (copy_to_user(uifc, &ifc, sizeof(struct ifconf)))
-		return -EFAULT;
-
-	err = sys_ioctl (fd, SIOCGIFCONF, (unsigned long)uifc);	
-	if (err)
-		return err;
-
-	if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) 
-		return -EFAULT;
-
-	ifr = ifc.ifc_req;
-	ifr32 = compat_ptr(ifc32.ifcbuf);
-	for (i = 0, j = 0;
-             i + sizeof (struct ifreq32) <= ifc32.ifc_len && j < ifc.ifc_len;
-	     i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) {
-		if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32)))
-			return -EFAULT;
-		ifr32++;
-		ifr++;
-	}
-
-	if (ifc32.ifcbuf == 0) {
-		/* Translate from 64-bit structure multiple to
-		 * a 32-bit one.
-		 */
-		i = ifc.ifc_len;
-		i = ((i / sizeof(struct ifreq)) * sizeof(struct ifreq32));
-		ifc32.ifc_len = i;
-	} else {
-		ifc32.ifc_len = i;
-	}
-	if (copy_to_user(compat_ptr(arg), &ifc32, sizeof(struct ifconf32)))
-		return -EFAULT;
-
-	return 0;
-}
-
-static int ethtool_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct ifreq __user *ifr;
-	struct ifreq32 __user *ifr32;
-	u32 data;
-	void __user *datap;
-	
-	ifr = compat_alloc_user_space(sizeof(*ifr));
-	ifr32 = compat_ptr(arg);
-
-	if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
-		return -EFAULT;
-
-	if (get_user(data, &ifr32->ifr_ifru.ifru_data))
-		return -EFAULT;
-
-	datap = compat_ptr(data);
-	if (put_user(datap, &ifr->ifr_ifru.ifru_data))
-		return -EFAULT;
-
-	return sys_ioctl(fd, cmd, (unsigned long) ifr);
-}
-
-static int bond_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct ifreq kifr;
-	struct ifreq __user *uifr;
-	struct ifreq32 __user *ifr32 = compat_ptr(arg);
-	mm_segment_t old_fs;
-	int err;
-	u32 data;
-	void __user *datap;
-
-	switch (cmd) {
-	case SIOCBONDENSLAVE:
-	case SIOCBONDRELEASE:
-	case SIOCBONDSETHWADDR:
-	case SIOCBONDCHANGEACTIVE:
-		if (copy_from_user(&kifr, ifr32, sizeof(struct ifreq32)))
-			return -EFAULT;
-
-		old_fs = get_fs();
-		set_fs (KERNEL_DS);
-		err = sys_ioctl (fd, cmd, (unsigned long)&kifr);
-		set_fs (old_fs);
-
-		return err;
-	case SIOCBONDSLAVEINFOQUERY:
-	case SIOCBONDINFOQUERY:
-		uifr = compat_alloc_user_space(sizeof(*uifr));
-		if (copy_in_user(&uifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
-			return -EFAULT;
-
-		if (get_user(data, &ifr32->ifr_ifru.ifru_data))
-			return -EFAULT;
-
-		datap = compat_ptr(data);
-		if (put_user(datap, &uifr->ifr_ifru.ifru_data))
-			return -EFAULT;
-
-		return sys_ioctl (fd, cmd, (unsigned long)uifr);
-	default:
-		return -EINVAL;
-	};
-}
-
-static int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct ifreq __user *u_ifreq64;
-	struct ifreq32 __user *u_ifreq32 = compat_ptr(arg);
-	char tmp_buf[IFNAMSIZ];
-	void __user *data64;
-	u32 data32;
-
-	if (copy_from_user(&tmp_buf[0], &(u_ifreq32->ifr_ifrn.ifrn_name[0]),
-			   IFNAMSIZ))
-		return -EFAULT;
-	if (__get_user(data32, &u_ifreq32->ifr_ifru.ifru_data))
-		return -EFAULT;
-	data64 = compat_ptr(data32);
-
-	u_ifreq64 = compat_alloc_user_space(sizeof(*u_ifreq64));
-
-	/* Don't check these user accesses, just let that get trapped
-	 * in the ioctl handler instead.
-	 */
-	if (copy_to_user(&u_ifreq64->ifr_ifrn.ifrn_name[0], &tmp_buf[0],
-			 IFNAMSIZ))
-		return -EFAULT;
-	if (__put_user(data64, &u_ifreq64->ifr_ifru.ifru_data))
-		return -EFAULT;
-
-	return sys_ioctl(fd, cmd, (unsigned long) u_ifreq64);
-}
-
-static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct ifreq ifr;
-	struct ifreq32 __user *uifr32;
-	struct ifmap32 __user *uifmap32;
-	mm_segment_t old_fs;
-	int err;
-	
-	uifr32 = compat_ptr(arg);
-	uifmap32 = &uifr32->ifr_ifru.ifru_map;
-	switch (cmd) {
-	case SIOCSIFMAP:
-		err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
-		err |= __get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
-		err |= __get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
-		err |= __get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
-		err |= __get_user(ifr.ifr_map.irq, &uifmap32->irq);
-		err |= __get_user(ifr.ifr_map.dma, &uifmap32->dma);
-		err |= __get_user(ifr.ifr_map.port, &uifmap32->port);
-		if (err)
-			return -EFAULT;
-		break;
-	case SIOCSHWTSTAMP:
-		if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
-			return -EFAULT;
-		ifr.ifr_data = compat_ptr(uifr32->ifr_ifru.ifru_data);
-		break;
-	default:
-		if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
-			return -EFAULT;
-		break;
-	}
-	old_fs = get_fs();
-	set_fs (KERNEL_DS);
-	err = sys_ioctl (fd, cmd, (unsigned long)&ifr);
-	set_fs (old_fs);
-	if (!err) {
-		switch (cmd) {
-		/* TUNSETIFF is defined as _IOW, it should be _IORW
-		 * as the data is copied back to user space, but that
-		 * cannot be fixed without breaking all existing apps.
-		 */
-		case TUNSETIFF:
-		case TUNGETIFF:
-		case SIOCGIFFLAGS:
-		case SIOCGIFMETRIC:
-		case SIOCGIFMTU:
-		case SIOCGIFMEM:
-		case SIOCGIFHWADDR:
-		case SIOCGIFINDEX:
-		case SIOCGIFADDR:
-		case SIOCGIFBRDADDR:
-		case SIOCGIFDSTADDR:
-		case SIOCGIFNETMASK:
-		case SIOCGIFTXQLEN:
-			if (copy_to_user(uifr32, &ifr, sizeof(*uifr32)))
-				return -EFAULT;
-			break;
-		case SIOCGIFMAP:
-			err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
-			err |= __put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
-			err |= __put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
-			err |= __put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
-			err |= __put_user(ifr.ifr_map.irq, &uifmap32->irq);
-			err |= __put_user(ifr.ifr_map.dma, &uifmap32->dma);
-			err |= __put_user(ifr.ifr_map.port, &uifmap32->port);
-			if (err)
-				err = -EFAULT;
-			break;
-		}
-	}
-	return err;
-}
-
-struct rtentry32 {
-        u32   		rt_pad1;
-        struct sockaddr rt_dst;         /* target address               */
-        struct sockaddr rt_gateway;     /* gateway addr (RTF_GATEWAY)   */
-        struct sockaddr rt_genmask;     /* target network mask (IP)     */
-        unsigned short  rt_flags;
-        short           rt_pad2;
-        u32   		rt_pad3;
-        unsigned char   rt_tos;
-        unsigned char   rt_class;
-        short           rt_pad4;
-        short           rt_metric;      /* +1 for binary compatibility! */
-        /* char * */ u32 rt_dev;        /* forcing the device at add    */
-        u32   		rt_mtu;         /* per route MTU/Window         */
-        u32   		rt_window;      /* Window clamping              */
-        unsigned short  rt_irtt;        /* Initial RTT                  */
-
-};
-
-struct in6_rtmsg32 {
-	struct in6_addr		rtmsg_dst;
-	struct in6_addr		rtmsg_src;
-	struct in6_addr		rtmsg_gateway;
-	u32			rtmsg_type;
-	u16			rtmsg_dst_len;
-	u16			rtmsg_src_len;
-	u32			rtmsg_metric;
-	u32			rtmsg_info;
-	u32			rtmsg_flags;
-	s32			rtmsg_ifindex;
-};
-
-static int routing_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	int ret;
-	void *r = NULL;
-	struct in6_rtmsg r6;
-	struct rtentry r4;
-	char devname[16];
-	u32 rtdev;
-	mm_segment_t old_fs = get_fs();
-	
-	struct socket *mysock = sockfd_lookup(fd, &ret);
-
-	if (mysock && mysock->sk && mysock->sk->sk_family == AF_INET6) { /* ipv6 */
-		struct in6_rtmsg32 __user *ur6 = compat_ptr(arg);
-		ret = copy_from_user (&r6.rtmsg_dst, &(ur6->rtmsg_dst),
-			3 * sizeof(struct in6_addr));
-		ret |= __get_user (r6.rtmsg_type, &(ur6->rtmsg_type));
-		ret |= __get_user (r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len));
-		ret |= __get_user (r6.rtmsg_src_len, &(ur6->rtmsg_src_len));
-		ret |= __get_user (r6.rtmsg_metric, &(ur6->rtmsg_metric));
-		ret |= __get_user (r6.rtmsg_info, &(ur6->rtmsg_info));
-		ret |= __get_user (r6.rtmsg_flags, &(ur6->rtmsg_flags));
-		ret |= __get_user (r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex));
-		
-		r = (void *) &r6;
-	} else { /* ipv4 */
-		struct rtentry32 __user *ur4 = compat_ptr(arg);
-		ret = copy_from_user (&r4.rt_dst, &(ur4->rt_dst),
-					3 * sizeof(struct sockaddr));
-		ret |= __get_user (r4.rt_flags, &(ur4->rt_flags));
-		ret |= __get_user (r4.rt_metric, &(ur4->rt_metric));
-		ret |= __get_user (r4.rt_mtu, &(ur4->rt_mtu));
-		ret |= __get_user (r4.rt_window, &(ur4->rt_window));
-		ret |= __get_user (r4.rt_irtt, &(ur4->rt_irtt));
-		ret |= __get_user (rtdev, &(ur4->rt_dev));
-		if (rtdev) {
-			ret |= copy_from_user (devname, compat_ptr(rtdev), 15);
-			r4.rt_dev = devname; devname[15] = 0;
-		} else
-			r4.rt_dev = NULL;
-
-		r = (void *) &r4;
-	}
-
-	if (ret) {
-		ret = -EFAULT;
-		goto out;
-	}
-
-	set_fs (KERNEL_DS);
-	ret = sys_ioctl (fd, cmd, (unsigned long) r);
-	set_fs (old_fs);
-
-out:
-	if (mysock)
-		sockfd_put(mysock);
-
-	return ret;
-}
-#endif
-
 #ifdef CONFIG_BLOCK
 typedef struct sg_io_hdr32 {
 	compat_int_t interface_id;	/* [i] 'S' for SCSI generic (required) */
@@ -721,16 +293,15 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov
 	return 0;
 }
 
-static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
+			sg_io_hdr32_t __user *sgio32)
 {
 	sg_io_hdr_t __user *sgio;
-	sg_io_hdr32_t __user *sgio32;
 	u16 iovec_count;
 	u32 data;
 	void __user *dxferp;
 	int err;
 
-	sgio32 = compat_ptr(arg);
 	if (get_user(iovec_count, &sgio32->iovec_count))
 		return -EFAULT;
 
@@ -820,11 +391,11 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
 	int unused;
 };
 
-static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct
+			compat_sg_req_info __user *o)
 {
 	int err, i;
 	sg_req_info_t __user *r;
-	struct compat_sg_req_info __user *o = (void __user *)arg;
 	r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
 	err = sys_ioctl(fd,cmd,(unsigned long)r);
 	if (err < 0)
@@ -852,9 +423,9 @@ struct sock_fprog32 {
 #define PPPIOCSPASS32	_IOW('t', 71, struct sock_fprog32)
 #define PPPIOCSACTIVE32	_IOW('t', 70, struct sock_fprog32)
 
-static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
+			struct sock_fprog32 __user *u_fprog32)
 {
-	struct sock_fprog32 __user *u_fprog32 = compat_ptr(arg);
 	struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog));
 	void __user *fptr64;
 	u32 fptr32;
@@ -891,15 +462,14 @@ struct ppp_idle32 {
 };
 #define PPPIOCGIDLE32		_IOR('t', 63, struct ppp_idle32)
 
-static int ppp_gidle(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int ppp_gidle(unsigned int fd, unsigned int cmd,
+		struct ppp_idle32 __user *idle32)
 {
 	struct ppp_idle __user *idle;
-	struct ppp_idle32 __user *idle32;
 	__kernel_time_t xmit, recv;
 	int err;
 
 	idle = compat_alloc_user_space(sizeof(*idle));
-	idle32 = compat_ptr(arg);
 
 	err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle);
 
@@ -913,15 +483,14 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-static int ppp_scompress(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int ppp_scompress(unsigned int fd, unsigned int cmd,
+	struct ppp_option_data32 __user *odata32)
 {
 	struct ppp_option_data __user *odata;
-	struct ppp_option_data32 __user *odata32;
 	__u32 data;
 	void __user *datap;
 
 	odata = compat_alloc_user_space(sizeof(*odata));
-	odata32 = compat_ptr(arg);
 
 	if (get_user(data, &odata32->ptr))
 		return -EFAULT;
@@ -937,35 +506,6 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata);
 }
 
-static int ppp_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	int err;
-
-	switch (cmd) {
-	case PPPIOCGIDLE32:
-		err = ppp_gidle(fd, cmd, arg);
-		break;
-
-	case PPPIOCSCOMPRESS32:
-		err = ppp_scompress(fd, cmd, arg);
-		break;
-
-	default:
-		do {
-			static int count;
-			if (++count <= 20)
-				printk("ppp_ioctl: Unknown cmd fd(%d) "
-				       "cmd(%08x) arg(%08x)\n",
-				       (int)fd, (unsigned int)cmd, (unsigned int)arg);
-		} while(0);
-		err = -EINVAL;
-		break;
-	};
-
-	return err;
-}
-
-
 #ifdef CONFIG_BLOCK
 struct mtget32 {
 	compat_long_t	mt_type;
@@ -983,7 +523,7 @@ struct mtpos32 {
 };
 #define MTIOCPOS32	_IOR('m', 3, struct mtpos32)
 
-static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
 {
 	mm_segment_t old_fs = get_fs();
 	struct mtget get;
@@ -1003,15 +543,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 		kcmd = MTIOCGET;
 		karg = &get;
 		break;
-	default:
-		do {
-			static int count;
-			if (++count <= 20)
-				printk("mt_ioctl: Unknown cmd fd(%d) "
-				       "cmd(%08x) arg(%08x)\n",
-				       (int)fd, (unsigned int)cmd, (unsigned int)arg);
-		} while(0);
-		return -EINVAL;
 	}
 	set_fs (KERNEL_DS);
 	err = sys_ioctl (fd, kcmd, (unsigned long)karg);
@@ -1020,11 +551,11 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 		return err;
 	switch (cmd) {
 	case MTIOCPOS32:
-		upos32 = compat_ptr(arg);
+		upos32 = argp;
 		err = __put_user(pos.mt_blkno, &upos32->mt_blkno);
 		break;
 	case MTIOCGET32:
-		umget32 = compat_ptr(arg);
+		umget32 = argp;
 		err = __put_user(get.mt_type, &umget32->mt_type);
 		err |= __put_user(get.mt_resid, &umget32->mt_resid);
 		err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg);
@@ -1039,162 +570,8 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 
 #endif /* CONFIG_BLOCK */
 
-#ifdef CONFIG_VT
-
-static int vt_check(struct file *file)
-{
-	struct tty_struct *tty;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct vc_data *vc;
-	
-	if (file->f_op->unlocked_ioctl != tty_ioctl)
-		return -EINVAL;
-	                
-	tty = (struct tty_struct *)file->private_data;
-	if (tty_paranoia_check(tty, inode, "tty_ioctl"))
-		return -EINVAL;
-	                                                
-	if (tty->ops->ioctl != vt_ioctl)
-		return -EINVAL;
-
-	vc = (struct vc_data *)tty->driver_data;
-	if (!vc_cons_allocated(vc->vc_num)) 	/* impossible? */
-		return -ENOIOCTLCMD;
-
-	/*
-	 * To have permissions to do most of the vt ioctls, we either have
-	 * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG.
-	 */
-	if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG))
-		return 1;
-	return 0;                                                    
-}
-
-struct consolefontdesc32 {
-	unsigned short charcount;       /* characters in font (256 or 512) */
-	unsigned short charheight;      /* scan lines per character (1-32) */
-	compat_caddr_t chardata;	/* font data in expanded form */
-};
-
-static int do_fontx_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file)
-{
-	struct consolefontdesc32 __user *user_cfd = compat_ptr(arg);
-	struct console_font_op op;
-	compat_caddr_t data;
-	int i, perm;
-
-	perm = vt_check(file);
-	if (perm < 0) return perm;
-	
-	switch (cmd) {
-	case PIO_FONTX:
-		if (!perm)
-			return -EPERM;
-		op.op = KD_FONT_OP_SET;
-		op.flags = 0;
-		op.width = 8;
-		if (get_user(op.height, &user_cfd->charheight) ||
-		    get_user(op.charcount, &user_cfd->charcount) ||
-		    get_user(data, &user_cfd->chardata))
-			return -EFAULT;
-		op.data = compat_ptr(data);
-		return con_font_op(vc_cons[fg_console].d, &op);
-	case GIO_FONTX:
-		op.op = KD_FONT_OP_GET;
-		op.flags = 0;
-		op.width = 8;
-		if (get_user(op.height, &user_cfd->charheight) ||
-		    get_user(op.charcount, &user_cfd->charcount) ||
-		    get_user(data, &user_cfd->chardata))
-			return -EFAULT;
-		if (!data)
-			return 0;
-		op.data = compat_ptr(data);
-		i = con_font_op(vc_cons[fg_console].d, &op);
-		if (i)
-			return i;
-		if (put_user(op.height, &user_cfd->charheight) ||
-		    put_user(op.charcount, &user_cfd->charcount) ||
-		    put_user((compat_caddr_t)(unsigned long)op.data,
-				&user_cfd->chardata))
-			return -EFAULT;
-		return 0;
-	}
-	return -EINVAL;
-}
-
-struct console_font_op32 {
-	compat_uint_t op;        /* operation code KD_FONT_OP_* */
-	compat_uint_t flags;     /* KD_FONT_FLAG_* */
-	compat_uint_t width, height;     /* font size */
-	compat_uint_t charcount;
-	compat_caddr_t data;    /* font data with height fixed to 32 */
-};
-                                        
-static int do_kdfontop_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file)
-{
-	struct console_font_op op;
-	struct console_font_op32 __user *fontop = compat_ptr(arg);
-	int perm = vt_check(file), i;
-	struct vc_data *vc;
-	
-	if (perm < 0) return perm;
-	
-	if (copy_from_user(&op, fontop, sizeof(struct console_font_op32)))
-		return -EFAULT;
-	if (!perm && op.op != KD_FONT_OP_GET)
-		return -EPERM;
-	op.data = compat_ptr(((struct console_font_op32 *)&op)->data);
-	op.flags |= KD_FONT_FLAG_OLD;
-	vc = ((struct tty_struct *)file->private_data)->driver_data;
-	i = con_font_op(vc, &op);
-	if (i)
-		return i;
-	((struct console_font_op32 *)&op)->data = (unsigned long)op.data;
-	if (copy_to_user(fontop, &op, sizeof(struct console_font_op32)))
-		return -EFAULT;
-	return 0;
-}
-
-struct unimapdesc32 {
-	unsigned short entry_ct;
-	compat_caddr_t entries;
-};
-
-static int do_unimap_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file)
-{
-	struct unimapdesc32 tmp;
-	struct unimapdesc32 __user *user_ud = compat_ptr(arg);
-	int perm = vt_check(file);
-	struct vc_data *vc;
-
-	if (perm < 0)
-		return perm;
-	if (copy_from_user(&tmp, user_ud, sizeof tmp))
-		return -EFAULT;
-	if (tmp.entries)
-		if (!access_ok(VERIFY_WRITE, compat_ptr(tmp.entries),
-				tmp.entry_ct*sizeof(struct unipair)))
-			return -EFAULT;
-	vc = ((struct tty_struct *)file->private_data)->driver_data;
-	switch (cmd) {
-	case PIO_UNIMAP:
-		if (!perm)
-			return -EPERM;
-		return con_set_unimap(vc, tmp.entry_ct,
-						compat_ptr(tmp.entries));
-	case GIO_UNIMAP:
-		if (!perm && fg_console != vc->vc_num)
-			return -EPERM;
-		return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct),
-						compat_ptr(tmp.entries));
-	}
-	return 0;
-}
-
-#endif /* CONFIG_VT */
-
-static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
+			compat_uid_t __user *argp)
 {
 	mm_segment_t old_fs = get_fs();
 	__kernel_uid_t kuid;
@@ -1207,184 +584,15 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long a
 	set_fs(old_fs);
 
 	if (err >= 0)
-		err = put_user(kuid, (compat_uid_t __user *)compat_ptr(arg));
+		err = put_user(kuid, argp);
 
 	return err;
 }
 
-struct atmif_sioc32 {
-        compat_int_t	number;
-        compat_int_t	length;
-        compat_caddr_t	arg;
-};
-
-struct atm_iobuf32 {
-	compat_int_t	length;
-	compat_caddr_t	buffer;
-};
-
-#define ATM_GETLINKRATE32 _IOW('a', ATMIOC_ITF+1, struct atmif_sioc32)
-#define ATM_GETNAMES32    _IOW('a', ATMIOC_ITF+3, struct atm_iobuf32)
-#define ATM_GETTYPE32     _IOW('a', ATMIOC_ITF+4, struct atmif_sioc32)
-#define ATM_GETESI32	  _IOW('a', ATMIOC_ITF+5, struct atmif_sioc32)
-#define ATM_GETADDR32	  _IOW('a', ATMIOC_ITF+6, struct atmif_sioc32)
-#define ATM_RSTADDR32	  _IOW('a', ATMIOC_ITF+7, struct atmif_sioc32)
-#define ATM_ADDADDR32	  _IOW('a', ATMIOC_ITF+8, struct atmif_sioc32)
-#define ATM_DELADDR32	  _IOW('a', ATMIOC_ITF+9, struct atmif_sioc32)
-#define ATM_GETCIRANGE32  _IOW('a', ATMIOC_ITF+10, struct atmif_sioc32)
-#define ATM_SETCIRANGE32  _IOW('a', ATMIOC_ITF+11, struct atmif_sioc32)
-#define ATM_SETESI32      _IOW('a', ATMIOC_ITF+12, struct atmif_sioc32)
-#define ATM_SETESIF32     _IOW('a', ATMIOC_ITF+13, struct atmif_sioc32)
-#define ATM_GETSTAT32     _IOW('a', ATMIOC_SARCOM+0, struct atmif_sioc32)
-#define ATM_GETSTATZ32    _IOW('a', ATMIOC_SARCOM+1, struct atmif_sioc32)
-#define ATM_GETLOOP32	  _IOW('a', ATMIOC_SARCOM+2, struct atmif_sioc32)
-#define ATM_SETLOOP32	  _IOW('a', ATMIOC_SARCOM+3, struct atmif_sioc32)
-#define ATM_QUERYLOOP32	  _IOW('a', ATMIOC_SARCOM+4, struct atmif_sioc32)
-
-static struct {
-        unsigned int cmd32;
-        unsigned int cmd;
-} atm_ioctl_map[] = {
-        { ATM_GETLINKRATE32, ATM_GETLINKRATE },
-	{ ATM_GETNAMES32,    ATM_GETNAMES },
-        { ATM_GETTYPE32,     ATM_GETTYPE },
-        { ATM_GETESI32,      ATM_GETESI },
-        { ATM_GETADDR32,     ATM_GETADDR },
-        { ATM_RSTADDR32,     ATM_RSTADDR },
-        { ATM_ADDADDR32,     ATM_ADDADDR },
-        { ATM_DELADDR32,     ATM_DELADDR },
-        { ATM_GETCIRANGE32,  ATM_GETCIRANGE },
-	{ ATM_SETCIRANGE32,  ATM_SETCIRANGE },
-	{ ATM_SETESI32,      ATM_SETESI },
-	{ ATM_SETESIF32,     ATM_SETESIF },
-	{ ATM_GETSTAT32,     ATM_GETSTAT },
-	{ ATM_GETSTATZ32,    ATM_GETSTATZ },
-	{ ATM_GETLOOP32,     ATM_GETLOOP },
-	{ ATM_SETLOOP32,     ATM_SETLOOP },
-	{ ATM_QUERYLOOP32,   ATM_QUERYLOOP }
-};
-
-#define NR_ATM_IOCTL ARRAY_SIZE(atm_ioctl_map)
-
-static int do_atm_iobuf(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int ioc_settimeout(unsigned int fd, unsigned int cmd,
+		compat_ulong_t __user *argp)
 {
-	struct atm_iobuf   __user *iobuf;
-	struct atm_iobuf32 __user *iobuf32;
-	u32 data;
-	void __user *datap;
-	int len, err;
-
-	iobuf = compat_alloc_user_space(sizeof(*iobuf));
-	iobuf32 = compat_ptr(arg);
-
-	if (get_user(len, &iobuf32->length) ||
-	    get_user(data, &iobuf32->buffer))
-		return -EFAULT;
-	datap = compat_ptr(data);
-	if (put_user(len, &iobuf->length) ||
-	    put_user(datap, &iobuf->buffer))
-		return -EFAULT;
-
-	err = sys_ioctl(fd, cmd, (unsigned long)iobuf);
-
-	if (!err) {
-		if (copy_in_user(&iobuf32->length, &iobuf->length,
-				 sizeof(int)))
-			err = -EFAULT;
-	}
-
-	return err;
-}
-
-static int do_atmif_sioc(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct atmif_sioc   __user *sioc;
-	struct atmif_sioc32 __user *sioc32;
-	u32 data;
-	void __user *datap;
-	int err;
-        
-	sioc = compat_alloc_user_space(sizeof(*sioc));
-	sioc32 = compat_ptr(arg);
-
-	if (copy_in_user(&sioc->number, &sioc32->number, 2 * sizeof(int)) ||
-	    get_user(data, &sioc32->arg))
-		return -EFAULT;
-	datap = compat_ptr(data);
-	if (put_user(datap, &sioc->arg))
-		return -EFAULT;
-
-	err = sys_ioctl(fd, cmd, (unsigned long) sioc);
-
-	if (!err) {
-		if (copy_in_user(&sioc32->length, &sioc->length,
-				 sizeof(int)))
-			err = -EFAULT;
-	}
-	return err;
-}
-
-static int do_atm_ioctl(unsigned int fd, unsigned int cmd32, unsigned long arg)
-{
-        int i;
-        unsigned int cmd = 0;
-        
-	switch (cmd32) {
-	case SONET_GETSTAT:
-	case SONET_GETSTATZ:
-	case SONET_GETDIAG:
-	case SONET_SETDIAG:
-	case SONET_CLRDIAG:
-	case SONET_SETFRAMING:
-	case SONET_GETFRAMING:
-	case SONET_GETFRSENSE:
-		return do_atmif_sioc(fd, cmd32, arg);
-	}
-
-	for (i = 0; i < NR_ATM_IOCTL; i++) {
-		if (cmd32 == atm_ioctl_map[i].cmd32) {
-			cmd = atm_ioctl_map[i].cmd;
-			break;
-		}
-	}
-	if (i == NR_ATM_IOCTL)
-	        return -EINVAL;
-        
-        switch (cmd) {
-	case ATM_GETNAMES:
-		return do_atm_iobuf(fd, cmd, arg);
-	    
-	case ATM_GETLINKRATE:
-        case ATM_GETTYPE:
-        case ATM_GETESI:
-        case ATM_GETADDR:
-        case ATM_RSTADDR:
-        case ATM_ADDADDR:
-        case ATM_DELADDR:
-        case ATM_GETCIRANGE:
-	case ATM_SETCIRANGE:
-	case ATM_SETESI:
-	case ATM_SETESIF:
-	case ATM_GETSTAT:
-	case ATM_GETSTATZ:
-	case ATM_GETLOOP:
-	case ATM_SETLOOP:
-	case ATM_QUERYLOOP:
-                return do_atmif_sioc(fd, cmd, arg);
-        }
-
-        return -EINVAL;
-}
-
-static __used int
-ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	return -EINVAL;
-}
-
-static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg);
+	return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, argp);
 }
 
 /* Bluetooth ioctls */
@@ -1442,7 +650,8 @@ static int set_raw32_request(struct raw_config_request *req, struct raw32_config
         return ret ? -EFAULT : 0;
 }
 
-static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
+static int raw_ioctl(unsigned fd, unsigned cmd,
+		struct raw32_config_request __user *user_req)
 {
         int ret;
 
@@ -1450,7 +659,6 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
         case RAW_SETBIND:
         case RAW_GETBIND: {
                 struct raw_config_request req;
-                struct raw32_config_request __user *user_req = compat_ptr(arg);
                 mm_segment_t oldfs = get_fs();
 
                 if ((ret = get_raw32_request(&req, user_req)))
@@ -1465,9 +673,6 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
                 }
                 break;
         }
-        default:
-                ret = sys_ioctl(fd, cmd, arg);
-                break;
         }
         return ret;
 }
@@ -1495,11 +700,11 @@ struct serial_struct32 {
         compat_int_t    reserved[1];
 };
 
-static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
+static int serial_struct_ioctl(unsigned fd, unsigned cmd,
+			struct serial_struct32 __user *ss32)
 {
         typedef struct serial_struct SS;
         typedef struct serial_struct32 SS32;
-        struct serial_struct32 __user *ss32 = compat_ptr(arg);
         int err;
         struct serial_struct ss;
         mm_segment_t oldseg = get_fs();
@@ -1537,96 +742,6 @@ static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
         return err;
 }
 
-struct usbdevfs_ctrltransfer32 {
-        u8 bRequestType;
-        u8 bRequest;
-        u16 wValue;
-        u16 wIndex;
-        u16 wLength;
-        u32 timeout;  /* in milliseconds */
-        compat_caddr_t data;
-};
-
-#define USBDEVFS_CONTROL32           _IOWR('U', 0, struct usbdevfs_ctrltransfer32)
-
-static int do_usbdevfs_control(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct usbdevfs_ctrltransfer32 __user *p32 = compat_ptr(arg);
-        struct usbdevfs_ctrltransfer __user *p;
-        __u32 udata;
-        p = compat_alloc_user_space(sizeof(*p));
-        if (copy_in_user(p, p32, (sizeof(*p32) - sizeof(compat_caddr_t))) ||
-            get_user(udata, &p32->data) ||
-	    put_user(compat_ptr(udata), &p->data))
-		return -EFAULT;
-        return sys_ioctl(fd, USBDEVFS_CONTROL, (unsigned long)p);
-}
-
-
-struct usbdevfs_bulktransfer32 {
-        compat_uint_t ep;
-        compat_uint_t len;
-        compat_uint_t timeout; /* in milliseconds */
-        compat_caddr_t data;
-};
-
-#define USBDEVFS_BULK32              _IOWR('U', 2, struct usbdevfs_bulktransfer32)
-
-static int do_usbdevfs_bulk(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct usbdevfs_bulktransfer32 __user *p32 = compat_ptr(arg);
-        struct usbdevfs_bulktransfer __user *p;
-        compat_uint_t n;
-        compat_caddr_t addr;
-
-        p = compat_alloc_user_space(sizeof(*p));
-
-        if (get_user(n, &p32->ep) || put_user(n, &p->ep) ||
-            get_user(n, &p32->len) || put_user(n, &p->len) ||
-            get_user(n, &p32->timeout) || put_user(n, &p->timeout) ||
-            get_user(addr, &p32->data) || put_user(compat_ptr(addr), &p->data))
-                return -EFAULT;
-
-        return sys_ioctl(fd, USBDEVFS_BULK, (unsigned long)p);
-}
-
-
-/*
- *  USBDEVFS_SUBMITURB, USBDEVFS_REAPURB and USBDEVFS_REAPURBNDELAY
- *  are handled in usbdevfs core.			-Christopher Li
- */
-
-struct usbdevfs_disconnectsignal32 {
-        compat_int_t signr;
-        compat_caddr_t context;
-};
-
-#define USBDEVFS_DISCSIGNAL32      _IOR('U', 14, struct usbdevfs_disconnectsignal32)
-
-static int do_usbdevfs_discsignal(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct usbdevfs_disconnectsignal kdis;
-        struct usbdevfs_disconnectsignal32 __user *udis;
-        mm_segment_t old_fs;
-        u32 uctx;
-        int err;
-
-        udis = compat_ptr(arg);
-
-        if (get_user(kdis.signr, &udis->signr) ||
-            __get_user(uctx, &udis->context))
-                return -EFAULT;
-
-        kdis.context = compat_ptr(uctx);
-
-        old_fs = get_fs();
-        set_fs(KERNEL_DS);
-        err = sys_ioctl(fd, USBDEVFS_DISCSIGNAL, (unsigned long) &kdis);
-        set_fs(old_fs);
-
-        return err;
-}
-
 /*
  * I2C layer ioctls
  */
@@ -1655,9 +770,9 @@ struct i2c_rdwr_aligned {
 	struct i2c_msg msgs[0];
 };
 
-static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
+			struct i2c_rdwr_ioctl_data32    __user *udata)
 {
-	struct i2c_rdwr_ioctl_data32	__user *udata = compat_ptr(arg);
 	struct i2c_rdwr_aligned		__user *tdata;
 	struct i2c_msg			__user *tmsgs;
 	struct i2c_msg32		__user *umsgs;
@@ -1691,10 +806,10 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, unsigned long ar
 	return sys_ioctl(fd, cmd, (unsigned long)tdata);
 }
 
-static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
+			struct i2c_smbus_ioctl_data32   __user *udata)
 {
 	struct i2c_smbus_ioctl_data	__user *tdata;
-	struct i2c_smbus_ioctl_data32	__user *udata;
 	compat_caddr_t			datap;
 
 	tdata = compat_alloc_user_space(sizeof(*tdata));
@@ -1703,7 +818,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
 	if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata)))
 		return -EFAULT;
 
-	udata = compat_ptr(arg);
 	if (!access_ok(VERIFY_READ, udata, sizeof(*udata)))
 		return -EFAULT;
 
@@ -1718,27 +832,12 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
 	return sys_ioctl(fd, cmd, (unsigned long)tdata);
 }
 
-/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
- * for some operations; this forces use of the newer bridge-utils that
- * use compatible ioctls
- */
-static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	u32 tmp;
-
-	if (get_user(tmp, (u32 __user *) arg))
-		return -EFAULT;
-	if (tmp == BRCTL_GET_VERSION)
-		return BRCTL_VERSION + 1;
-	return -EINVAL;
-}
-
 #define RTC_IRQP_READ32		_IOR('p', 0x0b, compat_ulong_t)
 #define RTC_IRQP_SET32		_IOW('p', 0x0c, compat_ulong_t)
 #define RTC_EPOCH_READ32	_IOR('p', 0x0d, compat_ulong_t)
 #define RTC_EPOCH_SET32		_IOW('p', 0x0e, compat_ulong_t)
 
-static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
+static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp)
 {
 	mm_segment_t oldfs = get_fs();
 	compat_ulong_t val32;
@@ -1756,29 +855,14 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
 		if (ret)
 			return ret;
 		val32 = kval;
-		return put_user(val32, (unsigned int __user *)arg);
+		return put_user(val32, (unsigned int __user *)argp);
 	case RTC_IRQP_SET32:
-		return sys_ioctl(fd, RTC_IRQP_SET, arg); 
+		return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp);
 	case RTC_EPOCH_SET32:
-		return sys_ioctl(fd, RTC_EPOCH_SET, arg);
-	default:
-		/* unreached */
-		return -ENOIOCTLCMD;
+		return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp);
 	}
-}
 
-static int
-lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct compat_timeval __user *tc = (struct compat_timeval __user *)arg;
-	struct timeval __user *tn = compat_alloc_user_space(sizeof(struct timeval));
-	struct timeval ts;
-	if (get_user(ts.tv_sec, &tc->tv_sec) ||
-	    get_user(ts.tv_usec, &tc->tv_usec) ||
-	    put_user(ts.tv_sec, &tn->tv_sec) ||
-	    put_user(ts.tv_usec, &tn->tv_usec))
-		return -EFAULT;
-	return sys_ioctl(fd, cmd, (unsigned long)tn);
+	return -ENOIOCTLCMD;
 }
 
 /* on ia32 l_start is on a 32-bit boundary */
@@ -1798,9 +882,9 @@ struct space_resv_32 {
 #define FS_IOC_RESVSP64_32	_IOW ('X', 42, struct space_resv_32)
 
 /* just account for different alignment */
-static int compat_ioctl_preallocate(struct file *file, unsigned long arg)
+static int compat_ioctl_preallocate(struct file *file,
+			struct space_resv_32    __user *p32)
 {
-	struct space_resv_32	__user *p32 = compat_ptr(arg);
 	struct space_resv	__user *p = compat_alloc_user_space(sizeof(*p));
 
 	if (copy_in_user(&p->l_type,	&p32->l_type,	sizeof(s16)) ||
@@ -1816,27 +900,13 @@ static int compat_ioctl_preallocate(struct file *file, unsigned long arg)
 }
 #endif
 
+/*
+ * simple reversible transform to make our table more evenly
+ * distributed after sorting.
+ */
+#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)
 
-typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int,
-					unsigned long, struct file *);
-
-struct ioctl_trans {
-	unsigned long cmd;
-	ioctl_trans_handler_t handler;
-	struct ioctl_trans *next;
-};
-
-#define HANDLE_IOCTL(cmd,handler) \
-	{ (cmd), (ioctl_trans_handler_t)(handler) },
-
-/* pointer to compatible structure or no argument */
-#define COMPATIBLE_IOCTL(cmd) \
-	{ (cmd), do_ioctl32_pointer },
-
-/* argument is an unsigned long integer, not a pointer */
-#define ULONG_IOCTL(cmd) \
-	{ (cmd), (ioctl_trans_handler_t)sys_ioctl },
-
+#define COMPATIBLE_IOCTL(cmd) XFORM(cmd),
 /* ioctl should not be warned about even if it's not implemented.
    Valid reasons to use this:
    - It is implemented with ->compat_ioctl on some device, but programs
@@ -1846,7 +916,7 @@ struct ioctl_trans {
    Most other reasons are not valid. */
 #define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd)
 
-static struct ioctl_trans ioctl_start[] = {
+static unsigned int ioctl_pointer[] = {
 /* compatible ioctls first */
 COMPATIBLE_IOCTL(0x4B50)   /* KDGHWCLK - not in the kernel, but don't complain */
 COMPATIBLE_IOCTL(0x4B51)   /* KDSHWCLK - not in the kernel, but don't complain */
@@ -1857,7 +927,6 @@ COMPATIBLE_IOCTL(TCSETA)
 COMPATIBLE_IOCTL(TCSETAW)
 COMPATIBLE_IOCTL(TCSETAF)
 COMPATIBLE_IOCTL(TCSBRK)
-ULONG_IOCTL(TCSBRKP)
 COMPATIBLE_IOCTL(TCXONC)
 COMPATIBLE_IOCTL(TCFLSH)
 COMPATIBLE_IOCTL(TCGETS)
@@ -1867,7 +936,6 @@ COMPATIBLE_IOCTL(TCSETSF)
 COMPATIBLE_IOCTL(TIOCLINUX)
 COMPATIBLE_IOCTL(TIOCSBRK)
 COMPATIBLE_IOCTL(TIOCCBRK)
-ULONG_IOCTL(TIOCMIWAIT)
 COMPATIBLE_IOCTL(TIOCGICOUNT)
 /* Little t */
 COMPATIBLE_IOCTL(TIOCGETD)
@@ -1889,7 +957,6 @@ COMPATIBLE_IOCTL(TIOCSTI)
 COMPATIBLE_IOCTL(TIOCOUTQ)
 COMPATIBLE_IOCTL(TIOCSPGRP)
 COMPATIBLE_IOCTL(TIOCGPGRP)
-ULONG_IOCTL(TIOCSCTTY)
 COMPATIBLE_IOCTL(TIOCGPTN)
 COMPATIBLE_IOCTL(TIOCSPTLCK)
 COMPATIBLE_IOCTL(TIOCSERGETLSR)
@@ -1912,44 +979,11 @@ COMPATIBLE_IOCTL(FIGETBSZ)
 /* 'X' - originally XFS but some now in the VFS */
 COMPATIBLE_IOCTL(FIFREEZE)
 COMPATIBLE_IOCTL(FITHAW)
-/* RAID */
-COMPATIBLE_IOCTL(RAID_VERSION)
-COMPATIBLE_IOCTL(GET_ARRAY_INFO)
-COMPATIBLE_IOCTL(GET_DISK_INFO)
-COMPATIBLE_IOCTL(PRINT_RAID_DEBUG)
-COMPATIBLE_IOCTL(RAID_AUTORUN)
-COMPATIBLE_IOCTL(CLEAR_ARRAY)
-COMPATIBLE_IOCTL(ADD_NEW_DISK)
-ULONG_IOCTL(HOT_REMOVE_DISK)
-COMPATIBLE_IOCTL(SET_ARRAY_INFO)
-COMPATIBLE_IOCTL(SET_DISK_INFO)
-COMPATIBLE_IOCTL(WRITE_RAID_INFO)
-COMPATIBLE_IOCTL(UNPROTECT_ARRAY)
-COMPATIBLE_IOCTL(PROTECT_ARRAY)
-ULONG_IOCTL(HOT_ADD_DISK)
-ULONG_IOCTL(SET_DISK_FAULTY)
-COMPATIBLE_IOCTL(RUN_ARRAY)
-COMPATIBLE_IOCTL(STOP_ARRAY)
-COMPATIBLE_IOCTL(STOP_ARRAY_RO)
-COMPATIBLE_IOCTL(RESTART_ARRAY_RW)
-COMPATIBLE_IOCTL(GET_BITMAP_FILE)
-ULONG_IOCTL(SET_BITMAP_FILE)
-/* Big K */
-COMPATIBLE_IOCTL(PIO_FONT)
-COMPATIBLE_IOCTL(GIO_FONT)
-COMPATIBLE_IOCTL(PIO_CMAP)
-COMPATIBLE_IOCTL(GIO_CMAP)
-ULONG_IOCTL(KDSIGACCEPT)
 COMPATIBLE_IOCTL(KDGETKEYCODE)
 COMPATIBLE_IOCTL(KDSETKEYCODE)
-ULONG_IOCTL(KIOCSOUND)
-ULONG_IOCTL(KDMKTONE)
 COMPATIBLE_IOCTL(KDGKBTYPE)
-ULONG_IOCTL(KDSETMODE)
 COMPATIBLE_IOCTL(KDGETMODE)
-ULONG_IOCTL(KDSKBMODE)
 COMPATIBLE_IOCTL(KDGKBMODE)
-ULONG_IOCTL(KDSKBMETA)
 COMPATIBLE_IOCTL(KDGKBMETA)
 COMPATIBLE_IOCTL(KDGKBENT)
 COMPATIBLE_IOCTL(KDSKBENT)
@@ -1959,15 +993,7 @@ COMPATIBLE_IOCTL(KDGKBDIACR)
 COMPATIBLE_IOCTL(KDSKBDIACR)
 COMPATIBLE_IOCTL(KDKBDREP)
 COMPATIBLE_IOCTL(KDGKBLED)
-ULONG_IOCTL(KDSKBLED)
 COMPATIBLE_IOCTL(KDGETLED)
-ULONG_IOCTL(KDSETLED)
-COMPATIBLE_IOCTL(GIO_SCRNMAP)
-COMPATIBLE_IOCTL(PIO_SCRNMAP)
-COMPATIBLE_IOCTL(GIO_UNISCRNMAP)
-COMPATIBLE_IOCTL(PIO_UNISCRNMAP)
-COMPATIBLE_IOCTL(PIO_FONTRESET)
-COMPATIBLE_IOCTL(PIO_UNIMAPCLR)
 #ifdef CONFIG_BLOCK
 /* Big S */
 COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
@@ -1979,32 +1005,6 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
 COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
 COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
 #endif
-/* Big T */
-COMPATIBLE_IOCTL(TUNSETNOCSUM)
-COMPATIBLE_IOCTL(TUNSETDEBUG)
-COMPATIBLE_IOCTL(TUNSETPERSIST)
-COMPATIBLE_IOCTL(TUNSETOWNER)
-COMPATIBLE_IOCTL(TUNSETLINK)
-COMPATIBLE_IOCTL(TUNSETGROUP)
-COMPATIBLE_IOCTL(TUNGETFEATURES)
-COMPATIBLE_IOCTL(TUNSETOFFLOAD)
-COMPATIBLE_IOCTL(TUNSETTXFILTER)
-COMPATIBLE_IOCTL(TUNGETSNDBUF)
-COMPATIBLE_IOCTL(TUNSETSNDBUF)
-/* Big V */
-COMPATIBLE_IOCTL(VT_SETMODE)
-COMPATIBLE_IOCTL(VT_GETMODE)
-COMPATIBLE_IOCTL(VT_GETSTATE)
-COMPATIBLE_IOCTL(VT_OPENQRY)
-ULONG_IOCTL(VT_ACTIVATE)
-ULONG_IOCTL(VT_WAITACTIVE)
-ULONG_IOCTL(VT_RELDISP)
-ULONG_IOCTL(VT_DISALLOCATE)
-COMPATIBLE_IOCTL(VT_RESIZE)
-COMPATIBLE_IOCTL(VT_RESIZEX)
-COMPATIBLE_IOCTL(VT_LOCKSWITCH)
-COMPATIBLE_IOCTL(VT_UNLOCKSWITCH)
-COMPATIBLE_IOCTL(VT_GETHIFONTMASK)
 /* Little p (/dev/rtc, /dev/envctrl, etc.) */
 COMPATIBLE_IOCTL(RTC_AIE_ON)
 COMPATIBLE_IOCTL(RTC_AIE_OFF)
@@ -2032,36 +1032,13 @@ COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */
 COMPATIBLE_IOCTL(MTIOCTOP)
 /* Socket level stuff */
 COMPATIBLE_IOCTL(FIOQSIZE)
-COMPATIBLE_IOCTL(FIOSETOWN)
-COMPATIBLE_IOCTL(SIOCSPGRP)
-COMPATIBLE_IOCTL(FIOGETOWN)
-COMPATIBLE_IOCTL(SIOCGPGRP)
-COMPATIBLE_IOCTL(SIOCATMARK)
-COMPATIBLE_IOCTL(SIOCSIFLINK)
-COMPATIBLE_IOCTL(SIOCSIFENCAP)
-COMPATIBLE_IOCTL(SIOCGIFENCAP)
-COMPATIBLE_IOCTL(SIOCSIFNAME)
-COMPATIBLE_IOCTL(SIOCSARP)
-COMPATIBLE_IOCTL(SIOCGARP)
-COMPATIBLE_IOCTL(SIOCDARP)
-COMPATIBLE_IOCTL(SIOCSRARP)
-COMPATIBLE_IOCTL(SIOCGRARP)
-COMPATIBLE_IOCTL(SIOCDRARP)
-COMPATIBLE_IOCTL(SIOCADDDLCI)
-COMPATIBLE_IOCTL(SIOCDELDLCI)
-COMPATIBLE_IOCTL(SIOCGMIIPHY)
-COMPATIBLE_IOCTL(SIOCGMIIREG)
-COMPATIBLE_IOCTL(SIOCSMIIREG)
-COMPATIBLE_IOCTL(SIOCGIFVLAN)
-COMPATIBLE_IOCTL(SIOCSIFVLAN)
-COMPATIBLE_IOCTL(SIOCBRADDBR)
-COMPATIBLE_IOCTL(SIOCBRDELBR)
 #ifdef CONFIG_BLOCK
+/* loop */
+IGNORE_IOCTL(LOOP_CLR_FD)
 /* SG stuff */
 COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
 COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
 COMPATIBLE_IOCTL(SG_EMULATED_HOST)
-ULONG_IOCTL(SG_SET_TRANSFORM)
 COMPATIBLE_IOCTL(SG_GET_TRANSFORM)
 COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE)
 COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE)
@@ -2115,8 +1092,6 @@ COMPATIBLE_IOCTL(PPPIOCGCHAN)
 /* PPPOX */
 COMPATIBLE_IOCTL(PPPOEIOCSFWD)
 COMPATIBLE_IOCTL(PPPOEIOCDFWD)
-/* LP */
-COMPATIBLE_IOCTL(LPGETSTATUS)
 /* ppdev */
 COMPATIBLE_IOCTL(PPSETMODE)
 COMPATIBLE_IOCTL(PPRSTATUS)
@@ -2298,8 +1273,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
 COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
 COMPATIBLE_IOCTL(OSS_GETVERSION)
 /* AUTOFS */
-ULONG_IOCTL(AUTOFS_IOC_READY)
-ULONG_IOCTL(AUTOFS_IOC_FAIL)
 COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC)
 COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER)
 COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE)
@@ -2311,22 +1284,6 @@ COMPATIBLE_IOCTL(RAW_SETBIND)
 COMPATIBLE_IOCTL(RAW_GETBIND)
 /* SMB ioctls which do not need any translations */
 COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
-/* Little a */
-COMPATIBLE_IOCTL(ATMSIGD_CTRL)
-COMPATIBLE_IOCTL(ATMARPD_CTRL)
-COMPATIBLE_IOCTL(ATMLEC_CTRL)
-COMPATIBLE_IOCTL(ATMLEC_MCAST)
-COMPATIBLE_IOCTL(ATMLEC_DATA)
-COMPATIBLE_IOCTL(ATM_SETSC)
-COMPATIBLE_IOCTL(SIOCSIFATMTCP)
-COMPATIBLE_IOCTL(SIOCMKCLIP)
-COMPATIBLE_IOCTL(ATMARP_MKIP)
-COMPATIBLE_IOCTL(ATMARP_SETENTRY)
-COMPATIBLE_IOCTL(ATMARP_ENCAP)
-COMPATIBLE_IOCTL(ATMTCP_CREATE)
-COMPATIBLE_IOCTL(ATMTCP_REMOVE)
-COMPATIBLE_IOCTL(ATMMPC_CTRL)
-COMPATIBLE_IOCTL(ATMMPC_DATA)
 /* Watchdog */
 COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
 COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -2408,30 +1365,11 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
 COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
 COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
 COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
-/* USB */
-COMPATIBLE_IOCTL(USBDEVFS_RESETEP)
-COMPATIBLE_IOCTL(USBDEVFS_SETINTERFACE)
-COMPATIBLE_IOCTL(USBDEVFS_SETCONFIGURATION)
-COMPATIBLE_IOCTL(USBDEVFS_GETDRIVER)
-COMPATIBLE_IOCTL(USBDEVFS_DISCARDURB)
-COMPATIBLE_IOCTL(USBDEVFS_CLAIMINTERFACE)
-COMPATIBLE_IOCTL(USBDEVFS_RELEASEINTERFACE)
-COMPATIBLE_IOCTL(USBDEVFS_CONNECTINFO)
-COMPATIBLE_IOCTL(USBDEVFS_HUB_PORTINFO)
-COMPATIBLE_IOCTL(USBDEVFS_RESET)
-COMPATIBLE_IOCTL(USBDEVFS_SUBMITURB32)
-COMPATIBLE_IOCTL(USBDEVFS_REAPURB32)
-COMPATIBLE_IOCTL(USBDEVFS_REAPURBNDELAY32)
-COMPATIBLE_IOCTL(USBDEVFS_CLEAR_HALT)
 /* NBD */
-ULONG_IOCTL(NBD_SET_SOCK)
-ULONG_IOCTL(NBD_SET_BLKSIZE)
-ULONG_IOCTL(NBD_SET_SIZE)
 COMPATIBLE_IOCTL(NBD_DO_IT)
 COMPATIBLE_IOCTL(NBD_CLEAR_SOCK)
 COMPATIBLE_IOCTL(NBD_CLEAR_QUE)
 COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
-ULONG_IOCTL(NBD_SET_SIZE_BLOCKS)
 COMPATIBLE_IOCTL(NBD_DISCONNECT)
 /* i2c */
 COMPATIBLE_IOCTL(I2C_SLAVE)
@@ -2531,131 +1469,13 @@ COMPATIBLE_IOCTL(JSIOCGAXES)
 COMPATIBLE_IOCTL(JSIOCGBUTTONS)
 COMPATIBLE_IOCTL(JSIOCGNAME(0))
 
-/* now things that need handlers */
-#ifdef CONFIG_NET
-HANDLE_IOCTL(SIOCGIFNAME, dev_ifname32)
-HANDLE_IOCTL(SIOCGIFCONF, dev_ifconf)
-HANDLE_IOCTL(SIOCGIFFLAGS, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFFLAGS, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFMETRIC, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFMETRIC, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFMTU, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFMTU, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFMEM, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFMEM, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFHWADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFHWADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCADDMULTI, dev_ifsioc)
-HANDLE_IOCTL(SIOCDELMULTI, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFINDEX, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFMAP, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFMAP, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc)
-HANDLE_IOCTL(SIOCSHWTSTAMP, dev_ifsioc)
-
-/* ioctls used by appletalk ddp.c */
-HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCDIFADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSARP, dev_ifsioc)
-HANDLE_IOCTL(SIOCDARP, dev_ifsioc)
-
-HANDLE_IOCTL(SIOCGIFBRDADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFBRDADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFDSTADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFDSTADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFNETMASK, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFNETMASK, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFPFLAGS, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc)
-HANDLE_IOCTL(TUNSETIFF, dev_ifsioc)
-HANDLE_IOCTL(TUNGETIFF, dev_ifsioc)
-HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl)
-HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDSETHWADDR, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDSLAVEINFOQUERY, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDINFOQUERY, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDCHANGEACTIVE, bond_ioctl)
-HANDLE_IOCTL(SIOCADDRT, routing_ioctl)
-HANDLE_IOCTL(SIOCDELRT, routing_ioctl)
-HANDLE_IOCTL(SIOCBRADDIF, dev_ifsioc)
-HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
-/* Note SIOCRTMSG is no longer, so this is safe and * the user would have seen just an -EINVAL anyways. */
-HANDLE_IOCTL(SIOCRTMSG, ret_einval)
-HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
-HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
-#endif
-#ifdef CONFIG_BLOCK
-HANDLE_IOCTL(SG_IO,sg_ioctl_trans)
-HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans)
-#endif
-HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans)
-HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans)
-HANDLE_IOCTL(PPPIOCSPASS32, ppp_sock_fprog_ioctl_trans)
-HANDLE_IOCTL(PPPIOCSACTIVE32, ppp_sock_fprog_ioctl_trans)
-#ifdef CONFIG_BLOCK
-HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans)
-HANDLE_IOCTL(MTIOCPOS32, mt_ioctl_trans)
-#endif
-#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
-HANDLE_IOCTL(AUTOFS_IOC_SETTIMEOUT32, ioc_settimeout)
-#ifdef CONFIG_VT
-HANDLE_IOCTL(PIO_FONTX, do_fontx_ioctl)
-HANDLE_IOCTL(GIO_FONTX, do_fontx_ioctl)
-HANDLE_IOCTL(PIO_UNIMAP, do_unimap_ioctl)
-HANDLE_IOCTL(GIO_UNIMAP, do_unimap_ioctl)
-HANDLE_IOCTL(KDFONTOP, do_kdfontop_ioctl)
-#endif
-/* One SMB ioctl needs translations. */
-#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
-HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid)
-HANDLE_IOCTL(ATM_GETLINKRATE32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETNAMES32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETTYPE32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETESI32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETADDR32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_RSTADDR32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_ADDADDR32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_DELADDR32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETCIRANGE32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_SETCIRANGE32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_SETESI32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_SETESIF32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETSTAT32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETSTATZ32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETLOOP32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_SETLOOP32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_QUERYLOOP32, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETSTAT, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETSTATZ, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETDIAG, do_atm_ioctl)
-HANDLE_IOCTL(SONET_SETDIAG, do_atm_ioctl)
-HANDLE_IOCTL(SONET_CLRDIAG, do_atm_ioctl)
-HANDLE_IOCTL(SONET_SETFRAMING, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
-/* block stuff */
-#ifdef CONFIG_BLOCK
-/* loop */
-IGNORE_IOCTL(LOOP_CLR_FD)
-/* Raw devices */
-HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
-HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
-#endif
-/* Serial */
-HANDLE_IOCTL(TIOCGSERIAL, serial_struct_ioctl)
-HANDLE_IOCTL(TIOCSSERIAL, serial_struct_ioctl)
 #ifdef TIOCGLTC
 COMPATIBLE_IOCTL(TIOCGLTC)
 COMPATIBLE_IOCTL(TIOCSLTC)
 #endif
 #ifdef TIOCSTART
 /*
- * For these two we have defintions in ioctls.h and/or termios.h on
+ * For these two we have definitions in ioctls.h and/or termios.h on
  * some architectures but no actual implemention.  Some applications
  * like bash call them if they are defined in the headers, so we provide
  * entries here to avoid syslog message spew.
@@ -2663,43 +1483,6 @@ COMPATIBLE_IOCTL(TIOCSLTC)
 COMPATIBLE_IOCTL(TIOCSTART)
 COMPATIBLE_IOCTL(TIOCSTOP)
 #endif
-/* Usbdevfs */
-HANDLE_IOCTL(USBDEVFS_CONTROL32, do_usbdevfs_control)
-HANDLE_IOCTL(USBDEVFS_BULK32, do_usbdevfs_bulk)
-HANDLE_IOCTL(USBDEVFS_DISCSIGNAL32, do_usbdevfs_discsignal)
-COMPATIBLE_IOCTL(USBDEVFS_IOCTL32)
-/* i2c */
-HANDLE_IOCTL(I2C_FUNCS, w_long)
-HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl)
-HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl)
-/* bridge */
-HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
-HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
-/* Not implemented in the native kernel */
-IGNORE_IOCTL(SIOCGIFCOUNT)
-HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl)
-HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl)
-HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl)
-HANDLE_IOCTL(RTC_EPOCH_SET32, rtc_ioctl)
-
-/* dvb */
-HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event)
-HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture)
-HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette)
-
-/* parport */
-COMPATIBLE_IOCTL(LPTIME)
-COMPATIBLE_IOCTL(LPCHAR)
-COMPATIBLE_IOCTL(LPABORTOPEN)
-COMPATIBLE_IOCTL(LPCAREFUL)
-COMPATIBLE_IOCTL(LPWAIT)
-COMPATIBLE_IOCTL(LPSETIRQ)
-COMPATIBLE_IOCTL(LPGETSTATUS)
-COMPATIBLE_IOCTL(LPGETSTATUS)
-COMPATIBLE_IOCTL(LPRESET)
-/*LPGETSTATS not implemented, but no kernels seem to compile it in anyways*/
-COMPATIBLE_IOCTL(LPGETFLAGS)
-HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
 
 /* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
    but we don't want warnings on other file systems. So declare
@@ -2727,12 +1510,110 @@ IGNORE_IOCTL(FBIOGCURSOR32)
 #endif
 };
 
-#define IOCTL_HASHSIZE 256
-static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE];
-
-static inline unsigned long ioctl32_hash(unsigned long cmd)
+/*
+ * Convert common ioctl arguments based on their command number
+ *
+ * Please do not add any code in here. Instead, implement
+ * a compat_ioctl operation in the place that handleѕ the
+ * ioctl for the native case.
+ */
+static long do_ioctl_trans(int fd, unsigned int cmd,
+		 unsigned long arg, struct file *file)
 {
-	return (((cmd >> 6) ^ (cmd >> 4) ^ cmd)) % IOCTL_HASHSIZE;
+	void __user *argp = compat_ptr(arg);
+
+	switch (cmd) {
+	case PPPIOCGIDLE32:
+		return ppp_gidle(fd, cmd, argp);
+	case PPPIOCSCOMPRESS32:
+		return ppp_scompress(fd, cmd, argp);
+	case PPPIOCSPASS32:
+	case PPPIOCSACTIVE32:
+		return ppp_sock_fprog_ioctl_trans(fd, cmd, argp);
+#ifdef CONFIG_BLOCK
+	case SG_IO:
+		return sg_ioctl_trans(fd, cmd, argp);
+	case SG_GET_REQUEST_TABLE:
+		return sg_grt_trans(fd, cmd, argp);
+	case MTIOCGET32:
+	case MTIOCPOS32:
+		return mt_ioctl_trans(fd, cmd, argp);
+	/* Raw devices */
+	case RAW_SETBIND:
+	case RAW_GETBIND:
+		return raw_ioctl(fd, cmd, argp);
+#endif
+#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
+	case AUTOFS_IOC_SETTIMEOUT32:
+		return ioc_settimeout(fd, cmd, argp);
+	/* One SMB ioctl needs translations. */
+#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
+	case SMB_IOC_GETMOUNTUID_32:
+		return do_smb_getmountuid(fd, cmd, argp);
+	/* Serial */
+	case TIOCGSERIAL:
+	case TIOCSSERIAL:
+		return serial_struct_ioctl(fd, cmd, argp);
+	/* i2c */
+	case I2C_FUNCS:
+		return w_long(fd, cmd, argp);
+	case I2C_RDWR:
+		return do_i2c_rdwr_ioctl(fd, cmd, argp);
+	case I2C_SMBUS:
+		return do_i2c_smbus_ioctl(fd, cmd, argp);
+	/* Not implemented in the native kernel */
+	case RTC_IRQP_READ32:
+	case RTC_IRQP_SET32:
+	case RTC_EPOCH_READ32:
+	case RTC_EPOCH_SET32:
+		return rtc_ioctl(fd, cmd, argp);
+
+	/* dvb */
+	case VIDEO_GET_EVENT:
+		return do_video_get_event(fd, cmd, argp);
+	case VIDEO_STILLPICTURE:
+		return do_video_stillpicture(fd, cmd, argp);
+	case VIDEO_SET_SPU_PALETTE:
+		return do_video_set_spu_palette(fd, cmd, argp);
+	}
+
+	/*
+	 * These take an integer instead of a pointer as 'arg',
+	 * so we must not do a compat_ptr() translation.
+	 */
+	switch (cmd) {
+	/* Big T */
+	case TCSBRKP:
+	case TIOCMIWAIT:
+	case TIOCSCTTY:
+	/* RAID */
+	case HOT_REMOVE_DISK:
+	case HOT_ADD_DISK:
+	case SET_DISK_FAULTY:
+	case SET_BITMAP_FILE:
+	/* Big K */
+	case KDSIGACCEPT:
+	case KIOCSOUND:
+	case KDMKTONE:
+	case KDSETMODE:
+	case KDSKBMODE:
+	case KDSKBMETA:
+	case KDSKBLED:
+	case KDSETLED:
+	/* SG stuff */
+	case SG_SET_TRANSFORM:
+	/* AUTOFS */
+	case AUTOFS_IOC_READY:
+	case AUTOFS_IOC_FAIL:
+	/* NBD */
+	case NBD_SET_SOCK:
+	case NBD_SET_BLKSIZE:
+	case NBD_SET_SIZE:
+	case NBD_SET_SIZE_BLOCKS:
+		return do_vfs_ioctl(file, fd, cmd, arg);
+	}
+
+	return -ENOIOCTLCMD;
 }
 
 static void compat_ioctl_error(struct file *filp, unsigned int fd,
@@ -2764,12 +1645,33 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
 		free_page((unsigned long)path);
 }
 
+static int compat_ioctl_check_table(unsigned int xcmd)
+{
+	int i;
+	const int max = ARRAY_SIZE(ioctl_pointer) - 1;
+
+	BUILD_BUG_ON(max >= (1 << 16));
+
+	/* guess initial offset into table, assuming a
+	   normalized distribution */
+	i = ((xcmd >> 16) * max) >> 16;
+
+	/* do linear search up first, until greater or equal */
+	while (ioctl_pointer[i] < xcmd && i < max)
+		i++;
+
+	/* then do linear search down */
+	while (ioctl_pointer[i] > xcmd && i > 0)
+		i--;
+
+	return ioctl_pointer[i] == xcmd;
+}
+
 asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 				unsigned long arg)
 {
 	struct file *filp;
 	int error = -EBADF;
-	struct ioctl_trans *t;
 	int fput_needed;
 
 	filp = fget_light(fd, &fput_needed);
@@ -2797,7 +1699,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 #if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
 	case FS_IOC_RESVSP_32:
 	case FS_IOC_RESVSP64_32:
-		error = compat_ioctl_preallocate(filp, arg);
+		error = compat_ioctl_preallocate(filp, compat_ptr(arg));
 		goto out_fput;
 #else
 	case FS_IOC_RESVSP:
@@ -2826,18 +1728,11 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 		break;
 	}
 
-	for (t = ioctl32_hash_table[ioctl32_hash(cmd)]; t; t = t->next) {
-		if (t->cmd == cmd)
-			goto found_handler;
-	}
+	if (compat_ioctl_check_table(XFORM(cmd)))
+		goto found_handler;
 
-#ifdef CONFIG_NET
-	if (S_ISSOCK(filp->f_path.dentry->d_inode->i_mode) &&
-	    cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
-		error = siocdevprivate_ioctl(fd, cmd, arg);
-	} else
-#endif
-	{
+	error = do_ioctl_trans(fd, cmd, arg, filp);
+	if (error == -ENOIOCTLCMD) {
 		static int count;
 
 		if (++count <= 50)
@@ -2848,13 +1743,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 	goto out_fput;
 
  found_handler:
-	if (t->handler) {
-		lock_kernel();
-		error = t->handler(fd, cmd, arg, filp);
-		unlock_kernel();
-		goto out_fput;
-	}
-
+	arg = (unsigned long)compat_ptr(arg);
  do_ioctl:
 	error = do_vfs_ioctl(filp, fd, cmd, arg);
  out_fput:
@@ -2863,35 +1752,22 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 	return error;
 }
 
-static void ioctl32_insert_translation(struct ioctl_trans *trans)
+static int __init init_sys32_ioctl_cmp(const void *p, const void *q)
 {
-	unsigned long hash;
-	struct ioctl_trans *t;
-
-	hash = ioctl32_hash (trans->cmd);
-	if (!ioctl32_hash_table[hash])
-		ioctl32_hash_table[hash] = trans;
-	else {
-		t = ioctl32_hash_table[hash];
-		while (t->next)
-			t = t->next;
-		trans->next = NULL;
-		t->next = trans;
-	}
+	unsigned int a, b;
+	a = *(unsigned int *)p;
+	b = *(unsigned int *)q;
+	if (a > b)
+		return 1;
+	if (a < b)
+		return -1;
+	return 0;
 }
 
 static int __init init_sys32_ioctl(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(ioctl_start); i++) {
-		if (ioctl_start[i].next) {
-			printk("ioctl translation %d bad\n",i);
-			return -1;
-		}
-
-		ioctl32_insert_translation(&ioctl_start[i]);
-	}
+	sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer),
+		init_sys32_ioctl_cmp, NULL);
 	return 0;
 }
 __initcall(init_sys32_ioctl);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index d22438ef767..b486169f42b 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -32,7 +32,9 @@ static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
 static bool debugfs_registered;
 
-static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev)
+static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev,
+				       void *data, const struct file_operations *fops)
+
 {
 	struct inode *inode = new_inode(sb);
 
@@ -44,14 +46,18 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 			init_special_inode(inode, mode, dev);
 			break;
 		case S_IFREG:
-			inode->i_fop = &debugfs_file_operations;
+			inode->i_fop = fops ? fops : &debugfs_file_operations;
+			inode->i_private = data;
 			break;
 		case S_IFLNK:
 			inode->i_op = &debugfs_link_operations;
+			inode->i_fop = fops;
+			inode->i_private = data;
 			break;
 		case S_IFDIR:
 			inode->i_op = &simple_dir_inode_operations;
-			inode->i_fop = &simple_dir_operations;
+			inode->i_fop = fops ? fops : &simple_dir_operations;
+			inode->i_private = data;
 
 			/* directory inodes start off with i_nlink == 2
 			 * (for "." entry) */
@@ -64,7 +70,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 
 /* SMP-safe */
 static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
-			 int mode, dev_t dev)
+			 int mode, dev_t dev, void *data,
+			 const struct file_operations *fops)
 {
 	struct inode *inode;
 	int error = -EPERM;
@@ -72,7 +79,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (dentry->d_inode)
 		return -EEXIST;
 
-	inode = debugfs_get_inode(dir->i_sb, mode, dev);
+	inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops);
 	if (inode) {
 		d_instantiate(dentry, inode);
 		dget(dentry);
@@ -81,12 +88,13 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
 	return error;
 }
 
-static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode,
+			 void *data, const struct file_operations *fops)
 {
 	int res;
 
 	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
-	res = debugfs_mknod(dir, dentry, mode, 0);
+	res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
 	if (!res) {
 		inc_nlink(dir);
 		fsnotify_mkdir(dir, dentry);
@@ -94,18 +102,20 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return res;
 }
 
-static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode)
+static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode,
+			void *data, const struct file_operations *fops)
 {
 	mode = (mode & S_IALLUGO) | S_IFLNK;
-	return debugfs_mknod(dir, dentry, mode, 0);
+	return debugfs_mknod(dir, dentry, mode, 0, data, fops);
 }
 
-static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode)
+static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode,
+			  void *data, const struct file_operations *fops)
 {
 	int res;
 
 	mode = (mode & S_IALLUGO) | S_IFREG;
-	res = debugfs_mknod(dir, dentry, mode, 0);
+	res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
 	if (!res)
 		fsnotify_create(dir, dentry);
 	return res;
@@ -139,7 +149,9 @@ static struct file_system_type debug_fs_type = {
 
 static int debugfs_create_by_name(const char *name, mode_t mode,
 				  struct dentry *parent,
-				  struct dentry **dentry)
+				  struct dentry **dentry,
+				  void *data,
+				  const struct file_operations *fops)
 {
 	int error = 0;
 
@@ -164,13 +176,16 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
 	if (!IS_ERR(*dentry)) {
 		switch (mode & S_IFMT) {
 		case S_IFDIR:
-			error = debugfs_mkdir(parent->d_inode, *dentry, mode);
+			error = debugfs_mkdir(parent->d_inode, *dentry, mode,
+					      data, fops);
 			break;
 		case S_IFLNK:
-			error = debugfs_link(parent->d_inode, *dentry, mode);
+			error = debugfs_link(parent->d_inode, *dentry, mode,
+					     data, fops);
 			break;
 		default:
-			error = debugfs_create(parent->d_inode, *dentry, mode);
+			error = debugfs_create(parent->d_inode, *dentry, mode,
+					       data, fops);
 			break;
 		}
 		dput(*dentry);
@@ -184,7 +199,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
 /**
  * debugfs_create_file - create a file in the debugfs filesystem
  * @name: a pointer to a string containing the name of the file to create.
- * @mode: the permission that the file should have
+ * @mode: the permission that the file should have.
  * @parent: a pointer to the parent dentry for this file.  This should be a
  *          directory dentry if set.  If this paramater is NULL, then the
  *          file will be created in the root of the debugfs filesystem.
@@ -195,8 +210,8 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
  *        this file.
  *
  * This is the basic "create a file" function for debugfs.  It allows for a
- * wide range of flexibility in createing a file, or a directory (if you
- * want to create a directory, the debugfs_create_dir() function is
+ * wide range of flexibility in creating a file, or a directory (if you want
+ * to create a directory, the debugfs_create_dir() function is
  * recommended to be used instead.)
  *
  * This function will return a pointer to a dentry if it succeeds.  This
@@ -221,19 +236,13 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
 	if (error)
 		goto exit;
 
-	error = debugfs_create_by_name(name, mode, parent, &dentry);
+	error = debugfs_create_by_name(name, mode, parent, &dentry,
+				       data, fops);
 	if (error) {
 		dentry = NULL;
 		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 		goto exit;
 	}
-
-	if (dentry->d_inode) {
-		if (data)
-			dentry->d_inode->i_private = data;
-		if (fops)
-			dentry->d_inode->i_fop = fops;
-	}
 exit:
 	return dentry;
 }
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index d5f8c96964b..8882ecc0f1b 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -517,11 +517,23 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 
 struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
 {
+	struct dentry *dentry;
+	struct tty_struct *tty;
+
 	BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
 
+	/* Ensure dentry has not been deleted by devpts_pty_kill() */
+	dentry = d_find_alias(pts_inode);
+	if (!dentry)
+		return NULL;
+
+	tty = NULL;
 	if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
-		return (struct tty_struct *)pts_inode->i_private;
-	return NULL;
+		tty = (struct tty_struct *)pts_inode->i_private;
+
+	dput(dentry);
+
+	return tty;
 }
 
 void devpts_pty_kill(struct tty_struct *tty)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b10b87dc01..4012885d027 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
  *
  * If blkfactor is zero then the user's request was aligned to the filesystem's
  * blocksize.
- *
- * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
- * This determines whether we need to do the fancy locking which prevents
- * direct-IO from being able to read uninitialised disk blocks.  If its zero
- * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
- * not held for the entire direct write (taken briefly, initially, during a
- * direct read though, but its never held for the duration of a direct-IO).
  */
 
 struct dio {
@@ -68,7 +61,7 @@ struct dio {
 	struct inode *inode;
 	int rw;
 	loff_t i_size;			/* i_size when submitted */
-	int lock_type;			/* doesn't change */
+	int flags;			/* doesn't change */
 	unsigned blkbits;		/* doesn't change */
 	unsigned blkfactor;		/* When we're using an alignment which
 					   is finer than the filesystem's soft
@@ -104,6 +97,18 @@ struct dio {
 	unsigned cur_page_len;		/* Nr of bytes at cur_page_offset */
 	sector_t cur_page_block;	/* Where it starts */
 
+	/* BIO completion state */
+	spinlock_t bio_lock;		/* protects BIO fields below */
+	unsigned long refcount;		/* direct_io_worker() and bios */
+	struct bio *bio_list;		/* singly linked via bi_private */
+	struct task_struct *waiter;	/* waiting task (NULL if none) */
+
+	/* AIO related stuff */
+	struct kiocb *iocb;		/* kiocb */
+	int is_async;			/* is IO async ? */
+	int io_error;			/* IO error in completion path */
+	ssize_t result;                 /* IO result */
+
 	/*
 	 * Page fetching state. These variables belong to dio_refill_pages().
 	 */
@@ -115,22 +120,16 @@ struct dio {
 	 * Page queue.  These variables belong to dio_refill_pages() and
 	 * dio_get_page().
 	 */
-	struct page *pages[DIO_PAGES];	/* page buffer */
 	unsigned head;			/* next page to process */
 	unsigned tail;			/* last valid page + 1 */
 	int page_errors;		/* errno from get_user_pages() */
 
-	/* BIO completion state */
-	spinlock_t bio_lock;		/* protects BIO fields below */
-	unsigned long refcount;		/* direct_io_worker() and bios */
-	struct bio *bio_list;		/* singly linked via bi_private */
-	struct task_struct *waiter;	/* waiting task (NULL if none) */
-
-	/* AIO related stuff */
-	struct kiocb *iocb;		/* kiocb */
-	int is_async;			/* is IO async ? */
-	int io_error;			/* IO error in completion path */
-	ssize_t result;                 /* IO result */
+	/*
+	 * pages[] (and any fields placed after it) are not zeroed out at
+	 * allocation time.  Don't add new fields after pages[] unless you
+	 * wish that they not be zeroed.
+	 */
+	struct page *pages[DIO_PAGES];	/* page buffer */
 };
 
 /*
@@ -240,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
 	if (dio->end_io && dio->result)
 		dio->end_io(dio->iocb, offset, transferred,
 			    dio->map_bh.b_private);
-	if (dio->lock_type == DIO_LOCKING)
+
+	if (dio->flags & DIO_LOCKING)
 		/* lockdep: non-owner release */
 		up_read_non_owner(&dio->inode->i_alloc_sem);
 
@@ -515,21 +515,24 @@ static int get_more_blocks(struct dio *dio)
 		map_bh->b_state = 0;
 		map_bh->b_size = fs_count << dio->inode->i_blkbits;
 
+		/*
+		 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
+		 * forbid block creations: only overwrites are permitted.
+		 * We will return early to the caller once we see an
+		 * unmapped buffer head returned, and the caller will fall
+		 * back to buffered I/O.
+		 *
+		 * Otherwise the decision is left to the get_blocks method,
+		 * which may decide to handle it or also return an unmapped
+		 * buffer head.
+		 */
 		create = dio->rw & WRITE;
-		if (dio->lock_type == DIO_LOCKING) {
+		if (dio->flags & DIO_SKIP_HOLES) {
 			if (dio->block_in_file < (i_size_read(dio->inode) >>
 							dio->blkbits))
 				create = 0;
-		} else if (dio->lock_type == DIO_NO_LOCKING) {
-			create = 0;
 		}
 
-		/*
-		 * For writes inside i_size we forbid block creations: only
-		 * overwrites are permitted.  We fall back to buffered writes
-		 * at a higher level for inside-i_size block-instantiating
-		 * writes.
-		 */
 		ret = (*dio->get_block)(dio->inode, fs_startblk,
 						map_bh, create);
 	}
@@ -1028,9 +1031,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	if (dio->bio)
 		dio_bio_submit(dio);
 
-	/* All IO is now issued, send it on its way */
-	blk_run_address_space(inode->i_mapping);
-
 	/*
 	 * It is possible that, we return short IO due to end of file.
 	 * In that case, we need to release all the pages we got hold on.
@@ -1042,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 * we can let i_mutex go now that its achieved its purpose
 	 * of protecting us from looking up uninitialized blocks.
 	 */
-	if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
+	if (rw == READ && (dio->flags & DIO_LOCKING))
 		mutex_unlock(&dio->inode->i_mutex);
 
 	/*
@@ -1057,8 +1057,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	    ((rw & READ) || (dio->result == dio->size)))
 		ret = -EIOCBQUEUED;
 
-	if (ret != -EIOCBQUEUED)
+	if (ret != -EIOCBQUEUED) {
+		/* All IO is now issued, send it on its way */
+		blk_run_address_space(inode->i_mapping);
 		dio_await_completion(dio);
+	}
 
 	/*
 	 * Sync will always be dropping the final ref and completing the
@@ -1086,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 
 /*
  * This is a library function for use by filesystem drivers.
- * The locking rules are governed by the dio_lock_type parameter.
  *
- * DIO_NO_LOCKING (no locking, for raw block device access)
- * For writes, i_mutex is not held on entry; it is never taken.
+ * The locking rules are governed by the flags parameter:
+ *  - if the flags value contains DIO_LOCKING we use a fancy locking
+ *    scheme for dumb filesystems.
+ *    For writes this function is called under i_mutex and returns with
+ *    i_mutex held, for reads, i_mutex is not held on entry, but it is
+ *    taken and dropped again before returning.
+ *    For reads and writes i_alloc_sem is taken in shared mode and released
+ *    on I/O completion (which may happen asynchronously after returning to
+ *    the caller).
  *
- * DIO_LOCKING (simple locking for regular files)
- * For writes we are called under i_mutex and return with i_mutex held, even
- * though it is internally dropped.
- * For reads, i_mutex is not held on entry, but it is taken and dropped before
- * returning.
- *
- * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
- *	uninitialised data, allowing parallel direct readers and writers)
- * For writes we are called without i_mutex, return without it, never touch it.
- * For reads we are called under i_mutex and return with i_mutex held, even
- * though it may be internally dropped.
- *
- * Additional i_alloc_sem locking requirements described inline below.
+ *  - if the flags value does NOT contain DIO_LOCKING we don't use any
+ *    internal locking but rather rely on the filesystem to synchronize
+ *    direct I/O reads/writes versus each other and truncate.
+ *    For reads and writes both i_mutex and i_alloc_sem are not held on
+ *    entry and are never taken.
  */
 ssize_t
 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-	int dio_lock_type)
+	int flags)
 {
 	int seg;
 	size_t size;
@@ -1120,11 +1121,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
 	struct dio *dio;
-	int release_i_mutex = 0;
-	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		rw = WRITE_ODIRECT;
+		rw = WRITE_ODIRECT_PLUG;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
@@ -1151,48 +1150,41 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 	}
 
-	dio = kzalloc(sizeof(*dio), GFP_KERNEL);
+	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
 	retval = -ENOMEM;
 	if (!dio)
 		goto out;
-
 	/*
-	 * For block device access DIO_NO_LOCKING is used,
-	 *	neither readers nor writers do any locking at all
-	 * For regular files using DIO_LOCKING,
-	 *	readers need to grab i_mutex and i_alloc_sem
-	 *	writers need to grab i_alloc_sem only (i_mutex is already held)
-	 * For regular files using DIO_OWN_LOCKING,
-	 *	neither readers nor writers take any locks here
+	 * Believe it or not, zeroing out the page array caused a .5%
+	 * performance regression in a database benchmark.  So, we take
+	 * care to only zero out what's needed.
 	 */
-	dio->lock_type = dio_lock_type;
-	if (dio_lock_type != DIO_NO_LOCKING) {
+	memset(dio, 0, offsetof(struct dio, pages));
+
+	dio->flags = flags;
+	if (dio->flags & DIO_LOCKING) {
 		/* watch out for a 0 len io from a tricksy fs */
 		if (rw == READ && end > offset) {
-			struct address_space *mapping;
+			struct address_space *mapping =
+					iocb->ki_filp->f_mapping;
 
-			mapping = iocb->ki_filp->f_mapping;
-			if (dio_lock_type != DIO_OWN_LOCKING) {
-				mutex_lock(&inode->i_mutex);
-				release_i_mutex = 1;
-			}
+			/* will be released by direct_io_worker */
+			mutex_lock(&inode->i_mutex);
 
 			retval = filemap_write_and_wait_range(mapping, offset,
 							      end - 1);
 			if (retval) {
+				mutex_unlock(&inode->i_mutex);
 				kfree(dio);
 				goto out;
 			}
-
-			if (dio_lock_type == DIO_OWN_LOCKING) {
-				mutex_unlock(&inode->i_mutex);
-				acquire_i_mutex = 1;
-			}
 		}
 
-		if (dio_lock_type == DIO_LOCKING)
-			/* lockdep: not the owner will release it */
-			down_read_non_owner(&inode->i_alloc_sem);
+		/*
+		 * Will be released at I/O completion, possibly in a
+		 * different thread.
+		 */
+		down_read_non_owner(&inode->i_alloc_sem);
 	}
 
 	/*
@@ -1210,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	/*
 	 * In case of error extending write may have instantiated a few
 	 * blocks outside i_size. Trim these off again for DIO_LOCKING.
-	 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
-	 * it's own meaner.
+	 *
+	 * NOTE: filesystems with their own locking have to handle this
+	 * on their own.
 	 */
-	if (unlikely(retval < 0 && (rw & WRITE))) {
-		loff_t isize = i_size_read(inode);
-
-		if (end > isize && dio_lock_type == DIO_LOCKING)
-			vmtruncate(inode, isize);
+	if (dio->flags & DIO_LOCKING) {
+		if (unlikely((rw & WRITE) && retval < 0)) {
+			loff_t isize = i_size_read(inode);
+			if (end > isize)
+				vmtruncate(inode, isize);
+		}
 	}
 
-	if (rw == READ && dio_lock_type == DIO_LOCKING)
-		release_i_mutex = 0;
-
 out:
-	if (release_i_mutex)
-		mutex_unlock(&inode->i_mutex);
-	else if (acquire_i_mutex)
-		mutex_lock(&inode->i_mutex);
 	return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index fd9859f92fa..0df24385081 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -410,10 +410,10 @@ static struct config_group *make_cluster(struct config_group *g,
 	struct dlm_comms *cms = NULL;
 	void *gps = NULL;
 
-	cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL);
-	gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
-	sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL);
-	cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL);
+	cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
+	gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
+	sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
+	cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
 
 	if (!cl || !gps || !sps || !cms)
 		goto fail;
@@ -482,9 +482,9 @@ static struct config_group *make_space(struct config_group *g, const char *name)
 	struct dlm_nodes *nds = NULL;
 	void *gps = NULL;
 
-	sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL);
-	gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
-	nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL);
+	sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
+	gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
+	nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
 
 	if (!sp || !gps || !nds)
 		goto fail;
@@ -536,7 +536,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
 {
 	struct dlm_comm *cm;
 
-	cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL);
+	cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS);
 	if (!cm)
 		return ERR_PTR(-ENOMEM);
 
@@ -569,7 +569,7 @@ static struct config_item *make_node(struct config_group *g, const char *name)
 	struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
 	struct dlm_node *nd;
 
-	nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
+	nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS);
 	if (!nd)
 		return ERR_PTR(-ENOMEM);
 
@@ -705,7 +705,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
 	if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
 		return -ENOSPC;
 
-	addr = kzalloc(sizeof(*addr), GFP_KERNEL);
+	addr = kzalloc(sizeof(*addr), GFP_NOFS);
 	if (!addr)
 		return -ENOMEM;
 
@@ -868,7 +868,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
 
 	ids_count = sp->members_count;
 
-	ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL);
+	ids = kcalloc(ids_count, sizeof(int), GFP_NOFS);
 	if (!ids) {
 		rv = -ENOMEM;
 		goto out;
@@ -886,7 +886,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
 	if (!new_count)
 		goto out_ids;
 
-	new = kcalloc(new_count, sizeof(int), GFP_KERNEL);
+	new = kcalloc(new_count, sizeof(int), GFP_NOFS);
 	if (!new) {
 		kfree(ids);
 		rv = -ENOMEM;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1c8bb8c3a82..375a2359b3b 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -404,7 +404,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
 	if (bucket >= ls->ls_rsbtbl_size)
 		return NULL;
 
-	ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL);
+	ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS);
 	if (!ri)
 		return NULL;
 	if (n == 0)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index c4dfa1dcc86..7b84c1dbc82 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,8 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
 	spin_unlock(&ls->ls_recover_list_lock);
 
 	if (!found)
-		de = kzalloc(sizeof(struct dlm_direntry) + len,
-			     ls->ls_allocation);
+		de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS);
 	return de;
 }
 
@@ -212,7 +211,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
 
 	dlm_dir_clear(ls);
 
-	last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation);
+	last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS);
 	if (!last_name)
 		goto out;
 
@@ -323,7 +322,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
 	if (namelen > DLM_RESNAME_MAXLEN)
 		return -EINVAL;
 
-	de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation);
+	de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS);
 	if (!de)
 		return -ENOMEM;
 
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d01ca0a711d..826d3dc6e0a 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -473,7 +473,6 @@ struct dlm_ls {
 	int			ls_low_nodeid;
 	int			ls_total_weight;
 	int			*ls_node_array;
-	gfp_t			ls_allocation;
 
 	struct dlm_rsb		ls_stub_rsb;	/* for returning errors */
 	struct dlm_lkb		ls_stub_lkb;	/* for returning errors */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index eb507c453c5..9c0c1db1e10 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2689,7 +2689,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
 	   pass into lowcomms_commit and a message buffer (mb) that we
 	   write our data into */
 
-	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb);
+	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
 	if (!mh)
 		return -ENOBUFS;
 
@@ -4512,7 +4512,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 	}
 
 	if (flags & DLM_LKF_VALBLK) {
-		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
 		if (!ua->lksb.sb_lvbptr) {
 			kfree(ua);
 			__put_lkb(ls, lkb);
@@ -4582,7 +4582,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	ua = lkb->lkb_ua;
 
 	if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
-		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
 		if (!ua->lksb.sb_lvbptr) {
 			error = -ENOMEM;
 			goto out_put;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d489fcc8671..c010ecfc0d2 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -430,7 +430,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
 
 	error = -ENOMEM;
 
-	ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
+	ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS);
 	if (!ls)
 		goto out;
 	memcpy(ls->ls_name, name, namelen);
@@ -443,11 +443,6 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
 	if (flags & DLM_LSFL_TIMEWARN)
 		set_bit(LSFL_TIMEWARN, &ls->ls_flags);
 
-	if (flags & DLM_LSFL_FS)
-		ls->ls_allocation = GFP_NOFS;
-	else
-		ls->ls_allocation = GFP_KERNEL;
-
 	/* ls_exflags are forced to match among nodes, and we don't
 	   need to require all nodes to have some flags set */
 	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
@@ -456,7 +451,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
 	size = dlm_config.ci_rsbtbl_size;
 	ls->ls_rsbtbl_size = size;
 
-	ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
+	ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS);
 	if (!ls->ls_rsbtbl)
 		goto out_lsfree;
 	for (i = 0; i < size; i++) {
@@ -468,7 +463,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
 	size = dlm_config.ci_lkbtbl_size;
 	ls->ls_lkbtbl_size = size;
 
-	ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
+	ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS);
 	if (!ls->ls_lkbtbl)
 		goto out_rsbfree;
 	for (i = 0; i < size; i++) {
@@ -480,7 +475,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
 	size = dlm_config.ci_dirtbl_size;
 	ls->ls_dirtbl_size = size;
 
-	ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
+	ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS);
 	if (!ls->ls_dirtbl)
 		goto out_lkbfree;
 	for (i = 0; i < size; i++) {
@@ -527,7 +522,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
 	mutex_init(&ls->ls_requestqueue_mutex);
 	mutex_init(&ls->ls_clear_proc_locks);
 
-	ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
+	ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
 	if (!ls->ls_recover_buf)
 		goto out_dirfree;
 
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 70736eb4b51..52cab160893 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1060,7 +1060,7 @@ static void init_local(void)
 		if (dlm_our_addr(&sas, i))
 			break;
 
-		addr = kmalloc(sizeof(*addr), GFP_KERNEL);
+		addr = kmalloc(sizeof(*addr), GFP_NOFS);
 		if (!addr)
 			break;
 		memcpy(addr, &sas, sizeof(*addr));
@@ -1099,7 +1099,7 @@ static int sctp_listen_for_all(void)
 	struct sockaddr_storage localaddr;
 	struct sctp_event_subscribe subscribe;
 	int result = -EINVAL, num = 1, i, addr_len;
-	struct connection *con = nodeid2con(0, GFP_KERNEL);
+	struct connection *con = nodeid2con(0, GFP_NOFS);
 	int bufsize = NEEDED_RMEM;
 
 	if (!con)
@@ -1171,7 +1171,7 @@ out:
 static int tcp_listen_for_all(void)
 {
 	struct socket *sock = NULL;
-	struct connection *con = nodeid2con(0, GFP_KERNEL);
+	struct connection *con = nodeid2con(0, GFP_NOFS);
 	int result = -EINVAL;
 
 	if (!con)
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b128775913b..84f70bfb0ba 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -48,7 +48,7 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid)
 	struct dlm_member *memb;
 	int w, error;
 
-	memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
+	memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
 	if (!memb)
 		return -ENOMEM;
 
@@ -143,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls)
 
 	ls->ls_total_weight = total;
 
-	array = kmalloc(sizeof(int) * total, ls->ls_allocation);
+	array = kmalloc(sizeof(int) * total, GFP_NOFS);
 	if (!array)
 		return;
 
@@ -226,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
 			continue;
 		log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
 
-		memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
+		memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
 		if (!memb)
 			return -ENOMEM;
 		memb->nodeid = rv->new[i];
@@ -341,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls)
 	int *ids = NULL, *new = NULL;
 	int error, ids_count = 0, new_count = 0;
 
-	rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation);
+	rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
 	if (!rv)
 		return -ENOMEM;
 
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index c1775b84eba..8e0d00db004 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
 {
 	char *p;
 
-	p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
+	p = kzalloc(ls->ls_lvblen, GFP_NOFS);
 	return p;
 }
 
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
 
 	DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
 
-	r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
+	r = kzalloc(sizeof(*r) + namelen, GFP_NOFS);
 	return r;
 }
 
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb;
 
-	lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
+	lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS);
 	return lkb;
 }
 
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 55ea369f43a..052095cd592 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -26,7 +26,7 @@ static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
 	struct sk_buff *skb;
 	void *data;
 
-	skb = genlmsg_new(size, GFP_KERNEL);
+	skb = genlmsg_new(size, GFP_NOFS);
 	if (!skb)
 		return -ENOMEM;
 
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 16f682e26c0..b5f89aef3b2 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -82,7 +82,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	if (!ls)
 		return -EINVAL;
 
-	xop = kzalloc(sizeof(*xop), GFP_KERNEL);
+	xop = kzalloc(sizeof(*xop), GFP_NOFS);
 	if (!xop) {
 		rv = -ENOMEM;
 		goto out;
@@ -143,7 +143,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(dlm_posix_lock);
 
-/* Returns failure iff a succesful lock operation should be canceled */
+/* Returns failure iff a successful lock operation should be canceled */
 static int dlm_plock_callback(struct plock_op *op)
 {
 	struct file *file;
@@ -211,7 +211,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	if (!ls)
 		return -EINVAL;
 
-	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	op = kzalloc(sizeof(*op), GFP_NOFS);
 	if (!op) {
 		rv = -ENOMEM;
 		goto out;
@@ -266,7 +266,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	if (!ls)
 		return -EINVAL;
 
-	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	op = kzalloc(sizeof(*op), GFP_NOFS);
 	if (!op) {
 		rv = -ENOMEM;
 		goto out;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 67522c268c1..3c83a49a48a 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
 	char *mb;
 	int mb_len = sizeof(struct dlm_rcom) + len;
 
-	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb);
+	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
 	if (!mh) {
 		log_print("create_rcom to %d type %d len %d ENOBUFS",
 			  to_nodeid, type, len);
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 7a2307c0891..a44fa22890e 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
 	struct rq_entry *e;
 	int length = ms->m_header.h_length - sizeof(struct dlm_message);
 
-	e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation);
+	e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS);
 	if (!e) {
 		log_print("dlm_add_requestqueue: out of memory len %d", length);
 		return;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebce994ab0b..e73a4bb572a 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -267,7 +267,7 @@ static int device_user_lock(struct dlm_user_proc *proc,
 		goto out;
 	}
 
-	ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+	ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
 	if (!ua)
 		goto out;
 	ua->proc = proc;
@@ -307,7 +307,7 @@ static int device_user_unlock(struct dlm_user_proc *proc,
 	if (!ls)
 		return -ENOENT;
 
-	ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+	ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
 	if (!ua)
 		goto out;
 	ua->proc = proc;
@@ -352,7 +352,7 @@ static int dlm_device_register(struct dlm_ls *ls, char *name)
 
 	error = -ENOMEM;
 	len = strlen(name) + strlen(name_prefix) + 2;
-	ls->ls_device.name = kzalloc(len, GFP_KERNEL);
+	ls->ls_device.name = kzalloc(len, GFP_NOFS);
 	if (!ls->ls_device.name)
 		goto fail;
 
@@ -520,7 +520,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 #endif
 		return -EINVAL;
 
-	kbuf = kzalloc(count + 1, GFP_KERNEL);
+	kbuf = kzalloc(count + 1, GFP_NOFS);
 	if (!kbuf)
 		return -ENOMEM;
 
@@ -546,7 +546,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 
 		/* add 1 after namelen so that the name string is terminated */
 		kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
-			       GFP_KERNEL);
+			       GFP_NOFS);
 		if (!kbuf) {
 			kfree(k32buf);
 			return -ENOMEM;
@@ -648,7 +648,7 @@ static int device_open(struct inode *inode, struct file *file)
 	if (!ls)
 		return -ENOENT;
 
-	proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
+	proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS);
 	if (!proc) {
 		dlm_put_lockspace(ls);
 		return -ENOMEM;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 085c5c06342..366c503f965 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -251,10 +251,10 @@ ctl_table epoll_table[] = {
 		.data		= &max_user_watches,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 #endif /* CONFIG_SYSCTL */
 
diff --git a/fs/exec.c b/fs/exec.c
index ba112bd4a33..623a5cc3076 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -46,7 +46,6 @@
 #include <linux/proc_fs.h>
 #include <linux/mount.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/syscalls.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
@@ -924,6 +923,15 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
 void set_task_comm(struct task_struct *tsk, char *buf)
 {
 	task_lock(tsk);
+
+	/*
+	 * Threads may access current->comm without holding
+	 * the task lock, so write the string carefully.
+	 * Readers without a lock may see incomplete new
+	 * names but are safe from non-terminating string reads.
+	 */
+	memset(tsk->comm, 0, TASK_COMM_LEN);
+	wmb();
 	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
 	task_unlock(tsk);
 	perf_event_comm(tsk);
@@ -1209,9 +1217,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	retval = security_bprm_check(bprm);
 	if (retval)
 		return retval;
-	retval = ima_bprm_check(bprm);
-	if (retval)
-		return retval;
 
 	/* kernel module loader fixup */
 	/* so we don't try to load run modprobe in kernel space. */
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index cc2d22db119..2d0f757fda3 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,5 @@
 # Kbuild - Gets included from the Kernels Makefile and build system
 #
 
-exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
+exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index c6718e4817f..b1b178e6171 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -49,6 +49,7 @@
 #define EXOFS_MIN_PID   0x10000	/* Smallest partition ID */
 #define EXOFS_OBJ_OFF	0x10000	/* offset for objects */
 #define EXOFS_SUPER_ID	0x10000	/* object ID for on-disk superblock */
+#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
 #define EXOFS_ROOT_ID	0x10002	/* object ID for root directory */
 
 /* exofs Application specific page/attribute */
@@ -78,17 +79,67 @@ enum {
 #define EXOFS_SUPER_MAGIC	0x5DF5
 
 /*
- * The file system control block - stored in an object's data (mainly, the one
- * with ID EXOFS_SUPER_ID).  This is where the in-memory superblock is stored
- * on disk.  Right now it just has a magic value, which is basically a sanity
- * check on our ability to communicate with the object store.
+ * The file system control block - stored in object EXOFS_SUPER_ID's data.
+ * This is where the in-memory superblock is stored on disk.
  */
+enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
 struct exofs_fscb {
 	__le64  s_nextid;	/* Highest object ID used */
-	__le32  s_numfiles;	/* Number of files on fs */
+	__le64  s_numfiles;	/* Number of files on fs */
+	__le32	s_version;	/* == EXOFS_FSCB_VER */
 	__le16  s_magic;	/* Magic signature */
 	__le16  s_newfs;	/* Non-zero if this is a new fs */
-};
+
+	/* From here on it's a static part, only written by mkexofs */
+	__le64	s_dev_table_oid;   /* Resurved, not used */
+	__le64	s_dev_table_count; /* == 0 means no dev_table */
+} __packed;
+
+/*
+ * Describes the raid used in the FS. It is part of the device table.
+ * This here is taken from the pNFS-objects definition. In exofs we
+ * use one raid policy through-out the filesystem. (NOTE: the funny
+ * alignment at begining. We take care of it at exofs_device_table.
+ */
+struct exofs_dt_data_map {
+	__le32	cb_num_comps;
+	__le64	cb_stripe_unit;
+	__le32	cb_group_width;
+	__le32	cb_group_depth;
+	__le32	cb_mirror_cnt;
+	__le32	cb_raid_algorithm;
+} __packed;
+
+/*
+ * This is an osd device information descriptor. It is a single entry in
+ * the exofs device table. It describes an osd target lun which
+ * contains data belonging to this FS. (Same partition_id on all devices)
+ */
+struct exofs_dt_device_info {
+	__le32	systemid_len;
+	u8	systemid[OSD_SYSTEMID_LEN];
+	__le64	long_name_offset;	/* If !0 then offset-in-file */
+	__le32	osdname_len;		/* */
+	u8	osdname[44];		/* Embbeded, Ususally an asci uuid */
+} __packed;
+
+/*
+ * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
+ * It contains the raid used for this multy-device FS and an array of
+ * participating devices.
+ */
+struct exofs_device_table {
+	__le32				dt_version;	/* == EXOFS_DT_VER */
+	struct exofs_dt_data_map	dt_data_map;	/* Raid policy to use */
+
+	/* Resurved space For future use. Total includeing this:
+	 * (8 * sizeof(le64))
+	 */
+	__le64				__Resurved[4];
+
+	__le64				dt_num_devices;	/* Array size */
+	struct exofs_dt_device_info	dt_dev_table[];	/* Array of devices */
+} __packed;
 
 /****************************************************************************
  * inode-related things
@@ -155,22 +206,4 @@ enum {
 	(((name_len) + offsetof(struct exofs_dir_entry, name)  + \
 	  EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
 
-/*************************
- * function declarations *
- *************************/
-/* osd.c                 */
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
-			   const struct osd_obj_id *obj);
-
-int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
-static inline int exofs_check_ok(struct osd_request *or)
-{
-	return exofs_check_ok_resid(or, NULL, NULL);
-}
-int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
-int exofs_async_op(struct osd_request *or,
-	osd_req_done_fn *async_done, void *caller_context, u8 *cred);
-
-int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
-
 #endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 5ec72e020b2..c35fd462398 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -30,13 +30,17 @@
  * along with exofs; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
+#ifndef __EXOFS_H__
+#define __EXOFS_H__
 
 #include <linux/fs.h>
 #include <linux/time.h>
 #include "common.h"
 
-#ifndef __EXOFS_H__
-#define __EXOFS_H__
+/* FIXME: Remove once pnfs hits mainline
+ * #include <linux/exportfs/pnfs_osd_xdr.h>
+ */
+#include "pnfs.h"
 
 #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
 
@@ -55,7 +59,7 @@
  * our extension to the in-memory superblock
  */
 struct exofs_sb_info {
-	struct osd_dev	*s_dev;			/* returned by get_osd_dev    */
+	struct exofs_fscb s_fscb;		/* Written often, pre-allocate*/
 	osd_id		s_pid;			/* partition ID of file system*/
 	int		s_timeout;		/* timeout for OSD operations */
 	uint64_t	s_nextid;		/* highest object ID used     */
@@ -63,7 +67,11 @@ struct exofs_sb_info {
 	spinlock_t	s_next_gen_lock;	/* spinlock for gen # update  */
 	u32		s_next_generation;	/* next gen # to use          */
 	atomic_t	s_curr_pending;		/* number of pending commands */
-	uint8_t		s_cred[OSD_CAP_LEN];	/* all-powerful credential    */
+	uint8_t		s_cred[OSD_CAP_LEN];	/* credential for the fscb    */
+
+	struct pnfs_osd_data_map data_map;	/* Default raid to use        */
+	unsigned	s_numdevs;		/* Num of devices in array    */
+	struct osd_dev	*s_ods[1];		/* Variable length, minimum 1 */
 };
 
 /*
@@ -79,6 +87,50 @@ struct exofs_i_info {
 	struct inode   vfs_inode;          /* normal in-memory inode          */
 };
 
+static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
+{
+	return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
+}
+
+struct exofs_io_state;
+typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
+
+struct exofs_io_state {
+	struct kref		kref;
+
+	void			*private;
+	exofs_io_done_fn	done;
+
+	struct exofs_sb_info	*sbi;
+	struct osd_obj_id	obj;
+	u8			*cred;
+
+	/* Global read/write IO*/
+	loff_t			offset;
+	unsigned long		length;
+	void			*kern_buff;
+	struct bio		*bio;
+
+	/* Attributes */
+	unsigned		in_attr_len;
+	struct osd_attr 	*in_attr;
+	unsigned		out_attr_len;
+	struct osd_attr 	*out_attr;
+
+	/* Variable array of size numdevs */
+	unsigned numdevs;
+	struct exofs_per_dev_state {
+		struct osd_request *or;
+		struct bio *bio;
+	} per_dev[];
+};
+
+static inline unsigned exofs_io_state_size(unsigned numdevs)
+{
+	return sizeof(struct exofs_io_state) +
+		sizeof(struct exofs_per_dev_state) * numdevs;
+}
+
 /*
  * our inode flags
  */
@@ -130,6 +182,42 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
 /*************************
  * function declarations *
  *************************/
+
+/* ios.c */
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
+			   const struct osd_obj_id *obj);
+int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
+		    u64 offset, void *p, unsigned length);
+
+int  exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios);
+void exofs_put_io_state(struct exofs_io_state *ios);
+
+int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
+
+int exofs_sbi_create(struct exofs_io_state *ios);
+int exofs_sbi_remove(struct exofs_io_state *ios);
+int exofs_sbi_write(struct exofs_io_state *ios);
+int exofs_sbi_read(struct exofs_io_state *ios);
+
+int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
+
+int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
+static inline int exofs_oi_write(struct exofs_i_info *oi,
+				 struct exofs_io_state *ios)
+{
+	ios->obj.id = exofs_oi_objno(oi);
+	ios->cred = oi->i_cred;
+	return exofs_sbi_write(ios);
+}
+
+static inline int exofs_oi_read(struct exofs_i_info *oi,
+				struct exofs_io_state *ios)
+{
+	ios->obj.id = exofs_oi_objno(oi);
+	ios->cred = oi->i_cred;
+	return exofs_sbi_read(ios);
+}
+
 /* inode.c               */
 void exofs_truncate(struct inode *inode);
 int exofs_setattr(struct dentry *, struct iattr *);
@@ -169,6 +257,7 @@ extern const struct file_operations exofs_file_operations;
 
 /* inode.c           */
 extern const struct address_space_operations exofs_aops;
+extern const struct osd_attr g_attr_logical_length;
 
 /* namei.c           */
 extern const struct inode_operations exofs_dir_inode_operations;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 6c10f747669..698a8636d39 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,15 +37,18 @@
 
 #include "exofs.h"
 
-#ifdef CONFIG_EXOFS_DEBUG
-#  define EXOFS_DEBUG_OBJ_ISIZE 1
-#endif
+#define EXOFS_DBGMSG2(M...) do {} while (0)
+
+enum { BIO_MAX_PAGES_KMALLOC =
+		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+};
 
 struct page_collect {
 	struct exofs_sb_info *sbi;
 	struct request_queue *req_q;
 	struct inode *inode;
 	unsigned expected_pages;
+	struct exofs_io_state *ios;
 
 	struct bio *bio;
 	unsigned nr_pages;
@@ -54,22 +57,23 @@ struct page_collect {
 };
 
 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
-		struct inode *inode)
+		       struct inode *inode)
 {
 	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
 
 	pcol->sbi = sbi;
-	pcol->req_q = osd_request_queue(sbi->s_dev);
+	/* Create master bios on first Q, later on cloning, each clone will be
+	 * allocated on it's destination Q
+	 */
+	pcol->req_q = osd_request_queue(sbi->s_ods[0]);
 	pcol->inode = inode;
 	pcol->expected_pages = expected_pages;
 
+	pcol->ios = NULL;
 	pcol->bio = NULL;
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
-
-	EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
-		     expected_pages);
 }
 
 static void _pcol_reset(struct page_collect *pcol)
@@ -80,35 +84,49 @@ static void _pcol_reset(struct page_collect *pcol)
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
-	EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
-		     pcol->inode->i_ino, pcol->expected_pages);
+	pcol->ios = NULL;
 
 	/* this is probably the end of the loop but in writes
 	 * it might not end here. don't be left with nothing
 	 */
 	if (!pcol->expected_pages)
-		pcol->expected_pages = 128;
+		pcol->expected_pages = BIO_MAX_PAGES_KMALLOC;
 }
 
 static int pcol_try_alloc(struct page_collect *pcol)
 {
-	int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
+	int pages = min_t(unsigned, pcol->expected_pages,
+			  BIO_MAX_PAGES_KMALLOC);
+
+	if (!pcol->ios) { /* First time allocate io_state */
+		int ret = exofs_get_io_state(pcol->sbi, &pcol->ios);
+
+		if (ret)
+			return ret;
+	}
 
 	for (; pages; pages >>= 1) {
-		pcol->bio = bio_alloc(GFP_KERNEL, pages);
+		pcol->bio = bio_kmalloc(GFP_KERNEL, pages);
 		if (likely(pcol->bio))
 			return 0;
 	}
 
-	EXOFS_ERR("Failed to kcalloc expected_pages=%u\n",
+	EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n",
 		  pcol->expected_pages);
 	return -ENOMEM;
 }
 
 static void pcol_free(struct page_collect *pcol)
 {
-	bio_put(pcol->bio);
-	pcol->bio = NULL;
+	if (pcol->bio) {
+		bio_put(pcol->bio);
+		pcol->bio = NULL;
+	}
+
+	if (pcol->ios) {
+		exofs_put_io_state(pcol->ios);
+		pcol->ios = NULL;
+	}
 }
 
 static int pcol_add_page(struct page_collect *pcol, struct page *page,
@@ -161,22 +179,17 @@ static void update_write_page(struct page *page, int ret)
 /* Called at the end of reads, to optionally unlock pages and update their
  * status.
  */
-static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
-			    bool do_unlock)
+static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 {
 	struct bio_vec *bvec;
 	int i;
 	u64 resid;
 	u64 good_bytes;
 	u64 length = 0;
-	int ret = exofs_check_ok_resid(or, &resid, NULL);
-
-	osd_end_request(or);
+	int ret = exofs_check_io(pcol->ios, &resid);
 
 	if (likely(!ret))
 		good_bytes = pcol->length;
-	else if (!resid)
-		good_bytes = 0;
 	else
 		good_bytes = pcol->length - resid;
 
@@ -198,7 +211,7 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
 		else
 			page_stat = ret;
 
-		EXOFS_DBGMSG("    readpages_done(0x%lx, 0x%lx) %s\n",
+		EXOFS_DBGMSG2("    readpages_done(0x%lx, 0x%lx) %s\n",
 			  inode->i_ino, page->index,
 			  page_stat ? "bad_bytes" : "good_bytes");
 
@@ -214,13 +227,13 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
 }
 
 /* callback of async reads */
-static void readpages_done(struct osd_request *or, void *p)
+static void readpages_done(struct exofs_io_state *ios, void *p)
 {
 	struct page_collect *pcol = p;
 
-	__readpages_done(or, pcol, true);
+	__readpages_done(pcol, true);
 	atomic_dec(&pcol->sbi->s_curr_pending);
-	kfree(p);
+	kfree(pcol);
 }
 
 static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
@@ -238,17 +251,13 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
 
 		unlock_page(page);
 	}
-	pcol_free(pcol);
 }
 
 static int read_exec(struct page_collect *pcol, bool is_sync)
 {
 	struct exofs_i_info *oi = exofs_i(pcol->inode);
-	struct osd_obj_id obj = {pcol->sbi->s_pid,
-					pcol->inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or = NULL;
+	struct exofs_io_state *ios = pcol->ios;
 	struct page_collect *pcol_copy = NULL;
-	loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
 	int ret;
 
 	if (!pcol->bio)
@@ -257,17 +266,13 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 	/* see comment in _readpage() about sync reads */
 	WARN_ON(is_sync && (pcol->nr_pages != 1));
 
-	or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	osd_req_read(or, &obj, i_start, pcol->bio, pcol->length);
+	ios->bio = pcol->bio;
+	ios->length = pcol->length;
+	ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
 
 	if (is_sync) {
-		exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
-		return __readpages_done(or, pcol, false);
+		exofs_oi_read(oi, pcol->ios);
+		return __readpages_done(pcol, false);
 	}
 
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -277,14 +282,16 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 	}
 
 	*pcol_copy = *pcol;
-	ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
+	ios->done = readpages_done;
+	ios->private = pcol_copy;
+	ret = exofs_oi_read(oi, ios);
 	if (unlikely(ret))
 		goto err;
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
 
 	EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
-		  obj.id, _LLU(i_start), pcol->length);
+		  ios->obj.id, _LLU(ios->offset), pcol->length);
 
 	/* pages ownership was passed to pcol_copy */
 	_pcol_reset(pcol);
@@ -293,12 +300,10 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 err:
 	if (!is_sync)
 		_unlock_pcol_pages(pcol, ret, READ);
-	else /* Pages unlocked by caller in sync mode only free bio */
-		pcol_free(pcol);
+
+	pcol_free(pcol);
 
 	kfree(pcol_copy);
-	if (or)
-		osd_end_request(or);
 	return ret;
 }
 
@@ -370,12 +375,12 @@ try_again:
 	if (len != PAGE_CACHE_SIZE)
 		zero_user(page, len, PAGE_CACHE_SIZE - len);
 
-	EXOFS_DBGMSG("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+	EXOFS_DBGMSG2("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
 		     inode->i_ino, page->index, len);
 
 	ret = pcol_add_page(pcol, page, len);
 	if (ret) {
-		EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
+		EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
 			  "this_len=0x%zx nr_pages=%u length=0x%lx\n",
 			  page, len, pcol->nr_pages, pcol->length);
 
@@ -419,9 +424,8 @@ static int _readpage(struct page *page, bool is_sync)
 
 	_pcol_init(&pcol, 1, page->mapping->host);
 
-	/* readpage_strip might call read_exec(,async) inside at several places
-	 * but this is safe for is_async=0 since read_exec will not do anything
-	 * when we have a single page.
+	/* readpage_strip might call read_exec(,is_sync==false) at several
+	 * places but not if we have a single page.
 	 */
 	ret = readpage_strip(&pcol, page);
 	if (ret) {
@@ -440,8 +444,8 @@ static int exofs_readpage(struct file *file, struct page *page)
 	return _readpage(page, false);
 }
 
-/* Callback for osd_write. All writes are asynchronouse */
-static void writepages_done(struct osd_request *or, void *p)
+/* Callback for osd_write. All writes are asynchronous */
+static void writepages_done(struct exofs_io_state *ios, void *p)
 {
 	struct page_collect *pcol = p;
 	struct bio_vec *bvec;
@@ -449,16 +453,12 @@ static void writepages_done(struct osd_request *or, void *p)
 	u64 resid;
 	u64  good_bytes;
 	u64  length = 0;
+	int ret = exofs_check_io(ios, &resid);
 
-	int ret = exofs_check_ok_resid(or, NULL, &resid);
-
-	osd_end_request(or);
 	atomic_dec(&pcol->sbi->s_curr_pending);
 
 	if (likely(!ret))
 		good_bytes = pcol->length;
-	else if (!resid)
-		good_bytes = 0;
 	else
 		good_bytes = pcol->length - resid;
 
@@ -482,7 +482,7 @@ static void writepages_done(struct osd_request *or, void *p)
 
 		update_write_page(page, page_stat);
 		unlock_page(page);
-		EXOFS_DBGMSG("    writepages_done(0x%lx, 0x%lx) status=%d\n",
+		EXOFS_DBGMSG2("    writepages_done(0x%lx, 0x%lx) status=%d\n",
 			     inode->i_ino, page->index, page_stat);
 
 		length += bvec->bv_len;
@@ -496,23 +496,13 @@ static void writepages_done(struct osd_request *or, void *p)
 static int write_exec(struct page_collect *pcol)
 {
 	struct exofs_i_info *oi = exofs_i(pcol->inode);
-	struct osd_obj_id obj = {pcol->sbi->s_pid,
-					pcol->inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or = NULL;
+	struct exofs_io_state *ios = pcol->ios;
 	struct page_collect *pcol_copy = NULL;
-	loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
 	int ret;
 
 	if (!pcol->bio)
 		return 0;
 
-	or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("write_exec: Faild to osd_start_request()\n");
-		ret = -ENOMEM;
-		goto err;
-	}
-
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
 	if (!pcol_copy) {
 		EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
@@ -523,16 +513,22 @@ static int write_exec(struct page_collect *pcol)
 	*pcol_copy = *pcol;
 
 	pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
-	osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length);
-	ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
+
+	ios->bio = pcol_copy->bio;
+	ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
+	ios->length = pcol_copy->length;
+	ios->done = writepages_done;
+	ios->private = pcol_copy;
+
+	ret = exofs_oi_write(oi, ios);
 	if (unlikely(ret)) {
-		EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
+		EXOFS_ERR("write_exec: exofs_oi_write() Faild\n");
 		goto err;
 	}
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
 	EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
-		  pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
+		  pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
 		  pcol->length);
 	/* pages ownership was passed to pcol_copy */
 	_pcol_reset(pcol);
@@ -540,9 +536,9 @@ static int write_exec(struct page_collect *pcol)
 
 err:
 	_unlock_pcol_pages(pcol, ret, WRITE);
+	pcol_free(pcol);
 	kfree(pcol_copy);
-	if (or)
-		osd_end_request(or);
+
 	return ret;
 }
 
@@ -586,6 +582,9 @@ static int writepage_strip(struct page *page,
 			if (PageError(page))
 				ClearPageError(page);
 			unlock_page(page);
+			EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
+				     "outside the limits\n",
+				     inode->i_ino, page->index);
 			return 0;
 		}
 	}
@@ -600,6 +599,9 @@ try_again:
 		ret = write_exec(pcol);
 		if (unlikely(ret))
 			goto fail;
+
+		EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
+			     inode->i_ino, page->index);
 		goto try_again;
 	}
 
@@ -609,7 +611,7 @@ try_again:
 			goto fail;
 	}
 
-	EXOFS_DBGMSG("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+	EXOFS_DBGMSG2("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
 		     inode->i_ino, page->index, len);
 
 	ret = pcol_add_page(pcol, page, len);
@@ -634,6 +636,8 @@ try_again:
 	return 0;
 
 fail:
+	EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
+		     inode->i_ino, page->index, ret);
 	set_bit(AS_EIO, &page->mapping->flags);
 	unlock_page(page);
 	return ret;
@@ -652,14 +656,17 @@ static int exofs_writepages(struct address_space *mapping,
 			wbc->range_end >> PAGE_CACHE_SHIFT;
 
 	if (start || end)
-		expected_pages = min(end - start + 1, 32L);
+		expected_pages = end - start + 1;
 	else
 		expected_pages = mapping->nrpages;
 
-	EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
-		     " m->nrpages=%lu start=0x%lx end=0x%lx\n",
+	if (expected_pages < 32L)
+		expected_pages = 32L;
+
+	EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
+		     "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
 		     mapping->host->i_ino, wbc->range_start, wbc->range_end,
-		     mapping->nrpages, start, end);
+		     mapping->nrpages, start, end, expected_pages);
 
 	_pcol_init(&pcol, expected_pages, mapping->host);
 
@@ -771,19 +778,28 @@ static int exofs_get_block(struct inode *inode, sector_t iblock,
 const struct osd_attr g_attr_logical_length = ATTR_DEF(
 	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
 
+static int _do_truncate(struct inode *inode)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	loff_t isize = i_size_read(inode);
+	int ret;
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
+
+	ret = exofs_oi_truncate(oi, (u64)isize);
+	EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize);
+	return ret;
+}
+
 /*
  * Truncate a file to the specified size - all we have to do is set the size
  * attribute.  We make sure the object exists first.
  */
 void exofs_truncate(struct inode *inode)
 {
-	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
 	struct exofs_i_info *oi = exofs_i(inode);
-	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or;
-	struct osd_attr attr;
-	loff_t isize = i_size_read(inode);
-	__be64 newsize;
 	int ret;
 
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
@@ -793,22 +809,6 @@ void exofs_truncate(struct inode *inode)
 		return;
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return;
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-
-	nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
-
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
-		goto fail;
-	}
-
-	osd_req_set_attributes(or, &obj);
-
-	newsize = cpu_to_be64((u64)isize);
-	attr = g_attr_logical_length;
-	attr.val_ptr = &newsize;
-	osd_req_add_set_attr_list(or, &attr, 1);
 
 	/* if we are about to truncate an object, and it hasn't been
 	 * created yet, wait
@@ -816,8 +816,7 @@ void exofs_truncate(struct inode *inode)
 	if (unlikely(wait_obj_created(oi)))
 		goto fail;
 
-	ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
-	osd_end_request(or);
+	ret = _do_truncate(inode);
 	if (ret)
 		goto fail;
 
@@ -847,65 +846,62 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
 
 /*
  * Read an inode from the OSD, and return it as is.  We also return the size
- * attribute in the 'sanity' argument if we got compiled with debugging turned
- * on.
+ * attribute in the 'obj_size' argument.
  */
 static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
-		    struct exofs_fcb *inode, uint64_t *sanity)
+		    struct exofs_fcb *inode, uint64_t *obj_size)
 {
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_request *or;
-	struct osd_attr attr;
-	struct osd_obj_id obj = {sbi->s_pid,
-				 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
+	struct osd_attr attrs[2];
+	struct exofs_io_state *ios;
 	int ret;
 
-	exofs_make_credential(oi->i_cred, &obj);
-
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
-		return -ENOMEM;
+	*obj_size = ~0;
+	ret = exofs_get_io_state(sbi, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
+		return ret;
 	}
-	osd_req_get_attributes(or, &obj);
 
-	/* we need the inode attribute */
-	osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
+	ios->obj.id = exofs_oi_objno(oi);
+	exofs_make_credential(oi->i_cred, &ios->obj);
+	ios->cred = oi->i_cred;
 
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
-	/* we get the size attributes to do a sanity check */
-	osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
-#endif
+	attrs[0] = g_attr_inode_data;
+	attrs[1] = g_attr_logical_length;
+	ios->in_attr = attrs;
+	ios->in_attr_len = ARRAY_SIZE(attrs);
 
-	ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+	ret = exofs_sbi_read(ios);
 	if (ret)
 		goto out;
 
-	attr = g_attr_inode_data;
-	ret = extract_attr_from_req(or, &attr);
+	ret = extract_attr_from_ios(ios, &attrs[0]);
 	if (ret) {
-		EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
+		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
 		goto out;
 	}
+	WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
+	memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
 
-	WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
-	memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
-
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
-	attr = g_attr_logical_length;
-	ret = extract_attr_from_req(or, &attr);
+	ret = extract_attr_from_ios(ios, &attrs[1]);
 	if (ret) {
-		EXOFS_ERR("ERROR: extract attr from or failed\n");
+		EXOFS_ERR("%s: extract_attr of logical_length failed\n",
+			  __func__);
 		goto out;
 	}
-	*sanity = get_unaligned_be64(attr.val_ptr);
-#endif
+	*obj_size = get_unaligned_be64(attrs[1].val_ptr);
 
 out:
-	osd_end_request(or);
+	exofs_put_io_state(ios);
 	return ret;
 }
 
+static void __oi_init(struct exofs_i_info *oi)
+{
+	init_waitqueue_head(&oi->i_wq);
+	oi->i_flags = 0;
+}
 /*
  * Fill in an inode read from the OSD and set it up for use
  */
@@ -914,7 +910,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	struct exofs_i_info *oi;
 	struct exofs_fcb fcb;
 	struct inode *inode;
-	uint64_t uninitialized_var(sanity);
+	uint64_t obj_size;
 	int ret;
 
 	inode = iget_locked(sb, ino);
@@ -923,13 +919,13 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	if (!(inode->i_state & I_NEW))
 		return inode;
 	oi = exofs_i(inode);
+	__oi_init(oi);
 
 	/* read the inode from the osd */
-	ret = exofs_get_inode(sb, oi, &fcb, &sanity);
+	ret = exofs_get_inode(sb, oi, &fcb, &obj_size);
 	if (ret)
 		goto bad_inode;
 
-	init_waitqueue_head(&oi->i_wq);
 	set_obj_created(oi);
 
 	/* copy stuff from on-disk struct to in-memory struct */
@@ -947,14 +943,12 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	inode->i_blkbits = EXOFS_BLKSHIFT;
 	inode->i_generation = le32_to_cpu(fcb.i_generation);
 
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
-	if ((inode->i_size != sanity) &&
+	if ((inode->i_size != obj_size) &&
 		(!exofs_inode_is_fast_symlink(inode))) {
-		EXOFS_ERR("WARNING: Size of object from inode and "
-			  "attributes differ (%lld != %llu)\n",
-			  inode->i_size, _LLU(sanity));
+		EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
+			  inode->i_size, _LLU(obj_size));
+		/* FIXME: call exofs_inode_recovery() */
 	}
-#endif
 
 	oi->i_dir_start_lookup = 0;
 
@@ -1020,23 +1014,30 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
  * set the obj_created flag so that other methods know that the object exists on
  * the OSD.
  */
-static void create_done(struct osd_request *or, void *p)
+static void create_done(struct exofs_io_state *ios, void *p)
 {
 	struct inode *inode = p;
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
 	int ret;
 
-	ret = exofs_check_ok(or);
-	osd_end_request(or);
+	ret = exofs_check_io(ios, NULL);
+	exofs_put_io_state(ios);
+
 	atomic_dec(&sbi->s_curr_pending);
 
 	if (unlikely(ret)) {
 		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
-			  _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF));
-		make_bad_inode(inode);
-	} else
-		set_obj_created(oi);
+			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid));
+		/*TODO: When FS is corrupted creation can fail, object already
+		 * exist. Get rid of this asynchronous creation, if exist
+		 * increment the obj counter and try the next object. Until we
+		 * succeed. All these dangling objects will be made into lost
+		 * files by chkfs.exofs
+		 */
+	}
+
+	set_obj_created(oi);
 
 	atomic_dec(&inode->i_count);
 	wake_up(&oi->i_wq);
@@ -1051,8 +1052,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	struct inode *inode;
 	struct exofs_i_info *oi;
 	struct exofs_sb_info *sbi;
-	struct osd_request *or;
-	struct osd_obj_id obj;
+	struct exofs_io_state *ios;
 	int ret;
 
 	sb = dir->i_sb;
@@ -1061,8 +1061,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 		return ERR_PTR(-ENOMEM);
 
 	oi = exofs_i(inode);
+	__oi_init(oi);
 
-	init_waitqueue_head(&oi->i_wq);
 	set_obj_2bcreated(oi);
 
 	sbi = sb->s_fs_info;
@@ -1089,28 +1089,28 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	mark_inode_dirty(inode);
 
-	obj.partition = sbi->s_pid;
-	obj.id = inode->i_ino + EXOFS_OBJ_OFF;
-	exofs_make_credential(oi->i_cred, &obj);
-
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
-		return ERR_PTR(-ENOMEM);
+	ret = exofs_get_io_state(sbi, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
+		return ERR_PTR(ret);
 	}
 
-	osd_req_create_object(or, &obj);
+	ios->obj.id = exofs_oi_objno(oi);
+	exofs_make_credential(oi->i_cred, &ios->obj);
 
 	/* increment the refcount so that the inode will still be around when we
 	 * reach the callback
 	 */
 	atomic_inc(&inode->i_count);
 
-	ret = exofs_async_op(or, create_done, inode, oi->i_cred);
+	ios->done = create_done;
+	ios->private = inode;
+	ios->cred = oi->i_cred;
+	ret = exofs_sbi_create(ios);
 	if (ret) {
 		atomic_dec(&inode->i_count);
-		osd_end_request(or);
-		return ERR_PTR(-EIO);
+		exofs_put_io_state(ios);
+		return ERR_PTR(ret);
 	}
 	atomic_inc(&sbi->s_curr_pending);
 
@@ -1128,11 +1128,11 @@ struct updatei_args {
 /*
  * Callback function from exofs_update_inode().
  */
-static void updatei_done(struct osd_request *or, void *p)
+static void updatei_done(struct exofs_io_state *ios, void *p)
 {
 	struct updatei_args *args = p;
 
-	osd_end_request(or);
+	exofs_put_io_state(ios);
 
 	atomic_dec(&args->sbi->s_curr_pending);
 
@@ -1148,8 +1148,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct super_block *sb = inode->i_sb;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or;
+	struct exofs_io_state *ios;
 	struct osd_attr attr;
 	struct exofs_fcb *fcb;
 	struct updatei_args *args;
@@ -1186,18 +1185,16 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	} else
 		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
 
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
-		ret = -ENOMEM;
+	ret = exofs_get_io_state(sbi, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 		goto free_args;
 	}
 
-	osd_req_set_attributes(or, &obj);
-
 	attr = g_attr_inode_data;
 	attr.val_ptr = fcb;
-	osd_req_add_set_attr_list(or, &attr, 1);
+	ios->out_attr_len = 1;
+	ios->out_attr = &attr;
 
 	if (!obj_created(oi)) {
 		EXOFS_DBGMSG("!obj_created\n");
@@ -1206,22 +1203,19 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 		EXOFS_DBGMSG("wait_event done\n");
 	}
 
-	if (do_sync) {
-		ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
-		osd_end_request(or);
-		goto free_args;
-	} else {
+	if (!do_sync) {
 		args->sbi = sbi;
+		ios->done = updatei_done;
+		ios->private = args;
+	}
 
-		ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
-		if (ret) {
-			osd_end_request(or);
-			goto free_args;
-		}
+	ret = exofs_oi_write(oi, ios);
+	if (!do_sync && !ret) {
 		atomic_inc(&sbi->s_curr_pending);
 		goto out; /* deallocation in updatei_done */
 	}
 
+	exofs_put_io_state(ios);
 free_args:
 	kfree(args);
 out:
@@ -1238,11 +1232,12 @@ int exofs_write_inode(struct inode *inode, int wait)
  * Callback function from exofs_delete_inode() - don't have much cleaning up to
  * do.
  */
-static void delete_done(struct osd_request *or, void *p)
+static void delete_done(struct exofs_io_state *ios, void *p)
 {
-	struct exofs_sb_info *sbi;
-	osd_end_request(or);
-	sbi = p;
+	struct exofs_sb_info *sbi = p;
+
+	exofs_put_io_state(ios);
+
 	atomic_dec(&sbi->s_curr_pending);
 }
 
@@ -1256,8 +1251,7 @@ void exofs_delete_inode(struct inode *inode)
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct super_block *sb = inode->i_sb;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or;
+	struct exofs_io_state *ios;
 	int ret;
 
 	truncate_inode_pages(&inode->i_data, 0);
@@ -1274,25 +1268,26 @@ void exofs_delete_inode(struct inode *inode)
 
 	clear_inode(inode);
 
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
+	ret = exofs_get_io_state(sbi, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
 		return;
 	}
 
-	osd_req_remove_object(or, &obj);
-
 	/* if we are deleting an obj that hasn't been created yet, wait */
 	if (!obj_created(oi)) {
 		BUG_ON(!obj_2bcreated(oi));
 		wait_event(oi->i_wq, obj_created(oi));
 	}
 
-	ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
+	ios->obj.id = exofs_oi_objno(oi);
+	ios->done = delete_done;
+	ios->private = sbi;
+	ios->cred = oi->i_cred;
+	ret = exofs_sbi_remove(ios);
 	if (ret) {
-		EXOFS_ERR(
-		       "ERROR: @exofs_delete_inode exofs_async_op failed\n");
-		osd_end_request(or);
+		EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__);
+		exofs_put_io_state(ios);
 		return;
 	}
 	atomic_inc(&sbi->s_curr_pending);
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
new file mode 100644
index 00000000000..5bad01fa1f9
--- /dev/null
+++ b/fs/exofs/ios.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <scsi/scsi_device.h>
+
+#include "exofs.h"
+
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
+{
+	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
+		    u64 offset, void *p, unsigned length)
+{
+	struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+/*	struct osd_sense_info osi = {.key = 0};*/
+	int ret;
+
+	if (unlikely(!or)) {
+		EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
+		return -ENOMEM;
+	}
+	ret = osd_req_read_kern(or, obj, offset, p, length);
+	if (unlikely(ret)) {
+		EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
+		goto out;
+	}
+
+	ret = osd_finalize_request(or, 0, cred, NULL);
+	if (unlikely(ret)) {
+		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+		goto out;
+	}
+
+	ret = osd_execute_request(or);
+	if (unlikely(ret))
+		EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
+	/* osd_req_decode_sense(or, ret); */
+
+out:
+	osd_end_request(or);
+	return ret;
+}
+
+int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
+{
+	struct exofs_io_state *ios;
+
+	/*TODO: Maybe use kmem_cach per sbi of size
+	 * exofs_io_state_size(sbi->s_numdevs)
+	 */
+	ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
+	if (unlikely(!ios)) {
+		*pios = NULL;
+		return -ENOMEM;
+	}
+
+	ios->sbi = sbi;
+	ios->obj.partition = sbi->s_pid;
+	*pios = ios;
+	return 0;
+}
+
+void exofs_put_io_state(struct exofs_io_state *ios)
+{
+	if (ios) {
+		unsigned i;
+
+		for (i = 0; i < ios->numdevs; i++) {
+			struct exofs_per_dev_state *per_dev = &ios->per_dev[i];
+
+			if (per_dev->or)
+				osd_end_request(per_dev->or);
+			if (per_dev->bio)
+				bio_put(per_dev->bio);
+		}
+
+		kfree(ios);
+	}
+}
+
+static void _sync_done(struct exofs_io_state *ios, void *p)
+{
+	struct completion *waiting = p;
+
+	complete(waiting);
+}
+
+static void _last_io(struct kref *kref)
+{
+	struct exofs_io_state *ios = container_of(
+					kref, struct exofs_io_state, kref);
+
+	ios->done(ios, ios->private);
+}
+
+static void _done_io(struct osd_request *or, void *p)
+{
+	struct exofs_io_state *ios = p;
+
+	kref_put(&ios->kref, _last_io);
+}
+
+static int exofs_io_execute(struct exofs_io_state *ios)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	bool sync = (ios->done == NULL);
+	int i, ret;
+
+	if (sync) {
+		ios->done = _sync_done;
+		ios->private = &wait;
+	}
+
+	for (i = 0; i < ios->numdevs; i++) {
+		struct osd_request *or = ios->per_dev[i].or;
+		if (unlikely(!or))
+			continue;
+
+		ret = osd_finalize_request(or, 0, ios->cred, NULL);
+		if (unlikely(ret)) {
+			EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
+				     ret);
+			return ret;
+		}
+	}
+
+	kref_init(&ios->kref);
+
+	for (i = 0; i < ios->numdevs; i++) {
+		struct osd_request *or = ios->per_dev[i].or;
+		if (unlikely(!or))
+			continue;
+
+		kref_get(&ios->kref);
+		osd_execute_request_async(or, _done_io, ios);
+	}
+
+	kref_put(&ios->kref, _last_io);
+	ret = 0;
+
+	if (sync) {
+		wait_for_completion(&wait);
+		ret = exofs_check_io(ios, NULL);
+	}
+	return ret;
+}
+
+int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
+{
+	enum osd_err_priority acumulated_osd_err = 0;
+	int acumulated_lin_err = 0;
+	int i;
+
+	for (i = 0; i < ios->numdevs; i++) {
+		struct osd_sense_info osi;
+		int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi);
+
+		if (likely(!ret))
+			continue;
+
+		if (unlikely(ret == -EFAULT)) {
+			EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__);
+			/*FIXME: All the pages in this device range should:
+			 *	clear_highpage(page);
+			 */
+		}
+
+		if (osi.osd_err_pri >= acumulated_osd_err) {
+			acumulated_osd_err = osi.osd_err_pri;
+			acumulated_lin_err = ret;
+		}
+	}
+
+	/* TODO: raid specific residual calculations */
+	if (resid) {
+		if (likely(!acumulated_lin_err))
+			*resid = 0;
+		else
+			*resid = ios->length;
+	}
+
+	return acumulated_lin_err;
+}
+
+int exofs_sbi_create(struct exofs_io_state *ios)
+{
+	int i, ret;
+
+	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+		struct osd_request *or;
+
+		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			ret = -ENOMEM;
+			goto out;
+		}
+		ios->per_dev[i].or = or;
+		ios->numdevs++;
+
+		osd_req_create_object(or, &ios->obj);
+	}
+	ret = exofs_io_execute(ios);
+
+out:
+	return ret;
+}
+
+int exofs_sbi_remove(struct exofs_io_state *ios)
+{
+	int i, ret;
+
+	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+		struct osd_request *or;
+
+		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			ret = -ENOMEM;
+			goto out;
+		}
+		ios->per_dev[i].or = or;
+		ios->numdevs++;
+
+		osd_req_remove_object(or, &ios->obj);
+	}
+	ret = exofs_io_execute(ios);
+
+out:
+	return ret;
+}
+
+int exofs_sbi_write(struct exofs_io_state *ios)
+{
+	int i, ret;
+
+	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+		struct osd_request *or;
+
+		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			ret = -ENOMEM;
+			goto out;
+		}
+		ios->per_dev[i].or = or;
+		ios->numdevs++;
+
+		if (ios->bio) {
+			struct bio *bio;
+
+			if (i != 0) {
+				bio = bio_kmalloc(GFP_KERNEL,
+						  ios->bio->bi_max_vecs);
+				if (unlikely(!bio)) {
+					ret = -ENOMEM;
+					goto out;
+				}
+
+				__bio_clone(bio, ios->bio);
+				bio->bi_bdev = NULL;
+				bio->bi_next = NULL;
+				ios->per_dev[i].bio =  bio;
+			} else {
+				bio = ios->bio;
+			}
+
+			osd_req_write(or, &ios->obj, ios->offset, bio,
+				      ios->length);
+/*			EXOFS_DBGMSG("write sync=%d\n", sync);*/
+		} else if (ios->kern_buff) {
+			osd_req_write_kern(or, &ios->obj, ios->offset,
+					   ios->kern_buff, ios->length);
+/*			EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/
+		} else {
+			osd_req_set_attributes(or, &ios->obj);
+/*			EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/
+		}
+
+		if (ios->out_attr)
+			osd_req_add_set_attr_list(or, ios->out_attr,
+						  ios->out_attr_len);
+
+		if (ios->in_attr)
+			osd_req_add_get_attr_list(or, ios->in_attr,
+						  ios->in_attr_len);
+	}
+	ret = exofs_io_execute(ios);
+
+out:
+	return ret;
+}
+
+int exofs_sbi_read(struct exofs_io_state *ios)
+{
+	int i, ret;
+
+	for (i = 0; i < 1; i++) {
+		struct osd_request *or;
+		unsigned first_dev = (unsigned)ios->obj.id;
+
+		first_dev %= ios->sbi->s_numdevs;
+		or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			ret = -ENOMEM;
+			goto out;
+		}
+		ios->per_dev[i].or = or;
+		ios->numdevs++;
+
+		if (ios->bio) {
+			osd_req_read(or, &ios->obj, ios->offset, ios->bio,
+				     ios->length);
+/*			EXOFS_DBGMSG("read sync=%d\n", sync);*/
+		} else if (ios->kern_buff) {
+			osd_req_read_kern(or, &ios->obj, ios->offset,
+					   ios->kern_buff, ios->length);
+/*			EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
+		} else {
+			osd_req_get_attributes(or, &ios->obj);
+/*			EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
+		}
+
+		if (ios->out_attr)
+			osd_req_add_set_attr_list(or, ios->out_attr,
+						  ios->out_attr_len);
+
+		if (ios->in_attr)
+			osd_req_add_get_attr_list(or, ios->in_attr,
+						  ios->in_attr_len);
+	}
+	ret = exofs_io_execute(ios);
+
+out:
+	return ret;
+}
+
+int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
+{
+	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+	void *iter = NULL;
+	int nelem;
+
+	do {
+		nelem = 1;
+		osd_req_decode_get_attr_list(ios->per_dev[0].or,
+					     &cur_attr, &nelem, &iter);
+		if ((cur_attr.attr_page == attr->attr_page) &&
+		    (cur_attr.attr_id == attr->attr_id)) {
+			attr->len = cur_attr.len;
+			attr->val_ptr = cur_attr.val_ptr;
+			return 0;
+		}
+	} while (iter);
+
+	return -EIO;
+}
+
+int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
+{
+	struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
+	struct exofs_io_state *ios;
+	struct osd_attr attr;
+	__be64 newsize;
+	int i, ret;
+
+	if (exofs_get_io_state(sbi, &ios))
+		return -ENOMEM;
+
+	ios->obj.id = exofs_oi_objno(oi);
+	ios->cred = oi->i_cred;
+
+	newsize = cpu_to_be64(size);
+	attr = g_attr_logical_length;
+	attr.val_ptr = &newsize;
+
+	for (i = 0; i < sbi->s_numdevs; i++) {
+		struct osd_request *or;
+
+		or = osd_start_request(sbi->s_ods[i], GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			ret = -ENOMEM;
+			goto out;
+		}
+		ios->per_dev[i].or = or;
+		ios->numdevs++;
+
+		osd_req_set_attributes(or, &ios->obj);
+		osd_req_add_set_attr_list(or, &attr, 1);
+	}
+	ret = exofs_io_execute(ios);
+
+out:
+	exofs_put_io_state(ios);
+	return ret;
+}
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
deleted file mode 100644
index 4372542df28..00000000000
--- a/fs/exofs/osd.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <scsi/scsi_device.h>
-#include <scsi/osd_sense.h>
-
-#include "exofs.h"
-
-int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
-{
-	struct osd_sense_info osi;
-	int ret = osd_req_decode_sense(or, &osi);
-
-	if (ret) { /* translate to Linux codes */
-		if (osi.additional_code == scsi_invalid_field_in_cdb) {
-			if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
-				ret = -EFAULT;
-			if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
-				ret = -ENOENT;
-			else
-				ret = -EINVAL;
-		} else if (osi.additional_code == osd_quota_error)
-			ret = -ENOSPC;
-		else
-			ret = -EIO;
-	}
-
-	/* FIXME: should be include in osd_sense_info */
-	if (in_resid)
-		*in_resid = or->in.req ? or->in.req->resid_len : 0;
-
-	if (out_resid)
-		*out_resid = or->out.req ? or->out.req->resid_len : 0;
-
-	return ret;
-}
-
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
-{
-	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
-}
-
-/*
- * Perform a synchronous OSD operation.
- */
-int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
-{
-	int ret;
-
-	or->timeout = timeout;
-	ret = osd_finalize_request(or, 0, credential, NULL);
-	if (ret) {
-		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
-		return ret;
-	}
-
-	ret = osd_execute_request(or);
-
-	if (ret)
-		EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
-	/* osd_req_decode_sense(or, ret); */
-	return ret;
-}
-
-/*
- * Perform an asynchronous OSD operation.
- */
-int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
-		   void *caller_context, u8 *cred)
-{
-	int ret;
-
-	ret = osd_finalize_request(or, 0, cred, NULL);
-	if (ret) {
-		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
-		return ret;
-	}
-
-	ret = osd_execute_request_async(or, async_done, caller_context);
-
-	if (ret)
-		EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
-	return ret;
-}
-
-int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
-{
-	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
-	void *iter = NULL;
-	int nelem;
-
-	do {
-		nelem = 1;
-		osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
-		if ((cur_attr.attr_page == attr->attr_page) &&
-		    (cur_attr.attr_id == attr->attr_id)) {
-			attr->len = cur_attr.len;
-			attr->val_ptr = cur_attr.val_ptr;
-			return 0;
-		}
-	} while (iter);
-
-	return -EIO;
-}
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
new file mode 100644
index 00000000000..423033addd1
--- /dev/null
+++ b/fs/exofs/pnfs.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License  version 2 as published by the Free
+ * Software Foundation.
+ *
+ */
+
+/* FIXME: Remove this file once pnfs hits mainline */
+
+#ifndef __EXOFS_PNFS_H__
+#define __EXOFS_PNFS_H__
+
+#if defined(CONFIG_PNFS)
+
+
+/* FIXME: move this file to: linux/exportfs/pnfs_osd_xdr.h */
+#include "../nfs/objlayout/pnfs_osd_xdr.h"
+
+#else /* defined(CONFIG_PNFS) */
+
+enum pnfs_iomode {
+	IOMODE_READ = 1,
+	IOMODE_RW = 2,
+	IOMODE_ANY = 3,
+};
+
+/* Layout Structure */
+enum pnfs_osd_raid_algorithm4 {
+	PNFS_OSD_RAID_0		= 1,
+	PNFS_OSD_RAID_4		= 2,
+	PNFS_OSD_RAID_5		= 3,
+	PNFS_OSD_RAID_PQ	= 4     /* Reed-Solomon P+Q */
+};
+
+struct pnfs_osd_data_map {
+	u32	odm_num_comps;
+	u64	odm_stripe_unit;
+	u32	odm_group_width;
+	u32	odm_group_depth;
+	u32	odm_mirror_cnt;
+	u32	odm_raid_algorithm;
+};
+
+#endif /* else defined(CONFIG_PNFS) */
+
+#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9f500dec3b5..a1d1e77b12e 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -203,49 +203,45 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 {
 	struct exofs_sb_info *sbi;
 	struct exofs_fscb *fscb;
-	struct osd_request *or;
-	struct osd_obj_id obj;
+	struct exofs_io_state *ios;
 	int ret = -ENOMEM;
 
-	fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
-	if (!fscb) {
-		EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
-		return -ENOMEM;
-	}
-
 	lock_super(sb);
 	sbi = sb->s_fs_info;
+	fscb = &sbi->s_fscb;
+
+	ret = exofs_get_io_state(sbi, &ios);
+	if (ret)
+		goto out;
+
+	/* Note: We only write the changing part of the fscb. .i.e upto the
+	 *       the fscb->s_dev_table_oid member. There is no read-modify-write
+	 *       here.
+	 */
+	ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
+	memset(fscb, 0, ios->length);
 	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
 	fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
 	fscb->s_magic = cpu_to_le16(sb->s_magic);
 	fscb->s_newfs = 0;
+	fscb->s_version = EXOFS_FSCB_VER;
 
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
-		goto out;
-	}
+	ios->obj.id = EXOFS_SUPER_ID;
+	ios->offset = 0;
+	ios->kern_buff = fscb;
+	ios->cred = sbi->s_cred;
 
-	obj.partition = sbi->s_pid;
-	obj.id = EXOFS_SUPER_ID;
-	ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
+	ret = exofs_sbi_write(ios);
 	if (unlikely(ret)) {
-		EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
-		goto out;
-	}
-
-	ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
-	if (unlikely(ret)) {
-		EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
+		EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
 		goto out;
 	}
 	sb->s_dirt = 0;
 
 out:
-	if (or)
-		osd_end_request(or);
+	EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
+	exofs_put_io_state(ios);
 	unlock_super(sb);
-	kfree(fscb);
 	return ret;
 }
 
@@ -257,6 +253,29 @@ static void exofs_write_super(struct super_block *sb)
 		sb->s_dirt = 0;
 }
 
+static void _exofs_print_device(const char *msg, const char *dev_path,
+				struct osd_dev *od, u64 pid)
+{
+	const struct osd_dev_info *odi = osduld_device_info(od);
+
+	printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
+		msg, dev_path ?: "", odi->osdname, _LLU(pid));
+}
+
+void exofs_free_sbi(struct exofs_sb_info *sbi)
+{
+	while (sbi->s_numdevs) {
+		int i = --sbi->s_numdevs;
+		struct osd_dev *od = sbi->s_ods[i];
+
+		if (od) {
+			sbi->s_ods[i] = NULL;
+			osduld_put_device(od);
+		}
+	}
+	kfree(sbi);
+}
+
 /*
  * This function is called when the vfs is freeing the superblock.  We just
  * need to free our own part.
@@ -279,11 +298,182 @@ static void exofs_put_super(struct super_block *sb)
 				  msecs_to_jiffies(100));
 	}
 
-	osduld_put_device(sbi->s_dev);
-	kfree(sb->s_fs_info);
+	_exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid);
+
+	exofs_free_sbi(sbi);
 	sb->s_fs_info = NULL;
 }
 
+static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
+				    struct exofs_device_table *dt)
+{
+	sbi->data_map.odm_num_comps   =
+				le32_to_cpu(dt->dt_data_map.cb_num_comps);
+	sbi->data_map.odm_stripe_unit =
+				le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
+	sbi->data_map.odm_group_width =
+				le32_to_cpu(dt->dt_data_map.cb_group_width);
+	sbi->data_map.odm_group_depth =
+				le32_to_cpu(dt->dt_data_map.cb_group_depth);
+	sbi->data_map.odm_mirror_cnt  =
+				le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
+	sbi->data_map.odm_raid_algorithm  =
+				le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
+
+/* FIXME: Hard coded mirror only for now. if not so do not mount */
+	if ((sbi->data_map.odm_num_comps != numdevs) ||
+	    (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) ||
+	    (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) ||
+	    (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
+		return -EINVAL;
+	else
+		return 0;
+}
+
+/* @odi is valid only as long as @fscb_dev is valid */
+static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
+			     struct osd_dev_info *odi)
+{
+	odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
+	memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
+
+	odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
+	odi->osdname = dt_dev->osdname;
+
+	/* FIXME support long names. Will need a _put function */
+	if (dt_dev->long_name_offset)
+		return -EINVAL;
+
+	/* Make sure osdname is printable!
+	 * mkexofs should give us space for a null-terminator else the
+	 * device-table is invalid.
+	 */
+	if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
+		odi->osdname_len = sizeof(dt_dev->osdname) - 1;
+	dt_dev->osdname[odi->osdname_len] = 0;
+
+	/* If it's all zeros something is bad we read past end-of-obj */
+	return !(odi->systemid_len || odi->osdname_len);
+}
+
+static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
+				       unsigned table_count)
+{
+	struct exofs_sb_info *sbi = *psbi;
+	struct osd_dev *fscb_od;
+	struct osd_obj_id obj = {.partition = sbi->s_pid,
+				 .id = EXOFS_DEVTABLE_ID};
+	struct exofs_device_table *dt;
+	unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
+					     sizeof(*dt);
+	unsigned numdevs, i;
+	int ret;
+
+	dt = kmalloc(table_bytes, GFP_KERNEL);
+	if (unlikely(!dt)) {
+		EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
+			  table_bytes);
+		return -ENOMEM;
+	}
+
+	fscb_od = sbi->s_ods[0];
+	sbi->s_ods[0] = NULL;
+	sbi->s_numdevs = 0;
+	ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
+	if (unlikely(ret)) {
+		EXOFS_ERR("ERROR: reading device table\n");
+		goto out;
+	}
+
+	numdevs = le64_to_cpu(dt->dt_num_devices);
+	if (unlikely(!numdevs)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	WARN_ON(table_count != numdevs);
+
+	ret = _read_and_match_data_map(sbi, numdevs, dt);
+	if (unlikely(ret))
+		goto out;
+
+	if (likely(numdevs > 1)) {
+		unsigned size = numdevs * sizeof(sbi->s_ods[0]);
+
+		sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
+		if (unlikely(!sbi)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0]));
+		*psbi = sbi;
+	}
+
+	for (i = 0; i < numdevs; i++) {
+		struct exofs_fscb fscb;
+		struct osd_dev_info odi;
+		struct osd_dev *od;
+
+		if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
+			EXOFS_ERR("ERROR: Read all-zeros device entry\n");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
+		       i, odi.osdname);
+
+		/* On all devices the device table is identical. The user can
+		 * specify any one of the participating devices on the command
+		 * line. We always keep them in device-table order.
+		 */
+		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
+			sbi->s_ods[i] = fscb_od;
+			++sbi->s_numdevs;
+			fscb_od = NULL;
+			continue;
+		}
+
+		od = osduld_info_lookup(&odi);
+		if (unlikely(IS_ERR(od))) {
+			ret = PTR_ERR(od);
+			EXOFS_ERR("ERROR: device requested is not found "
+				  "osd_name-%s =>%d\n", odi.osdname, ret);
+			goto out;
+		}
+
+		sbi->s_ods[i] = od;
+		++sbi->s_numdevs;
+
+		/* Read the fscb of the other devices to make sure the FS
+		 * partition is there.
+		 */
+		ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb,
+				      sizeof(fscb));
+		if (unlikely(ret)) {
+			EXOFS_ERR("ERROR: Malformed participating device "
+				  "error reading fscb osd_name-%s\n",
+				  odi.osdname);
+			goto out;
+		}
+
+		/* TODO: verify other information is correct and FS-uuid
+		 *	 matches. Benny what did you say about device table
+		 *	 generation and old devices?
+		 */
+	}
+
+out:
+	kfree(dt);
+	if (unlikely(!ret && fscb_od)) {
+		EXOFS_ERR(
+		      "ERROR: Bad device-table container device not present\n");
+		osduld_put_device(fscb_od);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
 /*
  * Read the superblock from the OSD and fill in the fields
  */
@@ -292,24 +482,25 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *root;
 	struct exofs_mountopt *opts = data;
 	struct exofs_sb_info *sbi;	/*extended info                  */
+	struct osd_dev *od;		/* Master device                 */
 	struct exofs_fscb fscb;		/*on-disk superblock info        */
-	struct osd_request *or = NULL;
 	struct osd_obj_id obj;
+	unsigned table_count;
 	int ret;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
-	sb->s_fs_info = sbi;
 
 	/* use mount options to fill superblock */
-	sbi->s_dev = osduld_path_lookup(opts->dev_name);
-	if (IS_ERR(sbi->s_dev)) {
-		ret = PTR_ERR(sbi->s_dev);
-		sbi->s_dev = NULL;
+	od = osduld_path_lookup(opts->dev_name);
+	if (IS_ERR(od)) {
+		ret = PTR_ERR(od);
 		goto free_sbi;
 	}
 
+	sbi->s_ods[0] = od;
+	sbi->s_numdevs = 1;
 	sbi->s_pid = opts->pid;
 	sbi->s_timeout = opts->timeout;
 
@@ -323,35 +514,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_bdev = NULL;
 	sb->s_dev = 0;
 
-	/* read data from on-disk superblock object */
 	obj.partition = sbi->s_pid;
 	obj.id = EXOFS_SUPER_ID;
 	exofs_make_credential(sbi->s_cred, &obj);
 
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		if (!silent)
-			EXOFS_ERR(
-			       "exofs_fill_super: osd_start_request failed.\n");
-		ret = -ENOMEM;
-		goto free_sbi;
-	}
-	ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
-	if (unlikely(ret)) {
-		if (!silent)
-			EXOFS_ERR(
-			       "exofs_fill_super: osd_req_read_kern failed.\n");
-		ret = -ENOMEM;
-		goto free_sbi;
-	}
-
-	ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
-	if (unlikely(ret)) {
-		if (!silent)
-			EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
-		ret = -EIO;
+	ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb));
+	if (unlikely(ret))
 		goto free_sbi;
-	}
 
 	sb->s_magic = le16_to_cpu(fscb.s_magic);
 	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
@@ -364,12 +533,26 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		ret = -EINVAL;
 		goto free_sbi;
 	}
+	if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
+		EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
+			  EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
+		ret = -EINVAL;
+		goto free_sbi;
+	}
 
 	/* start generation numbers from a random point */
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
+	table_count = le64_to_cpu(fscb.s_dev_table_count);
+	if (table_count) {
+		ret = exofs_read_lookup_dev_table(&sbi, table_count);
+		if (unlikely(ret))
+			goto free_sbi;
+	}
+
 	/* set up operation vectors */
+	sb->s_fs_info = sbi;
 	sb->s_op = &exofs_sops;
 	sb->s_export_op = &exofs_export_ops;
 	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
@@ -395,16 +578,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	ret = 0;
-out:
-	if (or)
-		osd_end_request(or);
-	return ret;
+	_exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0],
+			    sbi->s_pid);
+	return 0;
 
 free_sbi:
-	osduld_put_device(sbi->s_dev); /* NULL safe */
-	kfree(sbi);
-	goto out;
+	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
+		  opts->dev_name, sbi->s_pid, ret);
+	exofs_free_sbi(sbi);
+	return ret;
 }
 
 /*
@@ -433,7 +615,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_obj_id obj = {sbi->s_pid, 0};
+	struct exofs_io_state *ios;
 	struct osd_attr attrs[] = {
 		ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
 			OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
@@ -442,32 +624,33 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	};
 	uint64_t capacity = ULLONG_MAX;
 	uint64_t used = ULLONG_MAX;
-	struct osd_request *or;
 	uint8_t cred_a[OSD_CAP_LEN];
 	int ret;
 
-	/* get used/capacity attributes */
-	exofs_make_credential(cred_a, &obj);
-
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
-		return -ENOMEM;
+	ret = exofs_get_io_state(sbi, &ios);
+	if (ret) {
+		EXOFS_DBGMSG("exofs_get_io_state failed.\n");
+		return ret;
 	}
 
-	osd_req_get_attributes(or, &obj);
-	osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
-	ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
+	exofs_make_credential(cred_a, &ios->obj);
+	ios->cred = sbi->s_cred;
+	ios->in_attr = attrs;
+	ios->in_attr_len = ARRAY_SIZE(attrs);
+
+	ret = exofs_sbi_read(ios);
 	if (unlikely(ret))
 		goto out;
 
-	ret = extract_attr_from_req(or, &attrs[0]);
-	if (likely(!ret))
+	ret = extract_attr_from_ios(ios, &attrs[0]);
+	if (likely(!ret)) {
 		capacity = get_unaligned_be64(attrs[0].val_ptr);
-	else
+		if (unlikely(!capacity))
+			capacity = ULLONG_MAX;
+	} else
 		EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
 
-	ret = extract_attr_from_req(or, &attrs[1]);
+	ret = extract_attr_from_ios(ios, &attrs[1]);
 	if (likely(!ret))
 		used = get_unaligned_be64(attrs[1].val_ptr);
 	else
@@ -476,15 +659,15 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	/* fill in the stats buffer */
 	buf->f_type = EXOFS_SUPER_MAGIC;
 	buf->f_bsize = EXOFS_BLKSIZE;
-	buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
-	buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
+	buf->f_blocks = capacity >> 9;
+	buf->f_bfree = (capacity - used) >> 9;
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = sbi->s_numfiles;
 	buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
 	buf->f_namelen = EXOFS_NAME_LEN;
 
 out:
-	osd_end_request(or);
+	exofs_put_io_state(ios);
 	return ret;
 }
 
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 6cde970b0a1..7516957273e 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -353,8 +353,8 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
  *	ext2_find_entry()
  *
  * finds an entry in the specified directory with the wanted name. It
- * returns the page in which the entry was found, and the entry itself
- * (as a parameter - res_dir). Page is returned mapped and unlocked.
+ * returns the page in which the entry was found (as a parameter - res_page),
+ * and the entry itself. Page is returned mapped and unlocked.
  * Entry is guaranteed to be valid.
  */
 struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir,
@@ -721,5 +721,5 @@ const struct file_operations ext2_dir_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.fsync		= simple_fsync,
+	.fsync		= ext2_fsync,
 };
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 9a8a8e27a06..061914add3c 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -142,7 +142,7 @@ struct dentry *ext2_get_parent(struct dentry *child);
 /* super.c */
 extern void ext2_error (struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void ext2_warning (struct super_block *, const char *, const char *, ...)
+extern void ext2_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
 extern void ext2_update_dynamic_rev (struct super_block *sb);
 extern void ext2_write_super (struct super_block *);
@@ -155,6 +155,7 @@ extern void ext2_write_super (struct super_block *);
 extern const struct file_operations ext2_dir_operations;
 
 /* file.c */
+extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync);
 extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
 extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a2f3afd1a1c..586e3589d4c 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/time.h>
+#include <linux/pagemap.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
@@ -38,6 +39,22 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
+int ext2_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	int ret;
+	struct super_block *sb = dentry->d_inode->i_sb;
+	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+
+	ret = simple_fsync(file, dentry, datasync);
+	if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
+		/* We don't really know where the IO error happened... */
+		ext2_error(sb, __func__,
+			   "detected IO error when writing metadata buffers");
+		ret = -EIO;
+	}
+	return ret;
+}
+
 /*
  * We have mostly NULL's here: the current defaults are ok for
  * the ext2 filesystem.
@@ -55,7 +72,7 @@ const struct file_operations ext2_file_operations = {
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
-	.fsync		= simple_fsync,
+	.fsync		= ext2_fsync,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 };
@@ -72,7 +89,7 @@ const struct file_operations ext2_xip_file_operations = {
 	.mmap		= xip_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
-	.fsync		= simple_fsync,
+	.fsync		= ext2_fsync,
 };
 #endif
 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index ade634076d0..71b032c65a0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -137,7 +137,8 @@ static int ext2_block_to_path(struct inode *inode,
 	int final = 0;
 
 	if (i_block < 0) {
-		ext2_warning (inode->i_sb, "ext2_block_to_path", "block < 0");
+		ext2_msg(inode->i_sb, KERN_WARNING,
+			"warning: %s: block < 0", __func__);
 	} else if (i_block < direct_blocks) {
 		offsets[n++] = i_block;
 		final = direct_blocks;
@@ -157,7 +158,8 @@ static int ext2_block_to_path(struct inode *inode,
 		offsets[n++] = i_block & (ptrs - 1);
 		final = ptrs;
 	} else {
-		ext2_warning (inode->i_sb, "ext2_block_to_path", "block > big");
+		ext2_msg(inode->i_sb, KERN_WARNING,
+			"warning: %s: block is too big", __func__);
 	}
 	if (boundary)
 		*boundary = final - 1 - (i_block & (ptrs - 1));
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1a9ffee47d5..f9cb54a585c 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -58,27 +58,27 @@ void ext2_error (struct super_block * sb, const char * function,
 	}
 
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT2-fs error (device %s): %s: ",sb->s_id, function);
+	printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
 
 	if (test_opt(sb, ERRORS_PANIC))
-		panic("EXT2-fs panic from previous error\n");
+		panic("EXT2-fs: panic from previous error\n");
 	if (test_opt(sb, ERRORS_RO)) {
-		printk("Remounting filesystem read-only\n");
+		ext2_msg(sb, KERN_CRIT,
+			     "error: remounting filesystem read-only");
 		sb->s_flags |= MS_RDONLY;
 	}
 }
 
-void ext2_warning (struct super_block * sb, const char * function,
-		   const char * fmt, ...)
+void ext2_msg(struct super_block *sb, const char *prefix,
+		const char *fmt, ...)
 {
 	va_list args;
 
 	va_start(args, fmt);
-	printk(KERN_WARNING "EXT2-fs warning (device %s): %s: ",
-	       sb->s_id, function);
+	printk("%sEXT2-fs (%s): ", prefix, sb->s_id);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
@@ -91,9 +91,9 @@ void ext2_update_dynamic_rev(struct super_block *sb)
 	if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV)
 		return;
 
-	ext2_warning(sb, __func__,
-		     "updating to rev %d because of new feature flag, "
-		     "running e2fsck is recommended",
+	ext2_msg(sb, KERN_WARNING,
+		     "warning: updating to rev %d because of "
+		     "new feature flag, running e2fsck is recommended",
 		     EXT2_DYNAMIC_REV);
 
 	es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO);
@@ -419,10 +419,10 @@ static const match_table_t tokens = {
 	{Opt_err, NULL}
 };
 
-static int parse_options (char * options,
-			  struct ext2_sb_info *sbi)
+static int parse_options(char *options, struct super_block *sb)
 {
-	char * p;
+	char *p;
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	substring_t args[MAX_OPT_ARGS];
 	int option;
 
@@ -505,7 +505,8 @@ static int parse_options (char * options,
 #else
 		case Opt_user_xattr:
 		case Opt_nouser_xattr:
-			printk("EXT2 (no)user_xattr options not supported\n");
+			ext2_msg(sb, KERN_INFO, "(no)user_xattr options"
+				"not supported");
 			break;
 #endif
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -518,14 +519,15 @@ static int parse_options (char * options,
 #else
 		case Opt_acl:
 		case Opt_noacl:
-			printk("EXT2 (no)acl options not supported\n");
+			ext2_msg(sb, KERN_INFO,
+				"(no)acl options not supported");
 			break;
 #endif
 		case Opt_xip:
 #ifdef CONFIG_EXT2_FS_XIP
 			set_opt (sbi->s_mount_opt, XIP);
 #else
-			printk("EXT2 xip option not supported\n");
+			ext2_msg(sb, KERN_INFO, "xip option not supported");
 #endif
 			break;
 
@@ -542,19 +544,18 @@ static int parse_options (char * options,
 		case Opt_quota:
 		case Opt_usrquota:
 		case Opt_grpquota:
-			printk(KERN_ERR
-				"EXT2-fs: quota operations not supported.\n");
-
+			ext2_msg(sb, KERN_INFO,
+				"quota operations not supported");
 			break;
 #endif
 
 		case Opt_reservation:
 			set_opt(sbi->s_mount_opt, RESERVATION);
-			printk("reservations ON\n");
+			ext2_msg(sb, KERN_INFO, "reservations ON");
 			break;
 		case Opt_noreservation:
 			clear_opt(sbi->s_mount_opt, RESERVATION);
-			printk("reservations OFF\n");
+			ext2_msg(sb, KERN_INFO, "reservations OFF");
 			break;
 		case Opt_ignore:
 			break;
@@ -573,34 +574,40 @@ static int ext2_setup_super (struct super_block * sb,
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
 	if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) {
-		printk ("EXT2-fs warning: revision level too high, "
-			"forcing read-only mode\n");
+		ext2_msg(sb, KERN_ERR,
+			"error: revision level too high, "
+			"forcing read-only mode");
 		res = MS_RDONLY;
 	}
 	if (read_only)
 		return res;
 	if (!(sbi->s_mount_state & EXT2_VALID_FS))
-		printk ("EXT2-fs warning: mounting unchecked fs, "
-			"running e2fsck is recommended\n");
+		ext2_msg(sb, KERN_WARNING,
+			"warning: mounting unchecked fs, "
+			"running e2fsck is recommended");
 	else if ((sbi->s_mount_state & EXT2_ERROR_FS))
-		printk ("EXT2-fs warning: mounting fs with errors, "
-			"running e2fsck is recommended\n");
+		ext2_msg(sb, KERN_WARNING,
+			"warning: mounting fs with errors, "
+			"running e2fsck is recommended");
 	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
 		 le16_to_cpu(es->s_mnt_count) >=
 		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
-		printk ("EXT2-fs warning: maximal mount count reached, "
-			"running e2fsck is recommended\n");
+		ext2_msg(sb, KERN_WARNING,
+			"warning: maximal mount count reached, "
+			"running e2fsck is recommended");
 	else if (le32_to_cpu(es->s_checkinterval) &&
-		(le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds()))
-		printk ("EXT2-fs warning: checktime reached, "
-			"running e2fsck is recommended\n");
+		(le32_to_cpu(es->s_lastcheck) +
+			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
+		ext2_msg(sb, KERN_WARNING,
+			"warning: checktime reached, "
+			"running e2fsck is recommended");
 	if (!le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
 	le16_add_cpu(&es->s_mnt_count, 1);
 	ext2_write_super(sb);
 	if (test_opt (sb, DEBUG))
-		printk ("[EXT II FS %s, %s, bs=%lu, fs=%lu, gc=%lu, "
-			"bpg=%lu, ipg=%lu, mo=%04lx]\n",
+		ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
+			"bpg=%lu, ipg=%lu, mo=%04lx]",
 			EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize,
 			sbi->s_frag_size,
 			sbi->s_groups_count,
@@ -767,7 +774,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
 	if (!blocksize) {
-		printk ("EXT2-fs: unable to set blocksize\n");
+		ext2_msg(sb, KERN_ERR, "error: unable to set blocksize");
 		goto failed_sbi;
 	}
 
@@ -783,7 +790,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (!(bh = sb_bread(sb, logic_sb_block))) {
-		printk ("EXT2-fs: unable to read superblock\n");
+		ext2_msg(sb, KERN_ERR, "error: unable to read superblock");
 		goto failed_sbi;
 	}
 	/*
@@ -826,7 +833,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	
 	set_opt(sbi->s_mount_opt, RESERVATION);
 
-	if (!parse_options ((char *) data, sbi))
+	if (!parse_options((char *) data, sb))
 		goto failed_mount;
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -840,8 +847,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	    (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U)))
-		printk("EXT2-fs warning: feature flags set on rev 0 fs, "
-		       "running e2fsck is recommended\n");
+		ext2_msg(sb, KERN_WARNING,
+			"warning: feature flags set on rev 0 fs, "
+			"running e2fsck is recommended");
 	/*
 	 * Check feature flags regardless of the revision level, since we
 	 * previously didn't change the revision level when setting the flags,
@@ -849,16 +857,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP);
 	if (features) {
-		printk("EXT2-fs: %s: couldn't mount because of "
-		       "unsupported optional features (%x).\n",
-		       sb->s_id, le32_to_cpu(features));
+		ext2_msg(sb, KERN_ERR,	"error: couldn't mount because of "
+		       "unsupported optional features (%x)",
+			le32_to_cpu(features));
 		goto failed_mount;
 	}
 	if (!(sb->s_flags & MS_RDONLY) &&
 	    (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){
-		printk("EXT2-fs: %s: couldn't mount RDWR because of "
-		       "unsupported optional features (%x).\n",
-		       sb->s_id, le32_to_cpu(features));
+		ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of "
+		       "unsupported optional features (%x)",
+		       le32_to_cpu(features));
 		goto failed_mount;
 	}
 
@@ -866,7 +874,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) {
 		if (!silent)
-			printk("XIP: Unsupported blocksize\n");
+			ext2_msg(sb, KERN_ERR,
+				"error: unsupported blocksize for xip");
 		goto failed_mount;
 	}
 
@@ -875,7 +884,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		brelse(bh);
 
 		if (!sb_set_blocksize(sb, blocksize)) {
-			printk(KERN_ERR "EXT2-fs: blocksize too small for device.\n");
+			ext2_msg(sb, KERN_ERR, "error: blocksize is too small");
 			goto failed_sbi;
 		}
 
@@ -883,14 +892,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		offset = (sb_block*BLOCK_SIZE) % blocksize;
 		bh = sb_bread(sb, logic_sb_block);
 		if(!bh) {
-			printk("EXT2-fs: Couldn't read superblock on "
-			       "2nd try.\n");
+			ext2_msg(sb, KERN_ERR, "error: couldn't read"
+				"superblock on 2nd try");
 			goto failed_sbi;
 		}
 		es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
 		sbi->s_es = es;
 		if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) {
-			printk ("EXT2-fs: Magic mismatch, very weird !\n");
+			ext2_msg(sb, KERN_ERR, "error: magic mismatch");
 			goto failed_mount;
 		}
 	}
@@ -906,7 +915,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) ||
 		    !is_power_of_2(sbi->s_inode_size) ||
 		    (sbi->s_inode_size > blocksize)) {
-			printk ("EXT2-fs: unsupported inode size: %d\n",
+			ext2_msg(sb, KERN_ERR,
+				"error: unsupported inode size: %d",
 				sbi->s_inode_size);
 			goto failed_mount;
 		}
@@ -943,29 +953,33 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (sb->s_blocksize != bh->b_size) {
 		if (!silent)
-			printk ("VFS: Unsupported blocksize on dev "
-				"%s.\n", sb->s_id);
+			ext2_msg(sb, KERN_ERR, "error: unsupported blocksize");
 		goto failed_mount;
 	}
 
 	if (sb->s_blocksize != sbi->s_frag_size) {
-		printk ("EXT2-fs: fragsize %lu != blocksize %lu (not supported yet)\n",
+		ext2_msg(sb, KERN_ERR,
+			"error: fragsize %lu != blocksize %lu"
+			"(not supported yet)",
 			sbi->s_frag_size, sb->s_blocksize);
 		goto failed_mount;
 	}
 
 	if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
-		printk ("EXT2-fs: #blocks per group too big: %lu\n",
+		ext2_msg(sb, KERN_ERR,
+			"error: #blocks per group too big: %lu",
 			sbi->s_blocks_per_group);
 		goto failed_mount;
 	}
 	if (sbi->s_frags_per_group > sb->s_blocksize * 8) {
-		printk ("EXT2-fs: #fragments per group too big: %lu\n",
+		ext2_msg(sb, KERN_ERR,
+			"error: #fragments per group too big: %lu",
 			sbi->s_frags_per_group);
 		goto failed_mount;
 	}
 	if (sbi->s_inodes_per_group > sb->s_blocksize * 8) {
-		printk ("EXT2-fs: #inodes per group too big: %lu\n",
+		ext2_msg(sb, KERN_ERR,
+			"error: #inodes per group too big: %lu",
 			sbi->s_inodes_per_group);
 		goto failed_mount;
 	}
@@ -979,13 +993,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		   EXT2_DESC_PER_BLOCK(sb);
 	sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
 	if (sbi->s_group_desc == NULL) {
-		printk ("EXT2-fs: not enough memory\n");
+		ext2_msg(sb, KERN_ERR, "error: not enough memory");
 		goto failed_mount;
 	}
 	bgl_lock_init(sbi->s_blockgroup_lock);
 	sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
 	if (!sbi->s_debts) {
-		printk ("EXT2-fs: not enough memory\n");
+		ext2_msg(sb, KERN_ERR, "error: not enough memory");
 		goto failed_mount_group_desc;
 	}
 	for (i = 0; i < db_count; i++) {
@@ -994,12 +1008,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		if (!sbi->s_group_desc[i]) {
 			for (j = 0; j < i; j++)
 				brelse (sbi->s_group_desc[j]);
-			printk ("EXT2-fs: unable to read group descriptors\n");
+			ext2_msg(sb, KERN_ERR,
+				"error: unable to read group descriptors");
 			goto failed_mount_group_desc;
 		}
 	}
 	if (!ext2_check_descriptors (sb)) {
-		printk ("EXT2-fs: group descriptors corrupted!\n");
+		ext2_msg(sb, KERN_ERR, "group descriptors corrupted");
 		goto failed_mount2;
 	}
 	sbi->s_gdb_count = db_count;
@@ -1032,7 +1047,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 				ext2_count_dirs(sb));
 	}
 	if (err) {
-		printk(KERN_ERR "EXT2-fs: insufficient memory\n");
+		ext2_msg(sb, KERN_ERR, "error: insufficient memory");
 		goto failed_mount3;
 	}
 	/*
@@ -1048,27 +1063,28 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		iput(root);
-		printk(KERN_ERR "EXT2-fs: corrupt root inode, run e2fsck\n");
+		ext2_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
 		goto failed_mount3;
 	}
 
 	sb->s_root = d_alloc_root(root);
 	if (!sb->s_root) {
 		iput(root);
-		printk(KERN_ERR "EXT2-fs: get root inode failed\n");
+		ext2_msg(sb, KERN_ERR, "error: get root inode failed");
 		ret = -ENOMEM;
 		goto failed_mount3;
 	}
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
-		ext2_warning(sb, __func__,
-			"mounting ext3 filesystem as ext2");
+		ext2_msg(sb, KERN_WARNING,
+			"warning: mounting ext3 filesystem as ext2");
 	ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
 	return 0;
 
 cantfind_ext2:
 	if (!silent)
-		printk("VFS: Can't find an ext2 filesystem on dev %s.\n",
-		       sb->s_id);
+		ext2_msg(sb, KERN_ERR,
+			"error: can't find an ext2 filesystem on dev %s.",
+			sb->s_id);
 	goto failed_mount;
 failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
@@ -1089,9 +1105,30 @@ failed_sbi:
 	return ret;
 }
 
+static void ext2_clear_super_error(struct super_block *sb)
+{
+	struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
+
+	if (buffer_write_io_error(sbh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		printk(KERN_ERR "EXT2-fs: %s previous I/O error to "
+		       "superblock detected", sb->s_id);
+		clear_buffer_write_io_error(sbh);
+		set_buffer_uptodate(sbh);
+	}
+}
+
 static void ext2_commit_super (struct super_block * sb,
 			       struct ext2_super_block * es)
 {
+	ext2_clear_super_error(sb);
 	es->s_wtime = cpu_to_le32(get_seconds());
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
 	sb->s_dirt = 0;
@@ -1099,6 +1136,7 @@ static void ext2_commit_super (struct super_block * sb,
 
 static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
 {
+	ext2_clear_super_error(sb);
 	es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
 	es->s_wtime = cpu_to_le32(get_seconds());
@@ -1121,8 +1159,24 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
 static int ext2_sync_fs(struct super_block *sb, int wait)
 {
 	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+	struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
 
 	lock_kernel();
+	if (buffer_write_io_error(sbh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		ext2_msg(sb, KERN_ERR,
+		       "previous I/O error to superblock detected\n");
+		clear_buffer_write_io_error(sbh);
+		set_buffer_uptodate(sbh);
+	}
+
 	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
 		ext2_debug("setting valid to 0\n");
 		es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
@@ -1170,7 +1224,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	/*
 	 * Allow the "check" option to be passed as a remount option.
 	 */
-	if (!parse_options (data, sbi)) {
+	if (!parse_options(data, sb)) {
 		err = -EINVAL;
 		goto restore_opts;
 	}
@@ -1182,7 +1236,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 				    EXT2_MOUNT_XIP if not */
 
 	if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) {
-		printk("XIP: Unsupported blocksize\n");
+		ext2_msg(sb, KERN_WARNING,
+			"warning: unsupported blocksize for xip");
 		err = -EINVAL;
 		goto restore_opts;
 	}
@@ -1191,8 +1246,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
 	    (old_mount_opt & EXT2_MOUNT_XIP)) &&
 	    invalidate_inodes(sb)) {
-		ext2_warning(sb, __func__, "refusing change of xip flag "
-			     "with busy inodes while remounting");
+		ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
+			 "xip flag with busy inodes while remounting");
 		sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
 		sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
 	}
@@ -1216,9 +1271,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		__le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
 					       ~EXT2_FEATURE_RO_COMPAT_SUPP);
 		if (ret) {
-			printk("EXT2-fs: %s: couldn't remount RDWR because of "
-			       "unsupported optional features (%x).\n",
-			       sb->s_id, le32_to_cpu(ret));
+			ext2_msg(sb, KERN_WARNING,
+				"warning: couldn't remount RDWR because of "
+				"unsupported optional features (%x).",
+				le32_to_cpu(ret));
 			err = -EROFS;
 			goto restore_opts;
 		}
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index c18fbf3e406..322a56b2dfb 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -69,8 +69,9 @@ void ext2_xip_verify_sb(struct super_block *sb)
 	if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) &&
 	    !sb->s_bdev->bd_disk->fops->direct_access) {
 		sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
-		ext2_warning(sb, __func__,
-			     "ignoring xip option - not supported by bdev");
+		ext2_msg(sb, KERN_WARNING,
+			     "warning: ignoring xip option - "
+			     "not supported by bdev");
 	}
 }
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 354ed3b47b3..ad14227f509 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1151,6 +1151,16 @@ static int do_journal_get_write_access(handle_t *handle,
 	return ext3_journal_get_write_access(handle, bh);
 }
 
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext3_truncate_failed_write(struct inode *inode)
+{
+	truncate_inode_pages(inode->i_mapping, inode->i_size);
+	ext3_truncate(inode);
+}
+
 static int ext3_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
@@ -1209,7 +1219,7 @@ write_begin_failed:
 		unlock_page(page);
 		page_cache_release(page);
 		if (pos + len > inode->i_size)
-			ext3_truncate(inode);
+			ext3_truncate_failed_write(inode);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
@@ -1304,7 +1314,7 @@ static int ext3_ordered_write_end(struct file *file,
 	page_cache_release(page);
 
 	if (pos + len > inode->i_size)
-		ext3_truncate(inode);
+		ext3_truncate_failed_write(inode);
 	return ret ? ret : copied;
 }
 
@@ -1330,7 +1340,7 @@ static int ext3_writeback_write_end(struct file *file,
 	page_cache_release(page);
 
 	if (pos + len > inode->i_size)
-		ext3_truncate(inode);
+		ext3_truncate_failed_write(inode);
 	return ret ? ret : copied;
 }
 
@@ -1383,7 +1393,7 @@ static int ext3_journalled_write_end(struct file *file,
 	page_cache_release(page);
 
 	if (pos + len > inode->i_size)
-		ext3_truncate(inode);
+		ext3_truncate_failed_write(inode);
 	return ret ? ret : copied;
 }
 
@@ -2033,7 +2043,7 @@ static Indirect *ext3_find_shared(struct inode *inode, int depth,
 	int k, err;
 
 	*top = 0;
-	/* Make k index the deepest non-null offest + 1 */
+	/* Make k index the deepest non-null offset + 1 */
 	for (k = depth; k > 1 && !offsets[k-1]; k--)
 		;
 	partial = ext3_get_branch(inode, k, offsets, chain, &err);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 8359e7b3dc8..5f83b617917 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -266,7 +266,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 			goto exit_bh;
 
 		if (IS_ERR(gdb = bclean(handle, sb, block))) {
-			err = PTR_ERR(bh);
+			err = PTR_ERR(gdb);
 			goto exit_bh;
 		}
 		ext3_journal_dirty_metadata(handle, gdb);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 427496c4767..7ad1e8c30bd 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -135,12 +135,24 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
 	if (is_handle_aborted(handle))
 		return;
 
-	printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
-	       caller, errstr, err_fn);
+	printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n",
+		caller, errstr, err_fn);
 
 	journal_abort_handle(handle);
 }
 
+void ext3_msg(struct super_block *sb, const char *prefix,
+		const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	printk("%sEXT3-fs (%s): ", prefix, sb->s_id);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+}
+
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
  *
@@ -174,12 +186,13 @@ static void ext3_handle_error(struct super_block *sb)
 			journal_abort(journal, -EIO);
 	}
 	if (test_opt (sb, ERRORS_RO)) {
-		printk (KERN_CRIT "Remounting filesystem read-only\n");
+		ext3_msg(sb, KERN_CRIT,
+			"error: remounting filesystem read-only");
 		sb->s_flags |= MS_RDONLY;
 	}
 	ext3_commit_super(sb, es, 1);
 	if (test_opt(sb, ERRORS_PANIC))
-		panic("EXT3-fs (device %s): panic forced after error\n",
+		panic("EXT3-fs (%s): panic forced after error\n",
 			sb->s_id);
 }
 
@@ -247,8 +260,7 @@ void __ext3_std_error (struct super_block * sb, const char * function,
 		return;
 
 	errstr = ext3_decode_error(sb, errno, nbuf);
-	printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n",
-		sb->s_id, function, errstr);
+	ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr);
 
 	ext3_handle_error(sb);
 }
@@ -268,21 +280,20 @@ void ext3_abort (struct super_block * sb, const char * function,
 {
 	va_list args;
 
-	printk (KERN_CRIT "ext3_abort called.\n");
-
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function);
+	printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
 
 	if (test_opt(sb, ERRORS_PANIC))
-		panic("EXT3-fs panic from previous error\n");
+		panic("EXT3-fs: panic from previous error\n");
 
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
-	printk(KERN_CRIT "Remounting filesystem read-only\n");
+	ext3_msg(sb, KERN_CRIT,
+		"error: remounting filesystem read-only");
 	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
 	EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
@@ -296,7 +307,7 @@ void ext3_warning (struct super_block * sb, const char * function,
 	va_list args;
 
 	va_start(args, fmt);
-	printk(KERN_WARNING "EXT3-fs warning (device %s): %s: ",
+	printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ",
 	       sb->s_id, function);
 	vprintk(fmt, args);
 	printk("\n");
@@ -310,10 +321,10 @@ void ext3_update_dynamic_rev(struct super_block *sb)
 	if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
 		return;
 
-	ext3_warning(sb, __func__,
-		     "updating to rev %d because of new feature flag, "
-		     "running e2fsck is recommended",
-		     EXT3_DYNAMIC_REV);
+	ext3_msg(sb, KERN_WARNING,
+		"warning: updating to rev %d because of "
+		"new feature flag, running e2fsck is recommended",
+		EXT3_DYNAMIC_REV);
 
 	es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
 	es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
@@ -331,7 +342,7 @@ void ext3_update_dynamic_rev(struct super_block *sb)
 /*
  * Open the external journal device
  */
-static struct block_device *ext3_blkdev_get(dev_t dev)
+static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
 {
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
@@ -342,8 +353,9 @@ static struct block_device *ext3_blkdev_get(dev_t dev)
 	return bdev;
 
 fail:
-	printk(KERN_ERR "EXT3: failed to open journal device %s: %ld\n",
-			__bdevname(dev, b), PTR_ERR(bdev));
+	ext3_msg(sb, "error: failed to open journal device %s: %ld",
+		__bdevname(dev, b), PTR_ERR(bdev));
+
 	return NULL;
 }
 
@@ -378,13 +390,13 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
 {
 	struct list_head *l;
 
-	printk(KERN_ERR "sb orphan head is %d\n",
+	ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d",
 	       le32_to_cpu(sbi->s_es->s_last_orphan));
 
-	printk(KERN_ERR "sb_info orphan list:\n");
+	ext3_msg(sb, KERN_ERR, "sb_info orphan list:");
 	list_for_each(l, &sbi->s_orphan) {
 		struct inode *inode = orphan_list_entry(l);
-		printk(KERN_ERR "  "
+		ext3_msg(sb, KERN_ERR, "  "
 		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
 		       inode->i_sb->s_id, inode->i_ino, inode,
 		       inode->i_mode, inode->i_nlink,
@@ -527,9 +539,22 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
 #if defined(CONFIG_QUOTA)
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 
-	if (sbi->s_jquota_fmt)
-		seq_printf(seq, ",jqfmt=%s",
-		(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+	if (sbi->s_jquota_fmt) {
+		char *fmtname = "";
+
+		switch (sbi->s_jquota_fmt) {
+		case QFMT_VFS_OLD:
+			fmtname = "vfsold";
+			break;
+		case QFMT_VFS_V0:
+			fmtname = "vfsv0";
+			break;
+		case QFMT_VFS_V1:
+			fmtname = "vfsv1";
+			break;
+		}
+		seq_printf(seq, ",jqfmt=%s", fmtname);
+	}
 
 	if (sbi->s_qf_names[USRQUOTA])
 		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -636,6 +661,9 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, DATA_ERR_ABORT))
 		seq_puts(seq, ",data_err=abort");
 
+	if (test_opt(sb, NOLOAD))
+		seq_puts(seq, ",norecovery");
+
 	ext3_show_quota_options(seq, sb);
 
 	return 0;
@@ -787,9 +815,9 @@ enum {
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
-	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
-	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-	Opt_grpquota
+	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+	Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
+	Opt_usrquota, Opt_grpquota
 };
 
 static const match_table_t tokens = {
@@ -818,6 +846,7 @@ static const match_table_t tokens = {
 	{Opt_reservation, "reservation"},
 	{Opt_noreservation, "noreservation"},
 	{Opt_noload, "noload"},
+	{Opt_noload, "norecovery"},
 	{Opt_nobh, "nobh"},
 	{Opt_bh, "bh"},
 	{Opt_commit, "commit=%u"},
@@ -836,6 +865,7 @@ static const match_table_t tokens = {
 	{Opt_grpjquota, "grpjquota=%s"},
 	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
 	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
+	{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
 	{Opt_grpquota, "grpquota"},
 	{Opt_noquota, "noquota"},
 	{Opt_quota, "quota"},
@@ -845,7 +875,7 @@ static const match_table_t tokens = {
 	{Opt_err, NULL},
 };
 
-static ext3_fsblk_t get_sb_block(void **data)
+static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
 {
 	ext3_fsblk_t	sb_block;
 	char		*options = (char *) *data;
@@ -856,7 +886,7 @@ static ext3_fsblk_t get_sb_block(void **data)
 	/*todo: use simple_strtoll with >32bit ext3 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
-		printk("EXT3-fs: Invalid sb specification: %s\n",
+		ext3_msg(sb, "error: invalid sb specification: %s",
 		       (char *) *data);
 		return 1;
 	}
@@ -956,7 +986,8 @@ static int parse_options (char *options, struct super_block *sb,
 #else
 		case Opt_user_xattr:
 		case Opt_nouser_xattr:
-			printk("EXT3 (no)user_xattr options not supported\n");
+			ext3_msg(sb, KERN_INFO,
+				"(no)user_xattr options not supported");
 			break;
 #endif
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -969,7 +1000,8 @@ static int parse_options (char *options, struct super_block *sb,
 #else
 		case Opt_acl:
 		case Opt_noacl:
-			printk("EXT3 (no)acl options not supported\n");
+			ext3_msg(sb, KERN_INFO,
+				"(no)acl options not supported");
 			break;
 #endif
 		case Opt_reservation:
@@ -985,16 +1017,16 @@ static int parse_options (char *options, struct super_block *sb,
 			   user to specify an existing inode to be the
 			   journal file. */
 			if (is_remount) {
-				printk(KERN_ERR "EXT3-fs: cannot specify "
-				       "journal on remount\n");
+				ext3_msg(sb, KERN_ERR, "error: cannot specify "
+					"journal on remount");
 				return 0;
 			}
 			set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
 			break;
 		case Opt_journal_inum:
 			if (is_remount) {
-				printk(KERN_ERR "EXT3-fs: cannot specify "
-				       "journal on remount\n");
+				ext3_msg(sb, KERN_ERR, "error: cannot specify "
+				       "journal on remount");
 				return 0;
 			}
 			if (match_int(&args[0], &option))
@@ -1003,8 +1035,8 @@ static int parse_options (char *options, struct super_block *sb,
 			break;
 		case Opt_journal_dev:
 			if (is_remount) {
-				printk(KERN_ERR "EXT3-fs: cannot specify "
-				       "journal on remount\n");
+				ext3_msg(sb, KERN_ERR, "error: cannot specify "
+				       "journal on remount");
 				return 0;
 			}
 			if (match_int(&args[0], &option))
@@ -1036,12 +1068,11 @@ static int parse_options (char *options, struct super_block *sb,
 				if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
 						== data_opt)
 					break;
-				printk(KERN_ERR
-					"EXT3-fs (device %s): Cannot change "
+				ext3_msg(sb, KERN_ERR,
+					"error: cannot change "
 					"data mode on remount. The filesystem "
 					"is mounted in data=%s mode and you "
-					"try to remount it in data=%s mode.\n",
-					sb->s_id,
+					"try to remount it in data=%s mode.",
 					data_mode_string(sbi->s_mount_opt &
 							EXT3_MOUNT_DATA_FLAGS),
 					data_mode_string(data_opt));
@@ -1066,31 +1097,31 @@ static int parse_options (char *options, struct super_block *sb,
 set_qf_name:
 			if (sb_any_quota_loaded(sb) &&
 			    !sbi->s_qf_names[qtype]) {
-				printk(KERN_ERR
-					"EXT3-fs: Cannot change journaled "
-					"quota options when quota turned on.\n");
+				ext3_msg(sb, KERN_ERR,
+					"error: cannot change journaled "
+					"quota options when quota turned on.");
 				return 0;
 			}
 			qname = match_strdup(&args[0]);
 			if (!qname) {
-				printk(KERN_ERR
-					"EXT3-fs: not enough memory for "
-					"storing quotafile name.\n");
+				ext3_msg(sb, KERN_ERR,
+					"error: not enough memory for "
+					"storing quotafile name.");
 				return 0;
 			}
 			if (sbi->s_qf_names[qtype] &&
 			    strcmp(sbi->s_qf_names[qtype], qname)) {
-				printk(KERN_ERR
-					"EXT3-fs: %s quota file already "
-					"specified.\n", QTYPE2NAME(qtype));
+				ext3_msg(sb, KERN_ERR,
+					"error: %s quota file already "
+					"specified.", QTYPE2NAME(qtype));
 				kfree(qname);
 				return 0;
 			}
 			sbi->s_qf_names[qtype] = qname;
 			if (strchr(sbi->s_qf_names[qtype], '/')) {
-				printk(KERN_ERR
-					"EXT3-fs: quotafile must be on "
-					"filesystem root.\n");
+				ext3_msg(sb, KERN_ERR,
+					"error: quotafile must be on "
+					"filesystem root.");
 				kfree(sbi->s_qf_names[qtype]);
 				sbi->s_qf_names[qtype] = NULL;
 				return 0;
@@ -1105,9 +1136,9 @@ set_qf_name:
 clear_qf_name:
 			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_qf_names[qtype]) {
-				printk(KERN_ERR "EXT3-fs: Cannot change "
+				ext3_msg(sb, KERN_ERR, "error: cannot change "
 					"journaled quota options when "
-					"quota turned on.\n");
+					"quota turned on.");
 				return 0;
 			}
 			/*
@@ -1121,12 +1152,15 @@ clear_qf_name:
 			goto set_qf_format;
 		case Opt_jqfmt_vfsv0:
 			qfmt = QFMT_VFS_V0;
+			goto set_qf_format;
+		case Opt_jqfmt_vfsv1:
+			qfmt = QFMT_VFS_V1;
 set_qf_format:
 			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != qfmt) {
-				printk(KERN_ERR "EXT3-fs: Cannot change "
+				ext3_msg(sb, KERN_ERR, "error: cannot change "
 					"journaled quota options when "
-					"quota turned on.\n");
+					"quota turned on.");
 				return 0;
 			}
 			sbi->s_jquota_fmt = qfmt;
@@ -1142,8 +1176,8 @@ set_qf_format:
 			break;
 		case Opt_noquota:
 			if (sb_any_quota_loaded(sb)) {
-				printk(KERN_ERR "EXT3-fs: Cannot change quota "
-					"options when quota turned on.\n");
+				ext3_msg(sb, KERN_ERR, "error: cannot change "
+					"quota options when quota turned on.");
 				return 0;
 			}
 			clear_opt(sbi->s_mount_opt, QUOTA);
@@ -1154,8 +1188,8 @@ set_qf_format:
 		case Opt_quota:
 		case Opt_usrquota:
 		case Opt_grpquota:
-			printk(KERN_ERR
-				"EXT3-fs: quota options not supported.\n");
+			ext3_msg(sb, KERN_ERR,
+				"error: quota options not supported.");
 			break;
 		case Opt_usrjquota:
 		case Opt_grpjquota:
@@ -1163,9 +1197,10 @@ set_qf_format:
 		case Opt_offgrpjquota:
 		case Opt_jqfmt_vfsold:
 		case Opt_jqfmt_vfsv0:
-			printk(KERN_ERR
-				"EXT3-fs: journaled quota options not "
-				"supported.\n");
+		case Opt_jqfmt_vfsv1:
+			ext3_msg(sb, KERN_ERR,
+				"error: journaled quota options not "
+				"supported.");
 			break;
 		case Opt_noquota:
 			break;
@@ -1185,8 +1220,9 @@ set_qf_format:
 			break;
 		case Opt_resize:
 			if (!is_remount) {
-				printk("EXT3-fs: resize option only available "
-					"for remount\n");
+				ext3_msg(sb, KERN_ERR,
+					"error: resize option only available "
+					"for remount");
 				return 0;
 			}
 			if (match_int(&args[0], &option) != 0)
@@ -1200,9 +1236,9 @@ set_qf_format:
 			clear_opt(sbi->s_mount_opt, NOBH);
 			break;
 		default:
-			printk (KERN_ERR
-				"EXT3-fs: Unrecognized mount option \"%s\" "
-				"or missing value\n", p);
+			ext3_msg(sb, KERN_ERR,
+				"error: unrecognized mount option \"%s\" "
+				"or missing value", p);
 			return 0;
 		}
 	}
@@ -1220,21 +1256,21 @@ set_qf_format:
 				(sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) ||
 		    (sbi->s_qf_names[GRPQUOTA] &&
 				(sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
-			printk(KERN_ERR "EXT3-fs: old and new quota "
-					"format mixing.\n");
+			ext3_msg(sb, KERN_ERR, "error: old and new quota "
+					"format mixing.");
 			return 0;
 		}
 
 		if (!sbi->s_jquota_fmt) {
-			printk(KERN_ERR "EXT3-fs: journaled quota format "
-					"not specified.\n");
+			ext3_msg(sb, KERN_ERR, "error: journaled quota format "
+					"not specified.");
 			return 0;
 		}
 	} else {
 		if (sbi->s_jquota_fmt) {
-			printk(KERN_ERR "EXT3-fs: journaled quota format "
+			ext3_msg(sb, KERN_ERR, "error: journaled quota format "
 					"specified with no journaling "
-					"enabled.\n");
+					"enabled.");
 			return 0;
 		}
 	}
@@ -1249,31 +1285,33 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 	int res = 0;
 
 	if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
-		printk (KERN_ERR "EXT3-fs warning: revision level too high, "
-			"forcing read-only mode\n");
+		ext3_msg(sb, KERN_ERR,
+			"error: revision level too high, "
+			"forcing read-only mode");
 		res = MS_RDONLY;
 	}
 	if (read_only)
 		return res;
 	if (!(sbi->s_mount_state & EXT3_VALID_FS))
-		printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, "
-			"running e2fsck is recommended\n");
+		ext3_msg(sb, KERN_WARNING,
+			"warning: mounting unchecked fs, "
+			"running e2fsck is recommended");
 	else if ((sbi->s_mount_state & EXT3_ERROR_FS))
-		printk (KERN_WARNING
-			"EXT3-fs warning: mounting fs with errors, "
-			"running e2fsck is recommended\n");
+		ext3_msg(sb, KERN_WARNING,
+			"warning: mounting fs with errors, "
+			"running e2fsck is recommended");
 	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
 		 le16_to_cpu(es->s_mnt_count) >=
 		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
-		printk (KERN_WARNING
-			"EXT3-fs warning: maximal mount count reached, "
-			"running e2fsck is recommended\n");
+		ext3_msg(sb, KERN_WARNING,
+			"warning: maximal mount count reached, "
+			"running e2fsck is recommended");
 	else if (le32_to_cpu(es->s_checkinterval) &&
 		(le32_to_cpu(es->s_lastcheck) +
 			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
-		printk (KERN_WARNING
-			"EXT3-fs warning: checktime reached, "
-			"running e2fsck is recommended\n");
+		ext3_msg(sb, KERN_WARNING,
+			"warning: checktime reached, "
+			"running e2fsck is recommended");
 #if 0
 		/* @@@ We _will_ want to clear the valid bit if we find
                    inconsistencies, to force a fsck at reboot.  But for
@@ -1290,22 +1328,20 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 
 	ext3_commit_super(sb, es, 1);
 	if (test_opt(sb, DEBUG))
-		printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, "
-				"bpg=%lu, ipg=%lu, mo=%04lx]\n",
+		ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, "
+				"bpg=%lu, ipg=%lu, mo=%04lx]",
 			sb->s_blocksize,
 			sbi->s_groups_count,
 			EXT3_BLOCKS_PER_GROUP(sb),
 			EXT3_INODES_PER_GROUP(sb),
 			sbi->s_mount_opt);
 
-	printk(KERN_INFO "EXT3 FS on %s, ", sb->s_id);
 	if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
 		char b[BDEVNAME_SIZE];
-
-		printk("external journal on %s\n",
+		ext3_msg(sb, KERN_INFO, "using external journal on %s",
 			bdevname(EXT3_SB(sb)->s_journal->j_dev, b));
 	} else {
-		printk("internal journal\n");
+		ext3_msg(sb, KERN_INFO, "using internal journal");
 	}
 	return res;
 }
@@ -1399,8 +1435,8 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 	}
 
 	if (bdev_read_only(sb->s_bdev)) {
-		printk(KERN_ERR "EXT3-fs: write access "
-			"unavailable, skipping orphan cleanup.\n");
+		ext3_msg(sb, KERN_ERR, "error: write access "
+			"unavailable, skipping orphan cleanup.");
 		return;
 	}
 
@@ -1414,8 +1450,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 	}
 
 	if (s_flags & MS_RDONLY) {
-		printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n",
-		       sb->s_id);
+		ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
 		sb->s_flags &= ~MS_RDONLY;
 	}
 #ifdef CONFIG_QUOTA
@@ -1426,9 +1461,9 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 		if (EXT3_SB(sb)->s_qf_names[i]) {
 			int ret = ext3_quota_on_mount(sb, i);
 			if (ret < 0)
-				printk(KERN_ERR
-					"EXT3-fs: Cannot turn on journaled "
-					"quota: error %d\n", ret);
+				ext3_msg(sb, KERN_ERR,
+					"error: cannot turn on journaled "
+					"quota: %d", ret);
 		}
 	}
 #endif
@@ -1466,11 +1501,11 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 #define PLURAL(x) (x), ((x)==1) ? "" : "s"
 
 	if (nr_orphans)
-		printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n",
-		       sb->s_id, PLURAL(nr_orphans));
+		ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+		       PLURAL(nr_orphans));
 	if (nr_truncates)
-		printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n",
-		       sb->s_id, PLURAL(nr_truncates));
+		ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+		       PLURAL(nr_truncates));
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
@@ -1554,7 +1589,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	struct ext3_super_block *es = NULL;
 	struct ext3_sb_info *sbi;
 	ext3_fsblk_t block;
-	ext3_fsblk_t sb_block = get_sb_block(&data);
+	ext3_fsblk_t sb_block = get_sb_block(&data, sb);
 	ext3_fsblk_t logic_sb_block;
 	unsigned long offset = 0;
 	unsigned int journal_inum = 0;
@@ -1590,7 +1625,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 
 	blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
 	if (!blocksize) {
-		printk(KERN_ERR "EXT3-fs: unable to set blocksize\n");
+		ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
 		goto out_fail;
 	}
 
@@ -1606,7 +1641,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	}
 
 	if (!(bh = sb_bread(sb, logic_sb_block))) {
-		printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
+		ext3_msg(sb, KERN_ERR, "error: unable to read superblock");
 		goto out_fail;
 	}
 	/*
@@ -1665,9 +1700,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	    (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
-		printk(KERN_WARNING
-		       "EXT3-fs warning: feature flags set on rev 0 fs, "
-		       "running e2fsck is recommended\n");
+		ext3_msg(sb, KERN_WARNING,
+			"warning: feature flags set on rev 0 fs, "
+			"running e2fsck is recommended");
 	/*
 	 * Check feature flags regardless of the revision level, since we
 	 * previously didn't change the revision level when setting the flags,
@@ -1675,25 +1710,25 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	 */
 	features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
 	if (features) {
-		printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
-		       "unsupported optional features (%x).\n",
-		       sb->s_id, le32_to_cpu(features));
+		ext3_msg(sb, KERN_ERR,
+			"error: couldn't mount because of unsupported "
+			"optional features (%x)", le32_to_cpu(features));
 		goto failed_mount;
 	}
 	features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
 	if (!(sb->s_flags & MS_RDONLY) && features) {
-		printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
-		       "unsupported optional features (%x).\n",
-		       sb->s_id, le32_to_cpu(features));
+		ext3_msg(sb, KERN_ERR,
+			"error: couldn't mount RDWR because of unsupported "
+			"optional features (%x)", le32_to_cpu(features));
 		goto failed_mount;
 	}
 	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
 
 	if (blocksize < EXT3_MIN_BLOCK_SIZE ||
 	    blocksize > EXT3_MAX_BLOCK_SIZE) {
-		printk(KERN_ERR
-		       "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
-		       blocksize, sb->s_id);
+		ext3_msg(sb, KERN_ERR,
+			"error: couldn't mount because of unsupported "
+			"filesystem blocksize %d", blocksize);
 		goto failed_mount;
 	}
 
@@ -1704,30 +1739,31 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		 * than the hardware sectorsize for the machine.
 		 */
 		if (blocksize < hblock) {
-			printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
-			       "device blocksize %d.\n", blocksize, hblock);
+			ext3_msg(sb, KERN_ERR,
+				"error: fsblocksize %d too small for "
+				"hardware sectorsize %d", blocksize, hblock);
 			goto failed_mount;
 		}
 
 		brelse (bh);
 		if (!sb_set_blocksize(sb, blocksize)) {
-			printk(KERN_ERR "EXT3-fs: bad blocksize %d.\n",
-				blocksize);
+			ext3_msg(sb, KERN_ERR,
+				"error: bad blocksize %d", blocksize);
 			goto out_fail;
 		}
 		logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
 		offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
 		bh = sb_bread(sb, logic_sb_block);
 		if (!bh) {
-			printk(KERN_ERR
-			       "EXT3-fs: Can't read superblock on 2nd try.\n");
+			ext3_msg(sb, KERN_ERR,
+			       "error: can't read superblock on 2nd try");
 			goto failed_mount;
 		}
 		es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
 		sbi->s_es = es;
 		if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
-			printk (KERN_ERR
-				"EXT3-fs: Magic mismatch, very weird !\n");
+			ext3_msg(sb, KERN_ERR,
+				"error: magic mismatch");
 			goto failed_mount;
 		}
 	}
@@ -1743,8 +1779,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
 		    (!is_power_of_2(sbi->s_inode_size)) ||
 		    (sbi->s_inode_size > blocksize)) {
-			printk (KERN_ERR
-				"EXT3-fs: unsupported inode size: %d\n",
+			ext3_msg(sb, KERN_ERR,
+				"error: unsupported inode size: %d",
 				sbi->s_inode_size);
 			goto failed_mount;
 		}
@@ -1752,8 +1788,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
 				   le32_to_cpu(es->s_log_frag_size);
 	if (blocksize != sbi->s_frag_size) {
-		printk(KERN_ERR
-		       "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
+		ext3_msg(sb, KERN_ERR,
+		       "error: fragsize %lu != blocksize %u (unsupported)",
 		       sbi->s_frag_size, blocksize);
 		goto failed_mount;
 	}
@@ -1789,31 +1825,31 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	}
 
 	if (sbi->s_blocks_per_group > blocksize * 8) {
-		printk (KERN_ERR
-			"EXT3-fs: #blocks per group too big: %lu\n",
+		ext3_msg(sb, KERN_ERR,
+			"#blocks per group too big: %lu",
 			sbi->s_blocks_per_group);
 		goto failed_mount;
 	}
 	if (sbi->s_frags_per_group > blocksize * 8) {
-		printk (KERN_ERR
-			"EXT3-fs: #fragments per group too big: %lu\n",
+		ext3_msg(sb, KERN_ERR,
+			"error: #fragments per group too big: %lu",
 			sbi->s_frags_per_group);
 		goto failed_mount;
 	}
 	if (sbi->s_inodes_per_group > blocksize * 8) {
-		printk (KERN_ERR
-			"EXT3-fs: #inodes per group too big: %lu\n",
+		ext3_msg(sb, KERN_ERR,
+			"error: #inodes per group too big: %lu",
 			sbi->s_inodes_per_group);
 		goto failed_mount;
 	}
 
 	if (le32_to_cpu(es->s_blocks_count) >
 		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
-		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
-			" too large to mount safely\n", sb->s_id);
+		ext3_msg(sb, KERN_ERR,
+			"error: filesystem is too large to mount safely");
 		if (sizeof(sector_t) < 8)
-			printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not "
-					"enabled\n");
+			ext3_msg(sb, KERN_ERR,
+				"error: CONFIG_LBDAF not enabled");
 		goto failed_mount;
 	}
 
@@ -1827,7 +1863,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
 				    GFP_KERNEL);
 	if (sbi->s_group_desc == NULL) {
-		printk (KERN_ERR "EXT3-fs: not enough memory\n");
+		ext3_msg(sb, KERN_ERR,
+			"error: not enough memory");
 		goto failed_mount;
 	}
 
@@ -1837,14 +1874,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		block = descriptor_loc(sb, logic_sb_block, i);
 		sbi->s_group_desc[i] = sb_bread(sb, block);
 		if (!sbi->s_group_desc[i]) {
-			printk (KERN_ERR "EXT3-fs: "
-				"can't read group descriptor %d\n", i);
+			ext3_msg(sb, KERN_ERR,
+				"error: can't read group descriptor %d", i);
 			db_count = i;
 			goto failed_mount2;
 		}
 	}
 	if (!ext3_check_descriptors (sb)) {
-		printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n");
+		ext3_msg(sb, KERN_ERR,
+			"error: group descriptors corrupted");
 		goto failed_mount2;
 	}
 	sbi->s_gdb_count = db_count;
@@ -1862,7 +1900,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 				ext3_count_dirs(sb));
 	}
 	if (err) {
-		printk(KERN_ERR "EXT3-fs: insufficient memory\n");
+		ext3_msg(sb, KERN_ERR, "error: insufficient memory");
 		goto failed_mount3;
 	}
 
@@ -1910,9 +1948,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 			goto failed_mount3;
 	} else {
 		if (!silent)
-			printk (KERN_ERR
-				"ext3: No journal on filesystem on %s\n",
-				sb->s_id);
+			ext3_msg(sb, KERN_ERR,
+				"error: no journal found. "
+				"mounting ext3 over ext2?");
 		goto failed_mount3;
 	}
 
@@ -1934,8 +1972,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	case EXT3_MOUNT_WRITEBACK_DATA:
 		if (!journal_check_available_features
 		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
-			printk(KERN_ERR "EXT3-fs: Journal does not support "
-			       "requested data journaling mode\n");
+			ext3_msg(sb, KERN_ERR,
+				"error: journal does not support "
+				"requested data journaling mode");
 			goto failed_mount4;
 		}
 	default:
@@ -1944,8 +1983,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
-			printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - "
-				"its supported only with writeback mode\n");
+			ext3_msg(sb, KERN_WARNING,
+				"warning: ignoring nobh option - "
+				"it is supported only with writeback mode");
 			clear_opt(sbi->s_mount_opt, NOBH);
 		}
 	}
@@ -1956,18 +1996,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 
 	root = ext3_iget(sb, EXT3_ROOT_INO);
 	if (IS_ERR(root)) {
-		printk(KERN_ERR "EXT3-fs: get root inode failed\n");
+		ext3_msg(sb, KERN_ERR, "error: get root inode failed");
 		ret = PTR_ERR(root);
 		goto failed_mount4;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		iput(root);
-		printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
+		ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
 		goto failed_mount4;
 	}
 	sb->s_root = d_alloc_root(root);
 	if (!sb->s_root) {
-		printk(KERN_ERR "EXT3-fs: get root dentry failed\n");
+		ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
 		iput(root);
 		ret = -ENOMEM;
 		goto failed_mount4;
@@ -1986,9 +2026,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	ext3_orphan_cleanup(sb, es);
 	EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
 	if (needs_recovery)
-		printk (KERN_INFO "EXT3-fs: recovery complete.\n");
+		ext3_msg(sb, KERN_INFO, "recovery complete");
 	ext3_mark_recovery_complete(sb, es);
-	printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
+	ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
 		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
 		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
 		"writeback");
@@ -1998,7 +2038,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 
 cantfind_ext3:
 	if (!silent)
-		printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n",
+		ext3_msg(sb, KERN_INFO,
+			"error: can't find ext3 filesystem on dev %s.",
 		       sb->s_id);
 	goto failed_mount;
 
@@ -2066,27 +2107,27 @@ static journal_t *ext3_get_journal(struct super_block *sb,
 
 	journal_inode = ext3_iget(sb, journal_inum);
 	if (IS_ERR(journal_inode)) {
-		printk(KERN_ERR "EXT3-fs: no journal found.\n");
+		ext3_msg(sb, KERN_ERR, "error: no journal found");
 		return NULL;
 	}
 	if (!journal_inode->i_nlink) {
 		make_bad_inode(journal_inode);
 		iput(journal_inode);
-		printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n");
+		ext3_msg(sb, KERN_ERR, "error: journal inode is deleted");
 		return NULL;
 	}
 
 	jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
 		  journal_inode, journal_inode->i_size);
 	if (!S_ISREG(journal_inode->i_mode)) {
-		printk(KERN_ERR "EXT3-fs: invalid journal inode.\n");
+		ext3_msg(sb, KERN_ERR, "error: invalid journal inode");
 		iput(journal_inode);
 		return NULL;
 	}
 
 	journal = journal_init_inode(journal_inode);
 	if (!journal) {
-		printk(KERN_ERR "EXT3-fs: Could not load journal inode\n");
+		ext3_msg(sb, KERN_ERR, "error: could not load journal inode");
 		iput(journal_inode);
 		return NULL;
 	}
@@ -2108,13 +2149,13 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	struct ext3_super_block * es;
 	struct block_device *bdev;
 
-	bdev = ext3_blkdev_get(j_dev);
+	bdev = ext3_blkdev_get(j_dev, sb);
 	if (bdev == NULL)
 		return NULL;
 
 	if (bd_claim(bdev, sb)) {
-		printk(KERN_ERR
-		        "EXT3: failed to claim external journal device.\n");
+		ext3_msg(sb, KERN_ERR,
+			"error: failed to claim external journal device");
 		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 		return NULL;
 	}
@@ -2122,8 +2163,8 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	blocksize = sb->s_blocksize;
 	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
-		printk(KERN_ERR
-			"EXT3-fs: blocksize too small for journal device.\n");
+		ext3_msg(sb, KERN_ERR,
+			"error: blocksize too small for journal device");
 		goto out_bdev;
 	}
 
@@ -2131,8 +2172,8 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	offset = EXT3_MIN_BLOCK_SIZE % blocksize;
 	set_blocksize(bdev, blocksize);
 	if (!(bh = __bread(bdev, sb_block, blocksize))) {
-		printk(KERN_ERR "EXT3-fs: couldn't read superblock of "
-		       "external journal\n");
+		ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of "
+			"external journal");
 		goto out_bdev;
 	}
 
@@ -2140,14 +2181,14 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
 	    !(le32_to_cpu(es->s_feature_incompat) &
 	      EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
-		printk(KERN_ERR "EXT3-fs: external journal has "
-					"bad superblock\n");
+		ext3_msg(sb, KERN_ERR, "error: external journal has "
+			"bad superblock");
 		brelse(bh);
 		goto out_bdev;
 	}
 
 	if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
-		printk(KERN_ERR "EXT3-fs: journal UUID does not match\n");
+		ext3_msg(sb, KERN_ERR, "error: journal UUID does not match");
 		brelse(bh);
 		goto out_bdev;
 	}
@@ -2159,19 +2200,21 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	journal = journal_init_dev(bdev, sb->s_bdev,
 					start, len, blocksize);
 	if (!journal) {
-		printk(KERN_ERR "EXT3-fs: failed to create device journal\n");
+		ext3_msg(sb, KERN_ERR,
+			"error: failed to create device journal");
 		goto out_bdev;
 	}
 	journal->j_private = sb;
 	ll_rw_block(READ, 1, &journal->j_sb_buffer);
 	wait_on_buffer(journal->j_sb_buffer);
 	if (!buffer_uptodate(journal->j_sb_buffer)) {
-		printk(KERN_ERR "EXT3-fs: I/O error on journal device\n");
+		ext3_msg(sb, KERN_ERR, "I/O error on journal device");
 		goto out_journal;
 	}
 	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
-		printk(KERN_ERR "EXT3-fs: External journal has more than one "
-					"user (unsupported) - %d\n",
+		ext3_msg(sb, KERN_ERR,
+			"error: external journal has more than one "
+			"user (unsupported) - %d",
 			be32_to_cpu(journal->j_superblock->s_nr_users));
 		goto out_journal;
 	}
@@ -2197,8 +2240,8 @@ static int ext3_load_journal(struct super_block *sb,
 
 	if (journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
-		printk(KERN_INFO "EXT3-fs: external journal device major/minor "
-			"numbers have changed\n");
+		ext3_msg(sb, KERN_INFO, "external journal device major/minor "
+			"numbers have changed");
 		journal_dev = new_decode_dev(journal_devnum);
 	} else
 		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -2213,21 +2256,21 @@ static int ext3_load_journal(struct super_block *sb,
 
 	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
 		if (sb->s_flags & MS_RDONLY) {
-			printk(KERN_INFO "EXT3-fs: INFO: recovery "
-					"required on readonly filesystem.\n");
+			ext3_msg(sb, KERN_INFO,
+				"recovery required on readonly filesystem");
 			if (really_read_only) {
-				printk(KERN_ERR "EXT3-fs: write access "
-					"unavailable, cannot proceed.\n");
+				ext3_msg(sb, KERN_ERR, "error: write access "
+					"unavailable, cannot proceed");
 				return -EROFS;
 			}
-			printk (KERN_INFO "EXT3-fs: write access will "
-					"be enabled during recovery.\n");
+			ext3_msg(sb, KERN_INFO,
+				"write access will be enabled during recovery");
 		}
 	}
 
 	if (journal_inum && journal_dev) {
-		printk(KERN_ERR "EXT3-fs: filesystem has both journal "
-		       "and inode journals!\n");
+		ext3_msg(sb, KERN_ERR, "error: filesystem has both journal "
+		       "and inode journals");
 		return -EINVAL;
 	}
 
@@ -2242,7 +2285,7 @@ static int ext3_load_journal(struct super_block *sb,
 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
 		err = journal_update_format(journal);
 		if (err)  {
-			printk(KERN_ERR "EXT3-fs: error updating journal.\n");
+			ext3_msg(sb, KERN_ERR, "error updating journal");
 			journal_destroy(journal);
 			return err;
 		}
@@ -2254,7 +2297,7 @@ static int ext3_load_journal(struct super_block *sb,
 		err = journal_load(journal);
 
 	if (err) {
-		printk(KERN_ERR "EXT3-fs: error loading journal.\n");
+		ext3_msg(sb, KERN_ERR, "error loading journal");
 		journal_destroy(journal);
 		return err;
 	}
@@ -2273,16 +2316,17 @@ static int ext3_load_journal(struct super_block *sb,
 	return 0;
 }
 
-static int ext3_create_journal(struct super_block * sb,
-			       struct ext3_super_block * es,
+static int ext3_create_journal(struct super_block *sb,
+			       struct ext3_super_block *es,
 			       unsigned int journal_inum)
 {
 	journal_t *journal;
 	int err;
 
 	if (sb->s_flags & MS_RDONLY) {
-		printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
-				"create journal.\n");
+		ext3_msg(sb, KERN_ERR,
+			"error: readonly filesystem when trying to "
+			"create journal");
 		return -EROFS;
 	}
 
@@ -2290,12 +2334,12 @@ static int ext3_create_journal(struct super_block * sb,
 	if (!journal)
 		return -EINVAL;
 
-	printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n",
+	ext3_msg(sb, KERN_INFO, "creating new journal on inode %u",
 	       journal_inum);
 
 	err = journal_create(journal);
 	if (err) {
-		printk(KERN_ERR "EXT3-fs: error creating journal.\n");
+		ext3_msg(sb, KERN_ERR, "error creating journal");
 		journal_destroy(journal);
 		return -EIO;
 	}
@@ -2376,8 +2420,8 @@ out:
  * has recorded an error from a previous lifetime, move that error to the
  * main filesystem now.
  */
-static void ext3_clear_journal_err(struct super_block * sb,
-				   struct ext3_super_block * es)
+static void ext3_clear_journal_err(struct super_block *sb,
+				   struct ext3_super_block *es)
 {
 	journal_t *journal;
 	int j_errno;
@@ -2568,10 +2612,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			__le32 ret;
 			if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
 					~EXT3_FEATURE_RO_COMPAT_SUPP))) {
-				printk(KERN_WARNING "EXT3-fs: %s: couldn't "
-				       "remount RDWR because of unsupported "
-				       "optional features (%x).\n",
-				       sb->s_id, le32_to_cpu(ret));
+				ext3_msg(sb, KERN_WARNING,
+					"warning: couldn't remount RDWR "
+					"because of unsupported optional "
+					"features (%x)", le32_to_cpu(ret));
 				err = -EROFS;
 				goto restore_opts;
 			}
@@ -2582,11 +2626,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			 * require a full umount/remount for now.
 			 */
 			if (es->s_last_orphan) {
-				printk(KERN_WARNING "EXT3-fs: %s: couldn't "
+				ext3_msg(sb, KERN_WARNING, "warning: couldn't "
 				       "remount RDWR because of unprocessed "
 				       "orphan inode list.  Please "
-				       "umount/remount instead.\n",
-				       sb->s_id);
+				       "umount/remount instead.");
 				err = -EINVAL;
 				goto restore_opts;
 			}
@@ -2686,13 +2729,11 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
 	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
-	es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
 	buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
 	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
-	es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
 	buf->f_namelen = EXT3_NAME_LEN;
 	fsid = le64_to_cpup((void *)es->s_uuid) ^
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -2837,9 +2878,9 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
 	if (EXT3_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not of fs root? */
 		if (path.dentry->d_parent != sb->s_root)
-			printk(KERN_WARNING
-				"EXT3-fs: Quota file not on filesystem root. "
-				"Journaled quota will not work.\n");
+			ext3_msg(sb, KERN_WARNING,
+				"warning: Quota file not on filesystem root. "
+				"Journaled quota will not work.");
 	}
 
 	/*
@@ -2921,8 +2962,9 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
 	handle_t *handle = journal_current_handle();
 
 	if (!handle) {
-		printk(KERN_WARNING "EXT3-fs: Quota write (off=%Lu, len=%Lu)"
-			" cancelled because transaction is not started.\n",
+		ext3_msg(sb, KERN_WARNING,
+			"warning: quota write (off=%llu, len=%llu)"
+			" cancelled because transaction is not started.",
 			(unsigned long long)off, (unsigned long long)len);
 		return -EIO;
 	}
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 545e37c4b91..387d92d00b9 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -960,6 +960,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (error)
 		goto cleanup;
 
+	error = ext3_journal_get_write_access(handle, is.iloc.bh);
+	if (error)
+		goto cleanup;
+
 	if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
 		struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
 		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
@@ -985,9 +989,6 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (flags & XATTR_CREATE)
 			goto cleanup;
 	}
-	error = ext3_journal_get_write_access(handle, is.iloc.bh);
-	if (error)
-		goto cleanup;
 	if (!value) {
 		if (!is.s.not_found)
 			error = ext3_xattr_ibody_set(handle, inode, &i, &is);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 9f2d45d75b1..e5f6774846e 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -2,6 +2,7 @@ config EXT4_FS
 	tristate "The Extended 4 (ext4) filesystem"
 	select JBD2
 	select CRC16
+	select FS_JOURNAL_INFO
 	help
 	  This is the next generation of the ext3 filesystem.
 
@@ -26,6 +27,16 @@ config EXT4_FS
 
 	  If unsure, say N.
 
+config EXT4_USE_FOR_EXT23
+	bool "Use ext4 for ext2/ext3 file systems"
+	depends on EXT3_FS=n || EXT2_FS=n
+	default y
+	help
+	  Allow the ext4 file system driver code to be used for ext2 or
+	  ext3 file system mounts.  This allows users to reduce their
+	  compiled kernel size by using one file system driver for
+	  ext2, ext3, and ext4 file systems.
+
 config EXT4_FS_XATTR
 	bool "Ext4 extended attributes"
 	depends on EXT4_FS
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1d0418980f8..22bc7435d91 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -499,44 +499,6 @@ error_return:
 }
 
 /**
- * ext4_free_blocks() -- Free given blocks and update quota
- * @handle:		handle for this transaction
- * @inode:		inode
- * @block:		start physical block to free
- * @count:		number of blocks to count
- * @metadata: 		Are these metadata blocks
- */
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t block, unsigned long count,
-			int metadata)
-{
-	struct super_block *sb;
-	unsigned long dquot_freed_blocks;
-
-	/* this isn't the right place to decide whether block is metadata
-	 * inode.c/extents.c knows better, but for safety ... */
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		metadata = 1;
-
-	/* We need to make sure we don't reuse
-	 * block released untill the transaction commit.
-	 * writeback mode have weak data consistency so
-	 * don't force data as metadata when freeing block
-	 * for writeback mode.
-	 */
-	if (metadata == 0 && !ext4_should_writeback_data(inode))
-		metadata = 1;
-
-	sb = inode->i_sb;
-
-	ext4_mb_free_blocks(handle, inode, block, count,
-			    metadata, &dquot_freed_blocks);
-	if (dquot_freed_blocks)
-		vfs_dq_free_block(inode, dquot_freed_blocks);
-	return;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of needed blocks
@@ -761,7 +723,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
 					ext4_group_t group)
 {
-	return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
+	if (!ext4_bg_has_super(sb, group))
+		return 0;
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+		return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+	else
+		return EXT4_SB(sb)->s_gdb_count;
 }
 
 /**
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 50784ef0756..4df8621ec31 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb)
 		if (ext4_bg_has_super(sb, i) &&
 		    ((i < 5) || ((i % flex_size) == 0)))
 			add_system_zone(sbi, ext4_group_first_block_no(sb, i),
-					sbi->s_gdb_count + 1);
+					ext4_bg_num_gdb(sb, i) + 1);
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
 		if (ret)
@@ -228,6 +228,7 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
 	struct rb_node *n = sbi->system_blks.rb_node;
 
 	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+	    (start_blk + count < start_blk) ||
 	    (start_blk + count > ext4_blocks_count(sbi->s_es)))
 		return 0;
 	while (n) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8825515eedd..ab31e65d46d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -376,6 +376,12 @@ struct ext4_new_group_data {
 					 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
 
 /*
+ * Flags used by ext4_free_blocks
+ */
+#define EXT4_FREE_BLOCKS_METADATA	0x0001
+#define EXT4_FREE_BLOCKS_FORGET		0x0002
+
+/*
  * ioctl commands
  */
 #define	EXT4_IOC_GETFLAGS		FS_IOC_GETFLAGS
@@ -703,6 +709,13 @@ struct ext4_inode_info {
 	struct list_head i_aio_dio_complete_list;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
+
+	/*
+	 * Transactions that contain inode's metadata needed to complete
+	 * fsync and fdatasync, respectively.
+	 */
+	tid_t i_sync_tid;
+	tid_t i_datasync_tid;
 };
 
 /*
@@ -750,6 +763,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
+#define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
 
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)			o |= EXT4_MOUNT_##opt
@@ -1324,8 +1338,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 				ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1384,16 +1396,15 @@ extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
 extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
-extern void ext4_mb_free_blocks(handle_t *, struct inode *,
-		ext4_fsblk_t, unsigned long, int, unsigned long *);
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
+			     struct buffer_head *bh, ext4_fsblk_t block,
+			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
 /* inode.c */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-		struct buffer_head *bh, ext4_fsblk_t blocknr);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
 struct buffer_head *ext4_bread(handle_t *, struct inode *,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6a9409920de..b57e5c711b6 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -4,6 +4,8 @@
 
 #include "ext4_jbd2.h"
 
+#include <trace/events/ext4.h>
+
 int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
 				struct buffer_head *bh)
 {
@@ -32,35 +34,69 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
 	return err;
 }
 
-int __ext4_journal_forget(const char *where, handle_t *handle,
-				struct buffer_head *bh)
+/*
+ * The ext4 forget function must perform a revoke if we are freeing data
+ * which has been journaled.  Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+ *
+ * "bh" may be NULL: a metadata block may have been freed from memory
+ * but there may still be a record of it in the journal, and that record
+ * still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling, but we still need to
+ * call into ext4_journal_revoke() to put the buffer head.
+ */
+int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
+		  struct inode *inode, struct buffer_head *bh,
+		  ext4_fsblk_t blocknr)
 {
-	int err = 0;
+	int err;
 
-	if (ext4_handle_valid(handle)) {
-		err = jbd2_journal_forget(handle, bh);
-		if (err)
-			ext4_journal_abort_handle(where, __func__, bh,
-						  handle, err);
-	}
-	else
+	might_sleep();
+
+	trace_ext4_forget(inode, is_metadata, blocknr);
+	BUFFER_TRACE(bh, "enter");
+
+	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+		  "data mode %x\n",
+		  bh, is_metadata, inode->i_mode,
+		  test_opt(inode->i_sb, DATA_FLAGS));
+
+	/* In the no journal case, we can just do a bforget and return */
+	if (!ext4_handle_valid(handle)) {
 		bforget(bh);
-	return err;
-}
+		return 0;
+	}
 
-int __ext4_journal_revoke(const char *where, handle_t *handle,
-				ext4_fsblk_t blocknr, struct buffer_head *bh)
-{
-	int err = 0;
+	/* Never use the revoke function if we are doing full data
+	 * journaling: there is no need to, and a V1 superblock won't
+	 * support it.  Otherwise, only skip the revoke on un-journaled
+	 * data blocks. */
 
-	if (ext4_handle_valid(handle)) {
-		err = jbd2_journal_revoke(handle, blocknr, bh);
-		if (err)
-			ext4_journal_abort_handle(where, __func__, bh,
-						  handle, err);
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+	    (!is_metadata && !ext4_should_journal_data(inode))) {
+		if (bh) {
+			BUFFER_TRACE(bh, "call jbd2_journal_forget");
+			err = jbd2_journal_forget(handle, bh);
+			if (err)
+				ext4_journal_abort_handle(where, __func__, bh,
+							  handle, err);
+			return err;
+		}
+		return 0;
 	}
-	else
-		bforget(bh);
+
+	/*
+	 * data!=journal && (is_metadata || should_journal_data(inode))
+	 */
+	BUFFER_TRACE(bh, "call jbd2_journal_revoke");
+	err = jbd2_journal_revoke(handle, blocknr, bh);
+	if (err) {
+		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+		ext4_abort(inode->i_sb, __func__,
+			   "error %d when attempting revoke", err);
+	}
+	BUFFER_TRACE(bh, "exit");
 	return err;
 }
 
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a2865980342..05eca817d70 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -49,7 +49,7 @@
 
 #define EXT4_DATA_TRANS_BLOCKS(sb)	(EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
 					 EXT4_XATTR_TRANS_BLOCKS - 2 + \
-					 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+					 EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
 
 /*
  * Define the number of metadata blocks we need to account to modify data.
@@ -57,7 +57,7 @@
  * This include super block, inode block, quota blocks and xattr blocks
  */
 #define EXT4_META_TRANS_BLOCKS(sb)	(EXT4_XATTR_TRANS_BLOCKS + \
-					2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+					EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
 
 /* Delete operations potentially hit one directory's namespace plus an
  * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be
@@ -92,6 +92,7 @@
  * but inode, sb and group updates are done only once */
 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
 		(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
+
 #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
 		(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
 #else
@@ -99,6 +100,9 @@
 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0
 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0
 #endif
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 
 int
 ext4_mark_iloc_dirty(handle_t *handle,
@@ -116,12 +120,8 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
 
 /*
- * Wrapper functions with which ext4 calls into JBD.  The intent here is
- * to allow these to be turned into appropriate stubs so ext4 can control
- * ext2 filesystems, so ext2+ext4 systems only nee one fs.  This work hasn't
- * been done yet.
+ * Wrapper functions with which ext4 calls into JBD.
  */
-
 void ext4_journal_abort_handle(const char *caller, const char *err_fn,
 		struct buffer_head *bh, handle_t *handle, int err);
 
@@ -131,13 +131,9 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
 int __ext4_journal_get_write_access(const char *where, handle_t *handle,
 				struct buffer_head *bh);
 
-/* When called with an invalid handle, this will still do a put on the BH */
-int __ext4_journal_forget(const char *where, handle_t *handle,
-				struct buffer_head *bh);
-
-/* When called with an invalid handle, this will still do a put on the BH */
-int __ext4_journal_revoke(const char *where, handle_t *handle,
-				ext4_fsblk_t blocknr, struct buffer_head *bh);
+int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
+		  struct inode *inode, struct buffer_head *bh,
+		  ext4_fsblk_t blocknr);
 
 int __ext4_journal_get_create_access(const char *where,
 				handle_t *handle, struct buffer_head *bh);
@@ -149,12 +145,11 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
 	__ext4_journal_get_undo_access(__func__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
 	__ext4_journal_get_write_access(__func__, (handle), (bh))
-#define ext4_journal_revoke(handle, blocknr, bh) \
-	__ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
+#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
+	__ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\
+		      (block_nr))
 #define ext4_journal_get_create_access(handle, bh) \
 	__ext4_journal_get_create_access(__func__, (handle), (bh))
-#define ext4_journal_forget(handle, bh) \
-	__ext4_journal_forget(__func__, (handle), (bh))
 #define ext4_handle_dirty_metadata(handle, inode, bh) \
 	__ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
 
@@ -254,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 	return 0;
 }
 
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+						 struct inode *inode,
+						 int datasync)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	if (ext4_handle_valid(handle)) {
+		ei->i_sync_tid = handle->h_transaction->t_tid;
+		if (datasync)
+			ei->i_datasync_tid = handle->h_transaction->t_tid;
+	}
+}
+
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 715264b4bae..3a7928f825e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1007,7 +1007,8 @@ cleanup:
 		for (i = 0; i < depth; i++) {
 			if (!ablocks[i])
 				continue;
-			ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
+			ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
+					 EXT4_FREE_BLOCKS_METADATA);
 		}
 	}
 	kfree(ablocks);
@@ -1761,7 +1762,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 	while (block < last && block != EXT_MAX_BLOCK) {
 		num = last - block;
 		/* find extent for this block */
+		down_read(&EXT4_I(inode)->i_data_sem);
 		path = ext4_ext_find_extent(inode, block, path);
+		up_read(&EXT4_I(inode)->i_data_sem);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			path = NULL;
@@ -1957,7 +1960,6 @@ errout:
 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path)
 {
-	struct buffer_head *bh;
 	int err;
 	ext4_fsblk_t leaf;
 
@@ -1973,9 +1975,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 	if (err)
 		return err;
 	ext_debug("index is empty, remove it, free block %llu\n", leaf);
-	bh = sb_find_get_block(inode->i_sb, leaf);
-	ext4_forget(handle, 1, inode, bh, leaf);
-	ext4_free_blocks(handle, inode, leaf, 1, 1);
+	ext4_free_blocks(handle, inode, 0, leaf, 1,
+			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
 	return err;
 }
 
@@ -2042,12 +2043,11 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 				struct ext4_extent *ex,
 				ext4_lblk_t from, ext4_lblk_t to)
 {
-	struct buffer_head *bh;
 	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
-	int i, metadata = 0;
+	int flags = EXT4_FREE_BLOCKS_FORGET;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		metadata = 1;
+		flags |= EXT4_FREE_BLOCKS_METADATA;
 #ifdef EXTENTS_STATS
 	{
 		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2072,11 +2072,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		start = ext_pblock(ex) + ee_len - num;
 		ext_debug("free last %u blocks starting %llu\n", num, start);
-		for (i = 0; i < num; i++) {
-			bh = sb_find_get_block(inode->i_sb, start + i);
-			ext4_forget(handle, 0, inode, bh, start + i);
-		}
-		ext4_free_blocks(handle, inode, start, num, metadata);
+		ext4_free_blocks(handle, inode, 0, start, num, flags);
 	} else if (from == le32_to_cpu(ex->ee_block)
 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2167,7 +2163,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 			correct_index = 1;
 			credits += (ext_depth(inode)) + 1;
 		}
-		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
 
 		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
 		if (err)
@@ -3064,6 +3060,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
 		ret = ext4_convert_unwritten_extents_dio(handle, inode,
 							path);
+		if (ret >= 0)
+			ext4_update_inode_fsync_trans(handle, inode, 1);
 		goto out2;
 	}
 	/* buffered IO case */
@@ -3091,6 +3089,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	ret = ext4_ext_convert_to_initialized(handle, inode,
 						path, iblock,
 						max_blocks);
+	if (ret >= 0)
+		ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
 	if (ret <= 0) {
 		err = ret;
@@ -3319,8 +3319,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		/* not a good idea to call discard here directly,
 		 * but otherwise we'd need to call it every free() */
 		ext4_discard_preallocations(inode);
-		ext4_free_blocks(handle, inode, ext_pblock(&newex),
-					ext4_ext_get_actual_len(&newex), 0);
+		ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
+				 ext4_ext_get_actual_len(&newex), 0);
 		goto out2;
 	}
 
@@ -3329,10 +3329,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	allocated = ext4_ext_get_actual_len(&newex);
 	set_buffer_new(bh_result);
 
-	/* Cache only when it is _not_ an uninitialized extent */
-	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+	/*
+	 * Cache the extent and update transaction to commit on fdatasync only
+	 * when it is _not_ an uninitialized extent.
+	 */
+	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
 		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
 						EXT4_EXT_CACHE_EXTENT);
+		ext4_update_inode_fsync_trans(handle, inode, 1);
+	} else
+		ext4_update_inode_fsync_trans(handle, inode, 0);
 out:
 	if (allocated > max_blocks)
 		allocated = max_blocks;
@@ -3720,10 +3726,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		 * Walk the extent tree gathering extent information.
 		 * ext4_ext_fiemap_cb will push extents back to user.
 		 */
-		down_read(&EXT4_I(inode)->i_data_sem);
 		error = ext4_ext_walk_space(inode, start_blk, len_blks,
 					  ext4_ext_fiemap_cb, fieinfo);
-		up_read(&EXT4_I(inode)->i_data_sem);
 	}
 
 	return error;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2b1531266ee..0b22497d92e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -51,25 +51,30 @@
 int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ext4_inode_info *ei = EXT4_I(inode);
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int err, ret = 0;
+	int ret;
+	tid_t commit_tid;
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
 	trace_ext4_sync_file(file, dentry, datasync);
 
+	if (inode->i_sb->s_flags & MS_RDONLY)
+		return 0;
+
 	ret = flush_aio_dio_completed_IO(inode);
 	if (ret < 0)
-		goto out;
+		return ret;
+	
+	if (!journal)
+		return simple_fsync(file, dentry, datasync);
+
 	/*
-	 * data=writeback:
+	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
-	 *  sync_inode() will sync the metadata
-	 *
-	 * data=ordered:
-	 *  The caller's filemap_fdatawrite() will write the data and
-	 *  sync_inode() will write the inode if it is dirty.  Then the caller's
-	 *  filemap_fdatawait() will wait on the pages.
+	 *  Metadata is in the journal, we wait for proper transaction to
+	 *  commit here.
 	 *
 	 * data=journal:
 	 *  filemap_fdatawrite won't do anything (the buffers are clean).
@@ -79,32 +84,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	 *  (they were dirtied by commit).  But that's OK - the blocks are
 	 *  safe in-journal, which is all fsync() needs to ensure.
 	 */
-	if (ext4_should_journal_data(inode)) {
-		ret = ext4_force_commit(inode->i_sb);
-		goto out;
-	}
+	if (ext4_should_journal_data(inode))
+		return ext4_force_commit(inode->i_sb);
 
-	if (!journal)
-		ret = sync_mapping_buffers(inode->i_mapping);
-
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		goto out;
-
-	/*
-	 * The VFS has written the file data.  If the inode is unaltered
-	 * then we need not start a commit.
-	 */
-	if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
-		struct writeback_control wbc = {
-			.sync_mode = WB_SYNC_ALL,
-			.nr_to_write = 0, /* sys_fsync did this */
-		};
-		err = sync_inode(inode, &wbc);
-		if (ret == 0)
-			ret = err;
-	}
-out:
-	if (journal && (journal->j_flags & JBD2_BARRIER))
+	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+	if (jbd2_log_start_commit(journal, commit_tid))
+		jbd2_log_wait_commit(journal, commit_tid);
+	else if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	return ret;
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c8caa51add..5352db1a308 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -71,58 +71,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
 }
 
 /*
- * The ext4 forget function must perform a revoke if we are freeing data
- * which has been journaled.  Metadata (eg. indirect blocks) must be
- * revoked in all cases.
- *
- * "bh" may be NULL: a metadata block may have been freed from memory
- * but there may still be a record of it in the journal, and that record
- * still needs to be revoked.
- *
- * If the handle isn't valid we're not journaling, but we still need to
- * call into ext4_journal_revoke() to put the buffer head.
- */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-		struct buffer_head *bh, ext4_fsblk_t blocknr)
-{
-	int err;
-
-	might_sleep();
-
-	BUFFER_TRACE(bh, "enter");
-
-	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-		  "data mode %x\n",
-		  bh, is_metadata, inode->i_mode,
-		  test_opt(inode->i_sb, DATA_FLAGS));
-
-	/* Never use the revoke function if we are doing full data
-	 * journaling: there is no need to, and a V1 superblock won't
-	 * support it.  Otherwise, only skip the revoke on un-journaled
-	 * data blocks. */
-
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-	    (!is_metadata && !ext4_should_journal_data(inode))) {
-		if (bh) {
-			BUFFER_TRACE(bh, "call jbd2_journal_forget");
-			return ext4_journal_forget(handle, bh);
-		}
-		return 0;
-	}
-
-	/*
-	 * data!=journal && (is_metadata || should_journal_data(inode))
-	 */
-	BUFFER_TRACE(bh, "call ext4_journal_revoke");
-	err = ext4_journal_revoke(handle, blocknr, bh);
-	if (err)
-		ext4_abort(inode->i_sb, __func__,
-			   "error %d when attempting revoke", err);
-	BUFFER_TRACE(bh, "exit");
-	return err;
-}
-
-/*
  * Work out how many blocks we need to proceed with the next chunk of a
  * truncate transaction.
  */
@@ -721,7 +669,7 @@ allocated:
 	return ret;
 failed_out:
 	for (i = 0; i < index; i++)
-		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
 	return ret;
 }
 
@@ -817,14 +765,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 	return err;
 failed:
 	/* Allocation failed, free what we already allocated */
+	ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
 	for (i = 1; i <= n ; i++) {
-		BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
-		ext4_journal_forget(handle, branch[i].bh);
+		/* 
+		 * branch[i].bh is newly allocated, so there is no
+		 * need to revoke the block, which is why we don't
+		 * need to set EXT4_FREE_BLOCKS_METADATA.
+		 */
+		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+				 EXT4_FREE_BLOCKS_FORGET);
 	}
-	for (i = 0; i < indirect_blks; i++)
-		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+	for (i = n+1; i < indirect_blks; i++)
+		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
 
-	ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
+	ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
 
 	return err;
 }
@@ -903,12 +857,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 
 err_out:
 	for (i = 1; i <= num; i++) {
-		BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
-		ext4_journal_forget(handle, where[i].bh);
-		ext4_free_blocks(handle, inode,
-					le32_to_cpu(where[i-1].key), 1, 0);
+		/* 
+		 * branch[i].bh is newly allocated, so there is no
+		 * need to revoke the block, which is why we don't
+		 * need to set EXT4_FREE_BLOCKS_METADATA.
+		 */
+		ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+				 EXT4_FREE_BLOCKS_FORGET);
 	}
-	ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
+	ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+			 blks, 0);
 
 	return err;
 }
@@ -1021,10 +979,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	if (!err)
 		err = ext4_splice_branch(handle, inode, iblock,
 					 partial, indirect_blks, count);
-	else
+	if (err)
 		goto cleanup;
 
 	set_buffer_new(bh_result);
+
+	ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
 	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
 	if (count > blocks_to_boundary)
@@ -1052,7 +1012,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode)
 		EXT4_I(inode)->i_reserved_meta_blocks;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
-	return total;
+	return (total << inode->i_blkbits);
 }
 /*
  * Calculate the number of metadata blocks need to reserve
@@ -1534,6 +1494,16 @@ static int do_journal_get_write_access(handle_t *handle,
 	return ext4_journal_get_write_access(handle, bh);
 }
 
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext4_truncate_failed_write(struct inode *inode)
+{
+	truncate_inode_pages(inode->i_mapping, inode->i_size);
+	ext4_truncate(inode);
+}
+
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
 			    loff_t pos, unsigned len, unsigned flags,
 			    struct page **pagep, void **fsdata)
@@ -1599,7 +1569,7 @@ retry:
 
 		ext4_journal_stop(handle);
 		if (pos + len > inode->i_size) {
-			ext4_truncate(inode);
+			ext4_truncate_failed_write(inode);
 			/*
 			 * If truncate failed early the inode might
 			 * still be on the orphan list; we need to
@@ -1709,7 +1679,7 @@ static int ext4_ordered_write_end(struct file *file,
 		ret = ret2;
 
 	if (pos + len > inode->i_size) {
-		ext4_truncate(inode);
+		ext4_truncate_failed_write(inode);
 		/*
 		 * If truncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
@@ -1751,7 +1721,7 @@ static int ext4_writeback_write_end(struct file *file,
 		ret = ret2;
 
 	if (pos + len > inode->i_size) {
-		ext4_truncate(inode);
+		ext4_truncate_failed_write(inode);
 		/*
 		 * If truncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
@@ -1814,7 +1784,7 @@ static int ext4_journalled_write_end(struct file *file,
 	if (!ret)
 		ret = ret2;
 	if (pos + len > inode->i_size) {
-		ext4_truncate(inode);
+		ext4_truncate_failed_write(inode);
 		/*
 		 * If truncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
@@ -2600,7 +2570,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 }
 
 static int __ext4_journalled_writepage(struct page *page,
-				       struct writeback_control *wbc,
 				       unsigned int len)
 {
 	struct address_space *mapping = page->mapping;
@@ -2758,7 +2727,7 @@ static int ext4_writepage(struct page *page,
 		 * doesn't seem much point in redirtying the page here.
 		 */
 		ClearPageChecked(page);
-		return __ext4_journalled_writepage(page, wbc, len);
+		return __ext4_journalled_writepage(page, len);
 	}
 
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@ -2788,7 +2757,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 	 * number of contiguous block. So we will limit
 	 * number of contiguous block to a sane value
 	 */
-	if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
 	    (max_blocks > EXT4_MAX_TRANS_DATA))
 		max_blocks = EXT4_MAX_TRANS_DATA;
 
@@ -2933,7 +2902,7 @@ retry:
 		ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
 					&mpd);
 		/*
-		 * If we have a contigous extent of pages and we
+		 * If we have a contiguous extent of pages and we
 		 * haven't done the I/O yet, map the blocks and submit
 		 * them for I/O.
 		 */
@@ -3091,7 +3060,7 @@ retry:
 		 * i_size_read because we hold i_mutex.
 		 */
 		if (pos + len > inode->i_size)
-			ext4_truncate(inode);
+			ext4_truncate_failed_write(inode);
 	}
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -4064,7 +4033,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
 	int k, err;
 
 	*top = 0;
-	/* Make k index the deepest non-null offest + 1 */
+	/* Make k index the deepest non-null offset + 1 */
 	for (k = depth; k > 1 && !offsets[k-1]; k--)
 		;
 	partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@ -4120,6 +4089,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 			      __le32 *last)
 {
 	__le32 *p;
+	int	flags = EXT4_FREE_BLOCKS_FORGET;
+
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		flags |= EXT4_FREE_BLOCKS_METADATA;
+
 	if (try_to_extend_transaction(handle, inode)) {
 		if (bh) {
 			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4134,27 +4108,10 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 		}
 	}
 
-	/*
-	 * Any buffers which are on the journal will be in memory. We
-	 * find them on the hash table so jbd2_journal_revoke() will
-	 * run jbd2_journal_forget() on them.  We've already detached
-	 * each block from the file, so bforget() in
-	 * jbd2_journal_forget() should be safe.
-	 *
-	 * AKPM: turn on bforget in jbd2_journal_forget()!!!
-	 */
-	for (p = first; p < last; p++) {
-		u32 nr = le32_to_cpu(*p);
-		if (nr) {
-			struct buffer_head *tbh;
-
-			*p = 0;
-			tbh = sb_find_get_block(inode->i_sb, nr);
-			ext4_forget(handle, 0, inode, tbh, nr);
-		}
-	}
+	for (p = first; p < last; p++)
+		*p = 0;
 
-	ext4_free_blocks(handle, inode, block_to_free, count, 0);
+	ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
 }
 
 /**
@@ -4342,7 +4299,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 					    blocks_for_truncate(inode));
 			}
 
-			ext4_free_blocks(handle, inode, nr, 1, 1);
+			ext4_free_blocks(handle, inode, 0, nr, 1,
+					 EXT4_FREE_BLOCKS_METADATA);
 
 			if (parent_bh) {
 				/*
@@ -4781,8 +4739,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	struct ext4_iloc iloc;
 	struct ext4_inode *raw_inode;
 	struct ext4_inode_info *ei;
-	struct buffer_head *bh;
 	struct inode *inode;
+	journal_t *journal = EXT4_SB(sb)->s_journal;
 	long ret;
 	int block;
 
@@ -4793,11 +4751,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		return inode;
 
 	ei = EXT4_I(inode);
+	iloc.bh = 0;
 
 	ret = __ext4_get_inode_loc(inode, &iloc, 0);
 	if (ret < 0)
 		goto bad_inode;
-	bh = iloc.bh;
 	raw_inode = ext4_raw_inode(&iloc);
 	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
 	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4820,7 +4778,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		if (inode->i_mode == 0 ||
 		    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
 			/* this inode is deleted */
-			brelse(bh);
 			ret = -ESTALE;
 			goto bad_inode;
 		}
@@ -4848,11 +4805,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		ei->i_data[block] = raw_inode->i_block[block];
 	INIT_LIST_HEAD(&ei->i_orphan);
 
+	/*
+	 * Set transaction id's of transactions that have to be committed
+	 * to finish f[data]sync. We set them to currently running transaction
+	 * as we cannot be sure that the inode or some of its metadata isn't
+	 * part of the transaction - the inode could have been reclaimed and
+	 * now it is reread from disk.
+	 */
+	if (journal) {
+		transaction_t *transaction;
+		tid_t tid;
+
+		spin_lock(&journal->j_state_lock);
+		if (journal->j_running_transaction)
+			transaction = journal->j_running_transaction;
+		else
+			transaction = journal->j_committing_transaction;
+		if (transaction)
+			tid = transaction->t_tid;
+		else
+			tid = journal->j_commit_sequence;
+		spin_unlock(&journal->j_state_lock);
+		ei->i_sync_tid = tid;
+		ei->i_datasync_tid = tid;
+	}
+
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
 		    EXT4_INODE_SIZE(inode->i_sb)) {
-			brelse(bh);
 			ret = -EIO;
 			goto bad_inode;
 		}
@@ -4884,10 +4865,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 
 	ret = 0;
 	if (ei->i_file_acl &&
-	    ((ei->i_file_acl <
-	      (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
-	       EXT4_SB(sb)->s_gdb_count)) ||
-	     (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
+	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
 		ext4_error(sb, __func__,
 			   "bad extended attribute block %llu in inode #%lu",
 			   ei->i_file_acl, inode->i_ino);
@@ -4905,10 +4883,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		/* Validate block references which are part of inode */
 		ret = ext4_check_inode_blockref(inode);
 	}
-	if (ret) {
-		brelse(bh);
+	if (ret)
 		goto bad_inode;
-	}
 
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext4_file_inode_operations;
@@ -4936,7 +4912,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			init_special_inode(inode, inode->i_mode,
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
 	} else {
-		brelse(bh);
 		ret = -EIO;
 		ext4_error(inode->i_sb, __func__,
 			   "bogus i_mode (%o) for inode=%lu",
@@ -4949,6 +4924,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	return inode;
 
 bad_inode:
+	brelse(iloc.bh);
 	iget_failed(inode);
 	return ERR_PTR(ret);
 }
@@ -5108,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle,
 		err = rc;
 	ei->i_state &= ~EXT4_STATE_NEW;
 
+	ext4_update_inode_fsync_trans(handle, inode, 0);
 out_brelse:
 	brelse(bh);
 	ext4_std_error(inode->i_sb, err);
@@ -5227,8 +5204,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 
 		/* (user+group)*(old+new) structure, inode write (sb,
 		 * inode block, ? - but truncate inode update has it) */
-		handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
-					EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+		handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+					EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
@@ -5376,7 +5353,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  * worse case, the indexs blocks spread over different block groups
  *
  * If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiugous, with flexbg,
+ * different block groups too. If they are contiuguous, with flexbg,
  * they could still across block group boundary.
  *
  * Also account for superblock, inode, quota and xattr blocks
@@ -5452,7 +5429,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
  * Calculate the journal credits for a chunk of data modification.
  *
  * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
+ * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
  *
  * journal buffers for data blocks are not included here, as DIO
  * and fallocate do no need to journal data buffers.
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c1cdf613e72..b63d193126d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -221,31 +221,38 @@ setversion_out:
 		struct file *donor_filp;
 		int err;
 
+		if (!(filp->f_mode & FMODE_READ) ||
+		    !(filp->f_mode & FMODE_WRITE))
+			return -EBADF;
+
 		if (copy_from_user(&me,
 			(struct move_extent __user *)arg, sizeof(me)))
 			return -EFAULT;
+		me.moved_len = 0;
 
 		donor_filp = fget(me.donor_fd);
 		if (!donor_filp)
 			return -EBADF;
 
-		if (!capable(CAP_DAC_OVERRIDE)) {
-			if ((current->real_cred->fsuid != inode->i_uid) ||
-				!(inode->i_mode & S_IRUSR) ||
-				!(donor_filp->f_dentry->d_inode->i_mode &
-				S_IRUSR)) {
-				fput(donor_filp);
-				return -EACCES;
-			}
+		if (!(donor_filp->f_mode & FMODE_WRITE)) {
+			err = -EBADF;
+			goto mext_out;
 		}
 
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			goto mext_out;
+
 		err = ext4_move_extents(filp, donor_filp, me.orig_start,
 					me.donor_start, me.len, &me.moved_len);
-		fput(donor_filp);
+		mnt_drop_write(filp->f_path.mnt);
+		if (me.moved_len > 0)
+			file_remove_suid(donor_filp);
 
 		if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
-			return -EFAULT;
-
+			err = -EFAULT;
+mext_out:
+		fput(donor_filp);
 		return err;
 	}
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bba12824def..b1fd3daadc9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -142,7 +142,7 @@
  * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
  * value of s_mb_order2_reqs can be tuned via
  * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
- * stripe size (sbi->s_stripe), we try to search for contigous block in
+ * stripe size (sbi->s_stripe), we try to search for contiguous block in
  * stripe size. This should result in better allocation on RAID setups. If
  * not, we search in the specific group using bitmap for best extents. The
  * tunable min_to_scan and max_to_scan control the behaviour here.
@@ -2529,7 +2529,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 	struct ext4_group_info *db;
 	int err, count = 0, count2 = 0;
 	struct ext4_free_data *entry;
-	ext4_fsblk_t discard_block;
 	struct list_head *l, *ltmp;
 
 	list_for_each_safe(l, ltmp, &txn->t_private_list) {
@@ -2559,13 +2558,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 			page_cache_release(e4b.bd_bitmap_page);
 		}
 		ext4_unlock_group(sb, entry->group);
-		discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
-			+ entry->start_blk
-			+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-		trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
-					  entry->count);
-		sb_issue_discard(sb, discard_block, entry->count);
-
+		if (test_opt(sb, DISCARD)) {
+			ext4_fsblk_t discard_block;
+			struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+			discard_block = (ext4_fsblk_t)entry->group *
+						EXT4_BLOCKS_PER_GROUP(sb)
+					+ entry->start_blk
+					+ le32_to_cpu(es->s_first_data_block);
+			trace_ext4_discard_blocks(sb,
+					(unsigned long long)discard_block,
+					entry->count);
+			sb_issue_discard(sb, discard_block, entry->count);
+		}
 		kmem_cache_free(ext4_free_ext_cachep, entry);
 		ext4_mb_release_desc(&e4b);
 	}
@@ -3006,6 +3011,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 }
 
 /*
+ * Called on failure; free up any blocks from the inode PA for this
+ * context.  We don't need this for MB_GROUP_PA because we only change
+ * pa_free in ext4_mb_release_context(), but on failure, we've already
+ * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
+ */
+static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
+{
+	struct ext4_prealloc_space *pa = ac->ac_pa;
+	int len;
+
+	if (pa && pa->pa_type == MB_INODE_PA) {
+		len = ac->ac_b_ex.fe_len;
+		pa->pa_free += len;
+	}
+
+}
+
+/*
  * use blocks preallocated to inode
  */
 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
@@ -3932,7 +3955,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 	 * per cpu locality group is to reduce the contention between block
 	 * request from multiple CPUs.
 	 */
-	ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
+	ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
 
 	/* we're going to use group allocation */
 	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -4290,6 +4313,7 @@ repeat:
 			ac->ac_status = AC_STATUS_CONTINUE;
 			goto repeat;
 		} else if (*errp) {
+			ext4_discard_allocated_blocks(ac);
 			ac->ac_b_ex.fe_len = 0;
 			ar->len = 0;
 			ext4_mb_show_ac(ac);
@@ -4422,18 +4446,24 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	return 0;
 }
 
-/*
- * Main entry point into mballoc to free blocks
+/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle:		handle for this transaction
+ * @inode:		inode
+ * @block:		start physical block to free
+ * @count:		number of blocks to count
+ * @metadata: 		Are these metadata blocks
  */
-void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t block, unsigned long count,
-			int metadata, unsigned long *freed)
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+		      struct buffer_head *bh, ext4_fsblk_t block,
+		      unsigned long count, int flags)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct super_block *sb = inode->i_sb;
 	struct ext4_allocation_context *ac = NULL;
 	struct ext4_group_desc *gdp;
 	struct ext4_super_block *es;
+	unsigned long freed = 0;
 	unsigned int overflow;
 	ext4_grpblk_t bit;
 	struct buffer_head *gd_bh;
@@ -4443,13 +4473,16 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 	int err = 0;
 	int ret;
 
-	*freed = 0;
+	if (bh) {
+		if (block)
+			BUG_ON(block != bh->b_blocknr);
+		else
+			block = bh->b_blocknr;
+	}
 
 	sbi = EXT4_SB(sb);
 	es = EXT4_SB(sb)->s_es;
-	if (block < le32_to_cpu(es->s_first_data_block) ||
-	    block + count < block ||
-	    block + count > ext4_blocks_count(es)) {
+	if (!ext4_data_block_valid(sbi, block, count)) {
 		ext4_error(sb, __func__,
 			    "Freeing blocks not in datazone - "
 			    "block = %llu, count = %lu", block, count);
@@ -4457,7 +4490,32 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	ext4_debug("freeing block %llu\n", block);
-	trace_ext4_free_blocks(inode, block, count, metadata);
+	trace_ext4_free_blocks(inode, block, count, flags);
+
+	if (flags & EXT4_FREE_BLOCKS_FORGET) {
+		struct buffer_head *tbh = bh;
+		int i;
+
+		BUG_ON(bh && (count > 1));
+
+		for (i = 0; i < count; i++) {
+			if (!bh)
+				tbh = sb_find_get_block(inode->i_sb,
+							block + i);
+			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 
+				    inode, tbh, block + i);
+		}
+	}
+
+	/* 
+	 * We need to make sure we don't reuse the freed block until
+	 * after the transaction is committed, which we can do by
+	 * treating the block as metadata, below.  We make an
+	 * exception if the inode is to be written in writeback mode
+	 * since writeback mode has weak data consistency guarantees.
+	 */
+	if (!ext4_should_writeback_data(inode))
+		flags |= EXT4_FREE_BLOCKS_METADATA;
 
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
 	if (ac) {
@@ -4533,7 +4591,8 @@ do_more:
 	err = ext4_mb_load_buddy(sb, block_group, &e4b);
 	if (err)
 		goto error_return;
-	if (metadata && ext4_handle_valid(handle)) {
+
+	if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
 		struct ext4_free_data *new_entry;
 		/*
 		 * blocks being freed are metadata. these blocks shouldn't
@@ -4572,7 +4631,7 @@ do_more:
 
 	ext4_mb_release_desc(&e4b);
 
-	*freed += count;
+	freed += count;
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -4592,6 +4651,8 @@ do_more:
 	}
 	sb->s_dirt = 1;
 error_return:
+	if (freed)
+		vfs_dq_free_block(inode, freed);
 	brelse(bitmap_bh);
 	ext4_std_error(sb, err);
 	if (ac)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a93d5b80f3e..81415814b00 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
 	 * So allocate a credit of 3. We may update
 	 * quota (user and group).
 	 */
-	needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+	needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
 
 	if (ext4_journal_extend(handle, needed) != 0)
 		retval = ext4_journal_restart(handle, needed);
@@ -262,13 +262,17 @@ static int free_dind_blocks(handle_t *handle,
 	for (i = 0; i < max_entries; i++) {
 		if (tmp_idata[i]) {
 			extend_credit_for_blkdel(handle, inode);
-			ext4_free_blocks(handle, inode,
-					le32_to_cpu(tmp_idata[i]), 1, 1);
+			ext4_free_blocks(handle, inode, 0,
+					 le32_to_cpu(tmp_idata[i]), 1,
+					 EXT4_FREE_BLOCKS_METADATA |
+					 EXT4_FREE_BLOCKS_FORGET);
 		}
 	}
 	put_bh(bh);
 	extend_credit_for_blkdel(handle, inode);
-	ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+	ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+			 EXT4_FREE_BLOCKS_METADATA |
+			 EXT4_FREE_BLOCKS_FORGET);
 	return 0;
 }
 
@@ -297,7 +301,9 @@ static int free_tind_blocks(handle_t *handle,
 	}
 	put_bh(bh);
 	extend_credit_for_blkdel(handle, inode);
-	ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+	ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+			 EXT4_FREE_BLOCKS_METADATA |
+			 EXT4_FREE_BLOCKS_FORGET);
 	return 0;
 }
 
@@ -308,8 +314,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
 	/* ei->i_data[EXT4_IND_BLOCK] */
 	if (i_data[0]) {
 		extend_credit_for_blkdel(handle, inode);
-		ext4_free_blocks(handle, inode,
-				le32_to_cpu(i_data[0]), 1, 1);
+		ext4_free_blocks(handle, inode, 0,
+				le32_to_cpu(i_data[0]), 1,
+				 EXT4_FREE_BLOCKS_METADATA |
+				 EXT4_FREE_BLOCKS_FORGET);
 	}
 
 	/* ei->i_data[EXT4_DIND_BLOCK] */
@@ -419,7 +427,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
 	}
 	put_bh(bh);
 	extend_credit_for_blkdel(handle, inode);
-	ext4_free_blocks(handle, inode, block, 1, 1);
+	ext4_free_blocks(handle, inode, 0, block, 1,
+			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
 	return retval;
 }
 
@@ -477,7 +486,7 @@ int ext4_ext_migrate(struct inode *inode)
 	handle = ext4_journal_start(inode,
 					EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
+					EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
 					+ 1);
 	if (IS_ERR(handle)) {
 		retval = PTR_ERR(handle);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 25b6b145736..82c415be87a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -77,12 +77,14 @@ static int
 mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 		      struct ext4_extent **extent)
 {
+	struct ext4_extent_header *eh;
 	int ppos, leaf_ppos = path->p_depth;
 
 	ppos = leaf_ppos;
 	if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
 		/* leaf block */
 		*extent = ++path[ppos].p_ext;
+		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
 		return 0;
 	}
 
@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 					ext_block_hdr(path[cur_ppos+1].p_bh);
 			}
 
+			path[leaf_ppos].p_ext = *extent = NULL;
+
+			eh = path[leaf_ppos].p_hdr;
+			if (le16_to_cpu(eh->eh_entries) == 0)
+				/* empty leaf is found */
+				return -ENODATA;
+
 			/* leaf block */
 			path[leaf_ppos].p_ext = *extent =
 				EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+			path[leaf_ppos].p_block =
+					ext_pblock(path[leaf_ppos].p_ext);
 			return 0;
 		}
 	}
@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
 }
 
 /**
- * mext_double_down_read - Acquire two inodes' read semaphore
- *
- * @orig_inode:		original inode structure
- * @donor_inode:	donor inode structure
- * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
- */
-static void
-mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
-{
-	struct inode *first = orig_inode, *second = donor_inode;
-
-	/*
-	 * Use the inode number to provide the stable locking order instead
-	 * of its address, because the C language doesn't guarantee you can
-	 * compare pointers that don't come from the same array.
-	 */
-	if (donor_inode->i_ino < orig_inode->i_ino) {
-		first = donor_inode;
-		second = orig_inode;
-	}
-
-	down_read(&EXT4_I(first)->i_data_sem);
-	down_read(&EXT4_I(second)->i_data_sem);
-}
-
-/**
- * mext_double_down_write - Acquire two inodes' write semaphore
+ * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
  *
  * @orig_inode:		original inode structure
  * @donor_inode:	donor inode structure
- * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
+ * i_ino order.
  */
 static void
-mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
 {
 	struct inode *first = orig_inode, *second = donor_inode;
 
@@ -203,32 +189,18 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
 	}
 
 	down_write(&EXT4_I(first)->i_data_sem);
-	down_write(&EXT4_I(second)->i_data_sem);
+	down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
 }
 
 /**
- * mext_double_up_read - Release two inodes' read semaphore
+ * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
  *
  * @orig_inode:		original inode structure to be released its lock first
  * @donor_inode:	donor inode structure to be released its lock second
- * Release read semaphore of two inodes (orig and donor).
+ * Release write lock of i_data_sem of two inodes (orig and donor).
  */
 static void
-mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
-{
-	up_read(&EXT4_I(orig_inode)->i_data_sem);
-	up_read(&EXT4_I(donor_inode)->i_data_sem);
-}
-
-/**
- * mext_double_up_write - Release two inodes' write semaphore
- *
- * @orig_inode:		original inode structure to be released its lock first
- * @donor_inode:	donor inode structure to be released its lock second
- * Release write semaphore of two inodes (orig and donor).
- */
-static void
-mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
 {
 	up_write(&EXT4_I(orig_inode)->i_data_sem);
 	up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -596,7 +568,7 @@ out:
  * @tmp_oext:		the extent that will belong to the donor inode
  * @orig_off:		block offset of original inode
  * @donor_off:		block offset of donor inode
- * @max_count:		the maximun length of extents
+ * @max_count:		the maximum length of extents
  *
  * Return 0 on success, or a negative error value on failure.
  */
@@ -661,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
  * @donor_inode:	donor inode
  * @from:		block offset of orig_inode
  * @count:		block count to be replaced
+ * @err:		pointer to save return value
  *
  * Replace original inode extents and donor inode extents page by page.
  * We implement this replacement in the following three steps:
@@ -671,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
  * 3. Change the block information of donor inode to point at the saved
  *    original inode blocks in the dummy extents.
  *
- * Return 0 on success, or a negative error value on failure.
+ * Return replaced block count.
  */
 static int
 mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 			   struct inode *donor_inode, ext4_lblk_t from,
-			   ext4_lblk_t count)
+			   ext4_lblk_t count, int *err)
 {
 	struct ext4_ext_path *orig_path = NULL;
 	struct ext4_ext_path *donor_path = NULL;
 	struct ext4_extent *oext, *dext;
 	struct ext4_extent tmp_dext, tmp_oext;
 	ext4_lblk_t orig_off = from, donor_off = from;
-	int err = 0;
 	int depth;
 	int replaced_count = 0;
 	int dext_alen;
 
-	mext_double_down_write(orig_inode, donor_inode);
+	/* Protect extent trees against block allocations via delalloc */
+	double_down_write_data_sem(orig_inode, donor_inode);
 
 	/* Get the original extent for the block "orig_off" */
-	err = get_ext_path(orig_inode, orig_off, &orig_path);
-	if (err)
+	*err = get_ext_path(orig_inode, orig_off, &orig_path);
+	if (*err)
 		goto out;
 
 	/* Get the donor extent for the head */
-	err = get_ext_path(donor_inode, donor_off, &donor_path);
-	if (err)
+	*err = get_ext_path(donor_inode, donor_off, &donor_path);
+	if (*err)
 		goto out;
 	depth = ext_depth(orig_inode);
 	oext = orig_path[depth].p_ext;
@@ -707,9 +680,9 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	dext = donor_path[depth].p_ext;
 	tmp_dext = *dext;
 
-	err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+	*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
 				      donor_off, count);
-	if (err)
+	if (*err)
 		goto out;
 
 	/* Loop for the donor extents */
@@ -718,7 +691,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 		if (!dext) {
 			ext4_error(donor_inode->i_sb, __func__,
 				   "The extent for donor must be found");
-			err = -EIO;
+			*err = -EIO;
 			goto out;
 		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
 			ext4_error(donor_inode->i_sb, __func__,
@@ -726,20 +699,20 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 				"extent(%u) should be equal",
 				donor_off,
 				le32_to_cpu(tmp_dext.ee_block));
-			err = -EIO;
+			*err = -EIO;
 			goto out;
 		}
 
 		/* Set donor extent to orig extent */
-		err = mext_leaf_block(handle, orig_inode,
+		*err = mext_leaf_block(handle, orig_inode,
 					   orig_path, &tmp_dext, &orig_off);
-		if (err < 0)
+		if (*err)
 			goto out;
 
 		/* Set orig extent to donor extent */
-		err = mext_leaf_block(handle, donor_inode,
+		*err = mext_leaf_block(handle, donor_inode,
 					   donor_path, &tmp_oext, &donor_off);
-		if (err < 0)
+		if (*err)
 			goto out;
 
 		dext_alen = ext4_ext_get_actual_len(&tmp_dext);
@@ -753,35 +726,25 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
-		err = get_ext_path(orig_inode, orig_off, &orig_path);
-		if (err)
+		*err = get_ext_path(orig_inode, orig_off, &orig_path);
+		if (*err)
 			goto out;
 		depth = ext_depth(orig_inode);
 		oext = orig_path[depth].p_ext;
-		if (le32_to_cpu(oext->ee_block) +
-				ext4_ext_get_actual_len(oext) <= orig_off) {
-			err = 0;
-			goto out;
-		}
 		tmp_oext = *oext;
 
 		if (donor_path)
 			ext4_ext_drop_refs(donor_path);
-		err = get_ext_path(donor_inode, donor_off, &donor_path);
-		if (err)
+		*err = get_ext_path(donor_inode, donor_off, &donor_path);
+		if (*err)
 			goto out;
 		depth = ext_depth(donor_inode);
 		dext = donor_path[depth].p_ext;
-		if (le32_to_cpu(dext->ee_block) +
-				ext4_ext_get_actual_len(dext) <= donor_off) {
-			err = 0;
-			goto out;
-		}
 		tmp_dext = *dext;
 
-		err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+		*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
 					   donor_off, count - replaced_count);
-		if (err)
+		if (*err)
 			goto out;
 	}
 
@@ -795,8 +758,12 @@ out:
 		kfree(donor_path);
 	}
 
-	mext_double_up_write(orig_inode, donor_inode);
-	return err;
+	ext4_ext_invalidate_cache(orig_inode);
+	ext4_ext_invalidate_cache(donor_inode);
+
+	double_up_write_data_sem(orig_inode, donor_inode);
+
+	return replaced_count;
 }
 
 /**
@@ -808,16 +775,17 @@ out:
  * @data_offset_in_page:	block index where data swapping starts
  * @block_len_in_page:		the number of blocks to be swapped
  * @uninit:			orig extent is uninitialized or not
+ * @err:			pointer to save return value
  *
  * Save the data in original inode blocks and replace original inode extents
  * with donor inode extents by calling mext_replace_branches().
- * Finally, write out the saved data in new original inode blocks. Return 0
- * on success, or a negative error value on failure.
+ * Finally, write out the saved data in new original inode blocks. Return
+ * replaced block count.
  */
 static int
 move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 		  pgoff_t orig_page_offset, int data_offset_in_page,
-		  int block_len_in_page, int uninit)
+		  int block_len_in_page, int uninit, int *err)
 {
 	struct inode *orig_inode = o_filp->f_dentry->d_inode;
 	struct address_space *mapping = orig_inode->i_mapping;
@@ -829,9 +797,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 	long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
 	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
 	unsigned int w_flags = 0;
-	unsigned int tmp_data_len, data_len;
+	unsigned int tmp_data_size, data_size, replaced_size;
 	void *fsdata;
-	int ret, i, jblocks;
+	int i, jblocks;
+	int err2 = 0;
+	int replaced_count = 0;
 	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
 
 	/*
@@ -841,8 +811,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
 	handle = ext4_journal_start(orig_inode, jblocks);
 	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		return ret;
+		*err = PTR_ERR(handle);
+		return 0;
 	}
 
 	if (segment_eq(get_fs(), KERNEL_DS))
@@ -858,39 +828,36 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 	 * Just swap data blocks between orig and donor.
 	 */
 	if (uninit) {
-		ret = mext_replace_branches(handle, orig_inode,
-						 donor_inode, orig_blk_offset,
-						 block_len_in_page);
-
-		/* Clear the inode cache not to refer to the old data */
-		ext4_ext_invalidate_cache(orig_inode);
-		ext4_ext_invalidate_cache(donor_inode);
+		replaced_count = mext_replace_branches(handle, orig_inode,
+						donor_inode, orig_blk_offset,
+						block_len_in_page, err);
 		goto out2;
 	}
 
 	offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
 
-	/* Calculate data_len */
+	/* Calculate data_size */
 	if ((orig_blk_offset + block_len_in_page - 1) ==
 	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
 		/* Replace the last block */
-		tmp_data_len = orig_inode->i_size & (blocksize - 1);
+		tmp_data_size = orig_inode->i_size & (blocksize - 1);
 		/*
-		 * If data_len equal zero, it shows data_len is multiples of
+		 * If data_size equal zero, it shows data_size is multiples of
 		 * blocksize. So we set appropriate value.
 		 */
-		if (tmp_data_len == 0)
-			tmp_data_len = blocksize;
+		if (tmp_data_size == 0)
+			tmp_data_size = blocksize;
 
-		data_len = tmp_data_len +
+		data_size = tmp_data_size +
 			((block_len_in_page - 1) << orig_inode->i_blkbits);
-	} else {
-		data_len = block_len_in_page << orig_inode->i_blkbits;
-	}
+	} else
+		data_size = block_len_in_page << orig_inode->i_blkbits;
+
+	replaced_size = data_size;
 
-	ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
+	*err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
 				 &page, &fsdata);
-	if (unlikely(ret < 0))
+	if (unlikely(*err < 0))
 		goto out;
 
 	if (!PageUptodate(page)) {
@@ -911,14 +878,17 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 	/* Release old bh and drop refs */
 	try_to_release_page(page, 0);
 
-	ret = mext_replace_branches(handle, orig_inode, donor_inode,
-					 orig_blk_offset, block_len_in_page);
-	if (ret < 0)
-		goto out;
-
-	/* Clear the inode cache not to refer to the old data */
-	ext4_ext_invalidate_cache(orig_inode);
-	ext4_ext_invalidate_cache(donor_inode);
+	replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
+					orig_blk_offset, block_len_in_page,
+					&err2);
+	if (err2) {
+		if (replaced_count) {
+			block_len_in_page = replaced_count;
+			replaced_size =
+				block_len_in_page << orig_inode->i_blkbits;
+		} else
+			goto out;
+	}
 
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
@@ -928,16 +898,16 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 		bh = bh->b_this_page;
 
 	for (i = 0; i < block_len_in_page; i++) {
-		ret = ext4_get_block(orig_inode,
+		*err = ext4_get_block(orig_inode,
 				(sector_t)(orig_blk_offset + i), bh, 0);
-		if (ret < 0)
+		if (*err < 0)
 			goto out;
 
 		if (bh->b_this_page != NULL)
 			bh = bh->b_this_page;
 	}
 
-	ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
+	*err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
 			       page, fsdata);
 	page = NULL;
 
@@ -951,7 +921,10 @@ out:
 out2:
 	ext4_journal_stop(handle);
 
-	return ret < 0 ? ret : 0;
+	if (err2)
+		*err = err2;
+
+	return replaced_count;
 }
 
 /**
@@ -962,7 +935,6 @@ out2:
  * @orig_start:		logical start offset in block for orig
  * @donor_start:	logical start offset in block for donor
  * @len:		the number of blocks to be moved
- * @moved_len:		moved block length
  *
  * Check the arguments of ext4_move_extents() whether the files can be
  * exchanged with each other.
@@ -970,8 +942,8 @@ out2:
  */
 static int
 mext_check_arguments(struct inode *orig_inode,
-			  struct inode *donor_inode, __u64 orig_start,
-			  __u64 donor_start, __u64 *len, __u64 moved_len)
+		     struct inode *donor_inode, __u64 orig_start,
+		     __u64 donor_start, __u64 *len)
 {
 	ext4_lblk_t orig_blocks, donor_blocks;
 	unsigned int blkbits = orig_inode->i_blkbits;
@@ -985,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode,
 		return -EINVAL;
 	}
 
+	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+		ext4_debug("ext4 move extent: suid or sgid is set"
+			   " to donor file [ino:orig %lu, donor %lu]\n",
+			   orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
 	/* Ext4 move extent does not support swapfile */
 	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
 		ext4_debug("ext4 move extent: The argument files should "
@@ -1025,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode,
 		return -EINVAL;
 	}
 
-	if (moved_len) {
-		ext4_debug("ext4 move extent: moved_len should be 0 "
-			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
-			donor_inode->i_ino);
-		return -EINVAL;
-	}
-
 	if ((orig_start > EXT_MAX_BLOCK) ||
 	    (donor_start > EXT_MAX_BLOCK) ||
 	    (*len > EXT_MAX_BLOCK) ||
@@ -1088,7 +1060,7 @@ mext_check_arguments(struct inode *orig_inode,
 	}
 
 	if (!*len) {
-		ext4_debug("ext4 move extent: len shoudld not be 0 "
+		ext4_debug("ext4 move extent: len should not be 0 "
 			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
 			donor_inode->i_ino);
 		return -EINVAL;
@@ -1232,16 +1204,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		return -EINVAL;
 	}
 
-	/* protect orig and donor against a truncate */
+	/* Protect orig and donor inodes against a truncate */
 	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
 	if (ret1 < 0)
 		return ret1;
 
-	mext_double_down_read(orig_inode, donor_inode);
+	/* Protect extent tree against block allocations via delalloc */
+	double_down_write_data_sem(orig_inode, donor_inode);
 	/* Check the filesystem environment whether move_extent can be done */
 	ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
-					donor_start, &len, *moved_len);
-	mext_double_up_read(orig_inode, donor_inode);
+				    donor_start, &len);
 	if (ret1)
 		goto out;
 
@@ -1355,36 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		seq_start = le32_to_cpu(ext_cur->ee_block);
 		rest_blocks = seq_blocks;
 
-		/* Discard preallocations of two inodes */
-		down_write(&EXT4_I(orig_inode)->i_data_sem);
-		ext4_discard_preallocations(orig_inode);
-		up_write(&EXT4_I(orig_inode)->i_data_sem);
-
-		down_write(&EXT4_I(donor_inode)->i_data_sem);
-		ext4_discard_preallocations(donor_inode);
-		up_write(&EXT4_I(donor_inode)->i_data_sem);
+		/*
+		 * Up semaphore to avoid following problems:
+		 * a. transaction deadlock among ext4_journal_start,
+		 *    ->write_begin via pagefault, and jbd2_journal_commit
+		 * b. racing with ->readpage, ->write_begin, and ext4_get_block
+		 *    in move_extent_per_page
+		 */
+		double_up_write_data_sem(orig_inode, donor_inode);
 
 		while (orig_page_offset <= seq_end_page) {
 
 			/* Swap original branches with new branches */
-			ret1 = move_extent_per_page(o_filp, donor_inode,
+			block_len_in_page = move_extent_per_page(
+						o_filp, donor_inode,
 						orig_page_offset,
 						data_offset_in_page,
-						block_len_in_page, uninit);
-			if (ret1 < 0)
-				goto out;
-			orig_page_offset++;
+						block_len_in_page, uninit,
+						&ret1);
+
 			/* Count how many blocks we have exchanged */
 			*moved_len += block_len_in_page;
+			if (ret1 < 0)
+				break;
 			if (*moved_len > len) {
 				ext4_error(orig_inode->i_sb, __func__,
 					"We replaced blocks too much! "
 					"sum of replaced: %llu requested: %llu",
 					*moved_len, len);
 				ret1 = -EIO;
-				goto out;
+				break;
 			}
 
+			orig_page_offset++;
 			data_offset_in_page = 0;
 			rest_blocks -= block_len_in_page;
 			if (rest_blocks > blocks_per_page)
@@ -1393,6 +1368,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 				block_len_in_page = rest_blocks;
 		}
 
+		double_down_write_data_sem(orig_inode, donor_inode);
+		if (ret1 < 0)
+			break;
+
 		/* Decrease buffer counter */
 		if (holecheck_path)
 			ext4_ext_drop_refs(holecheck_path);
@@ -1414,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 
 	}
 out:
+	if (*moved_len) {
+		ext4_discard_preallocations(orig_inode);
+		ext4_discard_preallocations(donor_inode);
+	}
+
 	if (orig_path) {
 		ext4_ext_drop_refs(orig_path);
 		kfree(orig_path);
@@ -1422,7 +1406,7 @@ out:
 		ext4_ext_drop_refs(holecheck_path);
 		kfree(holecheck_path);
 	}
-
+	double_up_write_data_sem(orig_inode, donor_inode);
 	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
 
 	if (ret1)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6d2c1b897fc..17a17e10dd6 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1292,9 +1292,6 @@ errout:
  * add_dirent_to_buf will attempt search the directory block for
  * space.  It will return -ENOSPC if no space is available, and -EIO
  * and -EEXIST if directory entry already exists.
- *
- * NOTE!  bh is NOT released in the case where ENOSPC is returned.  In
- * all other cases bh is released.
  */
 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode, struct ext4_dir_entry_2 *de,
@@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 		top = bh->b_data + blocksize - reclen;
 		while ((char *) de <= top) {
 			if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
-						  bh, offset)) {
-				brelse(bh);
+						  bh, offset))
 				return -EIO;
-			}
-			if (ext4_match(namelen, name, de)) {
-				brelse(bh);
+			if (ext4_match(namelen, name, de))
 				return -EEXIST;
-			}
 			nlen = EXT4_DIR_REC_LEN(de->name_len);
 			rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
 			if ((de->inode? rlen - nlen: rlen) >= reclen)
@@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	err = ext4_journal_get_write_access(handle, bh);
 	if (err) {
 		ext4_std_error(dir->i_sb, err);
-		brelse(bh);
 		return err;
 	}
 
@@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	err = ext4_handle_dirty_metadata(handle, dir, bh);
 	if (err)
 		ext4_std_error(dir->i_sb, err);
-	brelse(bh);
 	return 0;
 }
 
@@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	if (!(de))
 		return retval;
 
-	return add_dirent_to_buf(handle, dentry, inode, de, bh);
+	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	brelse(bh);
+	return retval;
 }
 
 /*
@@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		if(!bh)
 			return retval;
 		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
-		if (retval != -ENOSPC)
+		if (retval != -ENOSPC) {
+			brelse(bh);
 			return retval;
+		}
 
 		if (blocks == 1 && !dx_fallback &&
 		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
@@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
 	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
-	return add_dirent_to_buf(handle, dentry, inode, de, bh);
+	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	brelse(bh);
+	return retval;
 }
 
 /*
@@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 		goto journal_error;
 
 	err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
-	if (err != -ENOSPC) {
-		bh = NULL;
+	if (err != -ENOSPC)
 		goto cleanup;
-	}
 
 	/* Block full, should compress but for now just split */
 	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
@@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	if (!de)
 		goto cleanup;
 	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
-	bh = NULL;
 	goto cleanup;
 
 journal_error:
@@ -1775,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
 retry:
 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -1809,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
 retry:
 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -1846,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 retry:
 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2259,7 +2253,7 @@ static int ext4_symlink(struct inode *dir,
 retry:
 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
-					2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3cfc343c41b..3b2c5541d8a 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 			goto exit_bh;
 
 		if (IS_ERR(gdb = bclean(handle, sb, block))) {
-			err = PTR_ERR(bh);
+			err = PTR_ERR(gdb);
 			goto exit_bh;
 		}
 		ext4_handle_dirty_metadata(handle, NULL, gdb);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d4ca92aab51..827bde1f259 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -603,10 +603,6 @@ static void ext4_put_super(struct super_block *sb)
 	if (sb->s_dirt)
 		ext4_commit_super(sb, 1);
 
-	ext4_release_system_zone(sb);
-	ext4_mb_release(sb);
-	ext4_ext_release(sb);
-	ext4_xattr_put_super(sb);
 	if (sbi->s_journal) {
 		err = jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
@@ -614,6 +610,12 @@ static void ext4_put_super(struct super_block *sb)
 			ext4_abort(sb, __func__,
 				   "Couldn't clean up the journal");
 	}
+
+	ext4_release_system_zone(sb);
+	ext4_mb_release(sb);
+	ext4_ext_release(sb);
+	ext4_xattr_put_super(sb);
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -704,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	spin_lock_init(&(ei->i_block_reservation_lock));
 	INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
 	ei->cur_aio_dio = NULL;
+	ei->i_sync_tid = 0;
+	ei->i_datasync_tid = 0;
 
 	return &ei->vfs_inode;
 }
@@ -765,9 +769,22 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
 #if defined(CONFIG_QUOTA)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (sbi->s_jquota_fmt)
-		seq_printf(seq, ",jqfmt=%s",
-		(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0");
+	if (sbi->s_jquota_fmt) {
+		char *fmtname = "";
+
+		switch (sbi->s_jquota_fmt) {
+		case QFMT_VFS_OLD:
+			fmtname = "vfsold";
+			break;
+		case QFMT_VFS_V0:
+			fmtname = "vfsv0";
+			break;
+		case QFMT_VFS_V1:
+			fmtname = "vfsv1";
+			break;
+		}
+		seq_printf(seq, ",jqfmt=%s", fmtname);
+	}
 
 	if (sbi->s_qf_names[USRQUOTA])
 		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -899,6 +916,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, NO_AUTO_DA_ALLOC))
 		seq_puts(seq, ",noauto_da_alloc");
 
+	if (test_opt(sb, DISCARD))
+		seq_puts(seq, ",discard");
+
+	if (test_opt(sb, NOLOAD))
+		seq_puts(seq, ",norecovery");
+
 	ext4_show_quota_options(seq, sb);
 
 	return 0;
@@ -1074,12 +1097,13 @@ enum {
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
-	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
-	Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
-	Opt_usrquota, Opt_grpquota, Opt_i_version,
+	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+	Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
+	Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 	Opt_block_validity, Opt_noblock_validity,
-	Opt_inode_readahead_blks, Opt_journal_ioprio
+	Opt_inode_readahead_blks, Opt_journal_ioprio,
+	Opt_discard, Opt_nodiscard,
 };
 
 static const match_table_t tokens = {
@@ -1104,6 +1128,7 @@ static const match_table_t tokens = {
 	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
 	{Opt_noload, "noload"},
+	{Opt_noload, "norecovery"},
 	{Opt_nobh, "nobh"},
 	{Opt_bh, "bh"},
 	{Opt_commit, "commit=%u"},
@@ -1125,6 +1150,7 @@ static const match_table_t tokens = {
 	{Opt_grpjquota, "grpjquota=%s"},
 	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
 	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
+	{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
 	{Opt_grpquota, "grpquota"},
 	{Opt_noquota, "noquota"},
 	{Opt_quota, "quota"},
@@ -1144,6 +1170,8 @@ static const match_table_t tokens = {
 	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
 	{Opt_auto_da_alloc, "auto_da_alloc"},
 	{Opt_noauto_da_alloc, "noauto_da_alloc"},
+	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
 	{Opt_err, NULL},
 };
 
@@ -1425,6 +1453,9 @@ clear_qf_name:
 			goto set_qf_format;
 		case Opt_jqfmt_vfsv0:
 			qfmt = QFMT_VFS_V0;
+			goto set_qf_format;
+		case Opt_jqfmt_vfsv1:
+			qfmt = QFMT_VFS_V1;
 set_qf_format:
 			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != qfmt) {
@@ -1467,6 +1498,7 @@ set_qf_format:
 		case Opt_offgrpjquota:
 		case Opt_jqfmt_vfsold:
 		case Opt_jqfmt_vfsv0:
+		case Opt_jqfmt_vfsv1:
 			ext4_msg(sb, KERN_ERR,
 				"journaled quota options not supported");
 			break;
@@ -1565,6 +1597,12 @@ set_qf_format:
 			else
 				set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
 			break;
+		case Opt_discard:
+			set_opt(sbi->s_mount_opt, DISCARD);
+			break;
+		case Opt_nodiscard:
+			clear_opt(sbi->s_mount_opt, DISCARD);
+			break;
 		default:
 			ext4_msg(sb, KERN_ERR,
 			       "Unrecognized mount option \"%s\" "
@@ -1673,14 +1711,14 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	size_t size;
 	int i;
 
-	if (!sbi->s_es->s_log_groups_per_flex) {
+	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+	if (groups_per_flex < 2) {
 		sbi->s_log_groups_per_flex = 0;
 		return 1;
 	}
 
-	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
-	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
-
 	/* We allocate both existing and potentially added groups */
 	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
 			((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
@@ -2099,11 +2137,8 @@ static int parse_strtoul(const char *buf,
 {
 	char *endp;
 
-	while (*buf && isspace(*buf))
-		buf++;
-	*value = simple_strtoul(buf, &endp, 0);
-	while (*endp && isspace(*endp))
-		endp++;
+	*value = simple_strtoul(skip_spaces(buf), &endp, 0);
+	endp = skip_spaces(endp);
 	if (*endp || *value > max)
 		return -EINVAL;
 
@@ -2721,26 +2756,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
 		if (ext4_load_journal(sb, es, journal_devnum))
 			goto failed_mount3;
-		if (!(sb->s_flags & MS_RDONLY) &&
-		    EXT4_SB(sb)->s_journal->j_failed_commit) {
-			ext4_msg(sb, KERN_CRIT, "error: "
-			       "ext4_fill_super: Journal transaction "
-			       "%u is corrupt",
-			       EXT4_SB(sb)->s_journal->j_failed_commit);
-			if (test_opt(sb, ERRORS_RO)) {
-				ext4_msg(sb, KERN_CRIT,
-				       "Mounting filesystem read-only");
-				sb->s_flags |= MS_RDONLY;
-				EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-				es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-			}
-			if (test_opt(sb, ERRORS_PANIC)) {
-				EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-				es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-				ext4_commit_super(sb, 1);
-				goto failed_mount4;
-			}
-		}
 	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
 	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 		ext4_msg(sb, KERN_ERR, "required journal recovery "
@@ -3668,13 +3683,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
 	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
 		       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
-	ext4_free_blocks_count_set(es, buf->f_bfree);
 	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
 	if (buf->f_bfree < ext4_r_blocks_count(es))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
-	es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
 	buf->f_namelen = EXT4_NAME_LEN;
 	fsid = le64_to_cpup((void *)es->s_uuid) ^
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -3966,6 +3979,58 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
+#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext2",
+	.get_sb		= ext4_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+static inline void register_as_ext2(void)
+{
+	int err = register_filesystem(&ext2_fs_type);
+	if (err)
+		printk(KERN_WARNING
+		       "EXT4-fs: Unable to register as ext2 (%d)\n", err);
+}
+
+static inline void unregister_as_ext2(void)
+{
+	unregister_filesystem(&ext2_fs_type);
+}
+#else
+static inline void register_as_ext2(void) { }
+static inline void unregister_as_ext2(void) { }
+#endif
+
+#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext3",
+	.get_sb		= ext4_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+static inline void register_as_ext3(void)
+{
+	int err = register_filesystem(&ext3_fs_type);
+	if (err)
+		printk(KERN_WARNING
+		       "EXT4-fs: Unable to register as ext3 (%d)\n", err);
+}
+
+static inline void unregister_as_ext3(void)
+{
+	unregister_filesystem(&ext3_fs_type);
+}
+#else
+static inline void register_as_ext3(void) { }
+static inline void unregister_as_ext3(void) { }
+#endif
+
 static struct file_system_type ext4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext4",
@@ -3995,11 +4060,15 @@ static int __init init_ext4_fs(void)
 	err = init_inodecache();
 	if (err)
 		goto out1;
+	register_as_ext2();
+	register_as_ext3();
 	err = register_filesystem(&ext4_fs_type);
 	if (err)
 		goto out;
 	return 0;
 out:
+	unregister_as_ext2();
+	unregister_as_ext3();
 	destroy_inodecache();
 out1:
 	exit_ext4_xattr();
@@ -4015,6 +4084,8 @@ out4:
 
 static void __exit exit_ext4_fs(void)
 {
+	unregister_as_ext2();
+	unregister_as_ext3();
 	unregister_filesystem(&ext4_fs_type);
 	destroy_inodecache();
 	exit_ext4_xattr();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fed5b01d7a8..910bf9a59cb 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -482,9 +482,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		ea_bdebug(bh, "refcount now=0; freeing");
 		if (ce)
 			mb_cache_entry_free(ce);
-		ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
 		get_bh(bh);
-		ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
+		ext4_free_blocks(handle, inode, bh, 0, 1,
+				 EXT4_FREE_BLOCKS_METADATA |
+				 EXT4_FREE_BLOCKS_FORGET);
 	} else {
 		le32_add_cpu(&BHDR(bh)->h_refcount, -1);
 		error = ext4_handle_dirty_metadata(handle, inode, bh);
@@ -832,7 +833,8 @@ inserted:
 			new_bh = sb_getblk(sb, block);
 			if (!new_bh) {
 getblk_failed:
-				ext4_free_blocks(handle, inode, block, 1, 1);
+				ext4_free_blocks(handle, inode, 0, block, 1,
+						 EXT4_FREE_BLOCKS_METADATA);
 				error = -EIO;
 				goto cleanup;
 			}
@@ -988,6 +990,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (error)
 		goto cleanup;
 
+	error = ext4_journal_get_write_access(handle, is.iloc.bh);
+	if (error)
+		goto cleanup;
+
 	if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
 		struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
@@ -1013,9 +1019,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (flags & XATTR_CREATE)
 			goto cleanup;
 	}
-	error = ext4_journal_get_write_access(handle, is.iloc.bh);
-	if (error)
-		goto cleanup;
 	if (!value) {
 		if (!is.s.not_found)
 			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 7db0979c6b7..e6efdfa0f6d 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -44,7 +44,8 @@ struct fat_mount_options {
 		 nocase:1,	  /* Does this need case conversion? 0=need case conversion*/
 		 usefree:1,	  /* Use free_clusters for FAT32 */
 		 tz_utc:1,	  /* Filesystem timestamps are in UTC */
-		 rodir:1;	  /* allow ATTR_RO for directory */
+		 rodir:1,	  /* allow ATTR_RO for directory */
+		 discard:1;	  /* Issue discard requests on deletions */
 };
 
 #define FAT_HASH_BITS	8
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index a81037721a6..81184d3b75a 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -566,16 +566,21 @@ int fat_free_clusters(struct inode *inode, int cluster)
 			goto error;
 		}
 
-		/* 
-		 * Issue discard for the sectors we no longer care about,
-		 * batching contiguous clusters into one request
-		 */
-		if (cluster != fatent.entry + 1) {
-			int nr_clus = fatent.entry - first_cl + 1;
-
-			sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
-					 nr_clus * sbi->sec_per_clus);
-			first_cl = cluster;
+		if (sbi->options.discard) {
+			/*
+			 * Issue discard for the sectors we no longer
+			 * care about, batching contiguous clusters
+			 * into one request
+			 */
+			if (cluster != fatent.entry + 1) {
+				int nr_clus = fatent.entry - first_cl + 1;
+
+				sb_issue_discard(sb,
+					fat_clus_to_blknr(sbi, first_cl),
+					nr_clus * sbi->sec_per_clus);
+
+				first_cl = cluster;
+			}
 		}
 
 		ops->ent_put(&fatent, FAT_ENT_FREE);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 76b7961ab66..14da530b05c 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -858,6 +858,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_puts(m, ",errors=panic");
 	else
 		seq_puts(m, ",errors=remount-ro");
+	if (opts->discard)
+		seq_puts(m, ",discard");
 
 	return 0;
 }
@@ -871,7 +873,7 @@ enum {
 	Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
 	Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
 	Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
-	Opt_err_panic, Opt_err_ro, Opt_err,
+	Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err,
 };
 
 static const match_table_t fat_tokens = {
@@ -899,6 +901,7 @@ static const match_table_t fat_tokens = {
 	{Opt_err_cont, "errors=continue"},
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_discard, "discard"},
 	{Opt_obsolate, "conv=binary"},
 	{Opt_obsolate, "conv=text"},
 	{Opt_obsolate, "conv=auto"},
@@ -1136,6 +1139,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 		case Opt_rodir:
 			opts->rodir = 1;
 			break;
+		case Opt_discard:
+			opts->discard = 1;
+			break;
 
 		/* obsolete mount options */
 		case Opt_obsolate:
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 0f55f5cb732..d3da05f2646 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
+#include <linux/time.h>
 #include "fat.h"
 
 /*
@@ -157,10 +158,6 @@ extern struct timezone sys_tz;
 #define SECS_PER_MIN	60
 #define SECS_PER_HOUR	(60 * 60)
 #define SECS_PER_DAY	(SECS_PER_HOUR * 24)
-#define UNIX_SECS_1980	315532800L
-#if BITS_PER_LONG == 64
-#define UNIX_SECS_2108	4354819200L
-#endif
 /* days between 1.1.70 and 1.1.80 (2 leap days) */
 #define DAYS_DELTA	(365 * 10 + 2)
 /* 120 (2100 - 1980) isn't leap year */
@@ -213,58 +210,35 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
 void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
 		       __le16 *time, __le16 *date, u8 *time_cs)
 {
-	time_t second = ts->tv_sec;
-	time_t day, leap_day, month, year;
+	struct tm tm;
+	time_to_tm(ts->tv_sec, sbi->options.tz_utc ? 0 :
+		   -sys_tz.tz_minuteswest * 60, &tm);
 
-	if (!sbi->options.tz_utc)
-		second -= sys_tz.tz_minuteswest * SECS_PER_MIN;
-
-	/* Jan 1 GMT 00:00:00 1980. But what about another time zone? */
-	if (second < UNIX_SECS_1980) {
+	/*  FAT can only support year between 1980 to 2107 */
+	if (tm.tm_year < 1980 - 1900) {
 		*time = 0;
 		*date = cpu_to_le16((0 << 9) | (1 << 5) | 1);
 		if (time_cs)
 			*time_cs = 0;
 		return;
 	}
-#if BITS_PER_LONG == 64
-	if (second >= UNIX_SECS_2108) {
+	if (tm.tm_year > 2107 - 1900) {
 		*time = cpu_to_le16((23 << 11) | (59 << 5) | 29);
 		*date = cpu_to_le16((127 << 9) | (12 << 5) | 31);
 		if (time_cs)
 			*time_cs = 199;
 		return;
 	}
-#endif
 
-	day = second / SECS_PER_DAY - DAYS_DELTA;
-	year = day / 365;
-	leap_day = (year + 3) / 4;
-	if (year > YEAR_2100)		/* 2100 isn't leap year */
-		leap_day--;
-	if (year * 365 + leap_day > day)
-		year--;
-	leap_day = (year + 3) / 4;
-	if (year > YEAR_2100)		/* 2100 isn't leap year */
-		leap_day--;
-	day -= year * 365 + leap_day;
-
-	if (IS_LEAP_YEAR(year) && day == days_in_year[3]) {
-		month = 2;
-	} else {
-		if (IS_LEAP_YEAR(year) && day > days_in_year[3])
-			day--;
-		for (month = 1; month < 12; month++) {
-			if (days_in_year[month + 1] > day)
-				break;
-		}
-	}
-	day -= days_in_year[month];
+	/* from 1900 -> from 1980 */
+	tm.tm_year -= 80;
+	/* 0~11 -> 1~12 */
+	tm.tm_mon++;
+	/* 0~59 -> 0~29(2sec counts) */
+	tm.tm_sec >>= 1;
 
-	*time = cpu_to_le16(((second / SECS_PER_HOUR) % 24) << 11
-			    | ((second / SECS_PER_MIN) % 60) << 5
-			    | (second % SECS_PER_MIN) >> 1);
-	*date = cpu_to_le16((year << 9) | (month << 5) | (day + 1));
+	*time = cpu_to_le16(tm.tm_hour << 11 | tm.tm_min << 5 | tm.tm_sec);
+	*date = cpu_to_le16(tm.tm_year << 9 | tm.tm_mon << 5 | tm.tm_mday);
 	if (time_cs)
 		*time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000;
 }
@@ -285,4 +259,3 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 	}
 	return err;
 }
-
diff --git a/fs/file_table.c b/fs/file_table.c
index 8eb44042e00..4bef4c01ec6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -13,7 +13,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/eventpoll.h>
 #include <linux/rcupdate.h>
 #include <linux/mount.h>
@@ -280,7 +279,6 @@ void __fput(struct file *file)
 	if (file->f_op && file->f_op->release)
 		file->f_op->release(inode, file);
 	security_file_free(file);
-	ima_file_free(file);
 	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
 		cdev_put(inode->i_cdev);
 	fops_put(file->f_op);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9d5360c4c2a..49bc1b8e8f1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -614,7 +614,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 				struct writeback_control *wbc)
 {
 	struct super_block *sb = wbc->sb, *pin_sb = NULL;
-	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
 	spin_lock(&inode_lock);
@@ -635,36 +634,11 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 			continue;
 		}
 
-		if (!bdi_cap_writeback_dirty(wb->bdi)) {
-			redirty_tail(inode);
-			if (is_blkdev_sb) {
-				/*
-				 * Dirty memory-backed blockdev: the ramdisk
-				 * driver does this.  Skip just this inode
-				 */
-				continue;
-			}
-			/*
-			 * Dirty memory-backed inode against a filesystem other
-			 * than the kernel-internal bdev filesystem.  Skip the
-			 * entire superblock.
-			 */
-			break;
-		}
-
 		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
 		}
 
-		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
-			wbc->encountered_congestion = 1;
-			if (!is_blkdev_sb)
-				break;		/* Skip a congested fs */
-			requeue_io(inode);
-			continue;		/* Skip a congested blockdev */
-		}
-
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
@@ -756,6 +730,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		.sync_mode		= args->sync_mode,
 		.older_than_this	= NULL,
 		.for_kupdate		= args->for_kupdate,
+		.for_background		= args->for_background,
 		.range_cyclic		= args->range_cyclic,
 	};
 	unsigned long oldest_jif;
@@ -787,7 +762,6 @@ static long wb_writeback(struct bdi_writeback *wb,
 			break;
 
 		wbc.more_io = 0;
-		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
 		writeback_inodes_wb(wb, &wbc);
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 9bbb8ce7bea..864dac20a24 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -54,3 +54,10 @@ config FSCACHE_DEBUG
 	  enabled by setting bits in /sys/modules/fscache/parameter/debug.
 
 	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_OBJECT_LIST
+	bool "Maintain global object list for debugging purposes"
+	depends on FSCACHE && PROC_FS
+	help
+	  Maintain a global list of active fscache objects that can be
+	  retrieved through /proc/fs/fscache/objects for debugging purposes
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
index 91571b95aac..6d561531cb3 100644
--- a/fs/fscache/Makefile
+++ b/fs/fscache/Makefile
@@ -15,5 +15,6 @@ fscache-y := \
 fscache-$(CONFIG_PROC_FS) += proc.o
 fscache-$(CONFIG_FSCACHE_STATS) += stats.o
 fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
+fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o
 
 obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index e21985bbb1f..6a3c48abd67 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -263,6 +263,7 @@ int fscache_add_cache(struct fscache_cache *cache,
 	spin_lock(&cache->object_list_lock);
 	list_add_tail(&ifsdef->cache_link, &cache->object_list);
 	spin_unlock(&cache->object_list_lock);
+	fscache_objlist_add(ifsdef);
 
 	/* add the cache's netfs definition index object to the top level index
 	 * cookie as a known backing object */
@@ -380,11 +381,15 @@ void fscache_withdraw_cache(struct fscache_cache *cache)
 
 	/* make sure all pages pinned by operations on behalf of the netfs are
 	 * written to disk */
+	fscache_stat(&fscache_n_cop_sync_cache);
 	cache->ops->sync_cache(cache);
+	fscache_stat_d(&fscache_n_cop_sync_cache);
 
 	/* dissociate all the netfs pages backed by this cache from the block
 	 * mappings in the cache */
+	fscache_stat(&fscache_n_cop_dissociate_pages);
 	cache->ops->dissociate_pages(cache);
+	fscache_stat_d(&fscache_n_cop_dissociate_pages);
 
 	/* we now have to destroy all the active objects pertaining to this
 	 * cache - which we do by passing them off to thread pool to be
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 72fd18f6c71..990535071a8 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -36,6 +36,7 @@ void fscache_cookie_init_once(void *_cookie)
 
 	memset(cookie, 0, sizeof(*cookie));
 	spin_lock_init(&cookie->lock);
+	spin_lock_init(&cookie->stores_lock);
 	INIT_HLIST_HEAD(&cookie->backing_objects);
 }
 
@@ -102,7 +103,9 @@ struct fscache_cookie *__fscache_acquire_cookie(
 	cookie->netfs_data	= netfs_data;
 	cookie->flags		= 0;
 
-	INIT_RADIX_TREE(&cookie->stores, GFP_NOFS);
+	/* radix tree insertion won't use the preallocation pool unless it's
+	 * told it may not wait */
+	INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT);
 
 	switch (cookie->def->type) {
 	case FSCACHE_COOKIE_TYPE_INDEX:
@@ -249,7 +252,9 @@ static int fscache_alloc_object(struct fscache_cache *cache,
 
 	/* ask the cache to allocate an object (we may end up with duplicate
 	 * objects at this stage, but we sort that out later) */
+	fscache_stat(&fscache_n_cop_alloc_object);
 	object = cache->ops->alloc_object(cache, cookie);
+	fscache_stat_d(&fscache_n_cop_alloc_object);
 	if (IS_ERR(object)) {
 		fscache_stat(&fscache_n_object_no_alloc);
 		ret = PTR_ERR(object);
@@ -270,8 +275,11 @@ static int fscache_alloc_object(struct fscache_cache *cache,
 	/* only attach if we managed to allocate all we needed, otherwise
 	 * discard the object we just allocated and instead use the one
 	 * attached to the cookie */
-	if (fscache_attach_object(cookie, object) < 0)
+	if (fscache_attach_object(cookie, object) < 0) {
+		fscache_stat(&fscache_n_cop_put_object);
 		cache->ops->put_object(object);
+		fscache_stat_d(&fscache_n_cop_put_object);
+	}
 
 	_leave(" = 0");
 	return 0;
@@ -287,7 +295,9 @@ object_already_extant:
 	return 0;
 
 error_put:
+	fscache_stat(&fscache_n_cop_put_object);
 	cache->ops->put_object(object);
+	fscache_stat_d(&fscache_n_cop_put_object);
 error:
 	_leave(" = %d", ret);
 	return ret;
@@ -349,6 +359,8 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
 	object->cookie = cookie;
 	atomic_inc(&cookie->usage);
 	hlist_add_head(&object->cookie_link, &cookie->backing_objects);
+
+	fscache_objlist_add(object);
 	ret = 0;
 
 cant_attach_object:
@@ -403,6 +415,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 	unsigned long event;
 
 	fscache_stat(&fscache_n_relinquishes);
+	if (retire)
+		fscache_stat(&fscache_n_relinquishes_retire);
 
 	if (!cookie) {
 		fscache_stat(&fscache_n_relinquishes_null);
@@ -428,12 +442,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 
 	event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
 
-	/* detach pointers back to the netfs */
 	spin_lock(&cookie->lock);
 
-	cookie->netfs_data	= NULL;
-	cookie->def		= NULL;
-
 	/* break links with all the active objects */
 	while (!hlist_empty(&cookie->backing_objects)) {
 		object = hlist_entry(cookie->backing_objects.first,
@@ -456,6 +466,10 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 			BUG();
 	}
 
+	/* detach pointers back to the netfs */
+	cookie->netfs_data	= NULL;
+	cookie->def		= NULL;
+
 	spin_unlock(&cookie->lock);
 
 	if (cookie->parent) {
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 1c341304621..edd7434ab6e 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -17,6 +17,7 @@
  * - cache->object_list_lock
  * - object->lock
  * - object->parent->lock
+ * - cookie->stores_lock
  * - fscache_thread_lock
  *
  */
@@ -88,17 +89,31 @@ extern int fscache_wait_bit_interruptible(void *);
 /*
  * object.c
  */
+extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
+
 extern void fscache_withdrawing_object(struct fscache_cache *,
 				       struct fscache_object *);
 extern void fscache_enqueue_object(struct fscache_object *);
 
 /*
+ * object-list.c
+ */
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+extern const struct file_operations fscache_objlist_fops;
+
+extern void fscache_objlist_add(struct fscache_object *);
+#else
+#define fscache_objlist_add(object) do {} while(0)
+#endif
+
+/*
  * operation.c
  */
 extern int fscache_submit_exclusive_op(struct fscache_object *,
 				       struct fscache_operation *);
 extern int fscache_submit_op(struct fscache_object *,
 			     struct fscache_operation *);
+extern int fscache_cancel_op(struct fscache_operation *);
 extern void fscache_abort_object(struct fscache_object *);
 extern void fscache_start_operations(struct fscache_object *);
 extern void fscache_operation_gc(struct work_struct *);
@@ -127,6 +142,8 @@ extern atomic_t fscache_n_op_enqueue;
 extern atomic_t fscache_n_op_deferred_release;
 extern atomic_t fscache_n_op_release;
 extern atomic_t fscache_n_op_gc;
+extern atomic_t fscache_n_op_cancelled;
+extern atomic_t fscache_n_op_rejected;
 
 extern atomic_t fscache_n_attr_changed;
 extern atomic_t fscache_n_attr_changed_ok;
@@ -138,6 +155,8 @@ extern atomic_t fscache_n_allocs;
 extern atomic_t fscache_n_allocs_ok;
 extern atomic_t fscache_n_allocs_wait;
 extern atomic_t fscache_n_allocs_nobufs;
+extern atomic_t fscache_n_allocs_intr;
+extern atomic_t fscache_n_allocs_object_dead;
 extern atomic_t fscache_n_alloc_ops;
 extern atomic_t fscache_n_alloc_op_waits;
 
@@ -148,6 +167,7 @@ extern atomic_t fscache_n_retrievals_nodata;
 extern atomic_t fscache_n_retrievals_nobufs;
 extern atomic_t fscache_n_retrievals_intr;
 extern atomic_t fscache_n_retrievals_nomem;
+extern atomic_t fscache_n_retrievals_object_dead;
 extern atomic_t fscache_n_retrieval_ops;
 extern atomic_t fscache_n_retrieval_op_waits;
 
@@ -158,6 +178,14 @@ extern atomic_t fscache_n_stores_nobufs;
 extern atomic_t fscache_n_stores_oom;
 extern atomic_t fscache_n_store_ops;
 extern atomic_t fscache_n_store_calls;
+extern atomic_t fscache_n_store_pages;
+extern atomic_t fscache_n_store_radix_deletes;
+extern atomic_t fscache_n_store_pages_over_limit;
+
+extern atomic_t fscache_n_store_vmscan_not_storing;
+extern atomic_t fscache_n_store_vmscan_gone;
+extern atomic_t fscache_n_store_vmscan_busy;
+extern atomic_t fscache_n_store_vmscan_cancelled;
 
 extern atomic_t fscache_n_marks;
 extern atomic_t fscache_n_uncaches;
@@ -176,6 +204,7 @@ extern atomic_t fscache_n_updates_run;
 extern atomic_t fscache_n_relinquishes;
 extern atomic_t fscache_n_relinquishes_null;
 extern atomic_t fscache_n_relinquishes_waitcrt;
+extern atomic_t fscache_n_relinquishes_retire;
 
 extern atomic_t fscache_n_cookie_index;
 extern atomic_t fscache_n_cookie_data;
@@ -186,6 +215,7 @@ extern atomic_t fscache_n_object_no_alloc;
 extern atomic_t fscache_n_object_lookups;
 extern atomic_t fscache_n_object_lookups_negative;
 extern atomic_t fscache_n_object_lookups_positive;
+extern atomic_t fscache_n_object_lookups_timed_out;
 extern atomic_t fscache_n_object_created;
 extern atomic_t fscache_n_object_avail;
 extern atomic_t fscache_n_object_dead;
@@ -195,15 +225,41 @@ extern atomic_t fscache_n_checkaux_okay;
 extern atomic_t fscache_n_checkaux_update;
 extern atomic_t fscache_n_checkaux_obsolete;
 
+extern atomic_t fscache_n_cop_alloc_object;
+extern atomic_t fscache_n_cop_lookup_object;
+extern atomic_t fscache_n_cop_lookup_complete;
+extern atomic_t fscache_n_cop_grab_object;
+extern atomic_t fscache_n_cop_update_object;
+extern atomic_t fscache_n_cop_drop_object;
+extern atomic_t fscache_n_cop_put_object;
+extern atomic_t fscache_n_cop_sync_cache;
+extern atomic_t fscache_n_cop_attr_changed;
+extern atomic_t fscache_n_cop_read_or_alloc_page;
+extern atomic_t fscache_n_cop_read_or_alloc_pages;
+extern atomic_t fscache_n_cop_allocate_page;
+extern atomic_t fscache_n_cop_allocate_pages;
+extern atomic_t fscache_n_cop_write_page;
+extern atomic_t fscache_n_cop_uncache_page;
+extern atomic_t fscache_n_cop_dissociate_pages;
+
 static inline void fscache_stat(atomic_t *stat)
 {
 	atomic_inc(stat);
 }
 
+static inline void fscache_stat_d(atomic_t *stat)
+{
+	atomic_dec(stat);
+}
+
+#define __fscache_stat(stat) (stat)
+
 extern const struct file_operations fscache_stats_fops;
 #else
 
+#define __fscache_stat(stat) (NULL)
 #define fscache_stat(stat) do {} while (0)
+#define fscache_stat_d(stat) do {} while (0)
 #endif
 
 /*
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 4de41b59749..add6bdb53f0 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -48,7 +48,7 @@ static int __init fscache_init(void)
 {
 	int ret;
 
-	ret = slow_work_register_user();
+	ret = slow_work_register_user(THIS_MODULE);
 	if (ret < 0)
 		goto error_slow_work;
 
@@ -80,7 +80,7 @@ error_kobj:
 error_cookie_jar:
 	fscache_proc_cleanup();
 error_proc:
-	slow_work_unregister_user();
+	slow_work_unregister_user(THIS_MODULE);
 error_slow_work:
 	return ret;
 }
@@ -97,7 +97,7 @@ static void __exit fscache_exit(void)
 	kobject_put(fscache_root);
 	kmem_cache_destroy(fscache_cookie_jar);
 	fscache_proc_cleanup();
-	slow_work_unregister_user();
+	slow_work_unregister_user(THIS_MODULE);
 	printk(KERN_NOTICE "FS-Cache: Unloaded\n");
 }
 
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
new file mode 100644
index 00000000000..3221a0c7944
--- /dev/null
+++ b/fs/fscache/object-list.c
@@ -0,0 +1,432 @@
+/* Global fscache object list maintainer and viewer
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/key.h>
+#include <keys/user-type.h>
+#include "internal.h"
+
+static struct rb_root fscache_object_list;
+static DEFINE_RWLOCK(fscache_object_list_lock);
+
+struct fscache_objlist_data {
+	unsigned long	config;		/* display configuration */
+#define FSCACHE_OBJLIST_CONFIG_KEY	0x00000001	/* show object keys */
+#define FSCACHE_OBJLIST_CONFIG_AUX	0x00000002	/* show object auxdata */
+#define FSCACHE_OBJLIST_CONFIG_COOKIE	0x00000004	/* show objects with cookies */
+#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE	0x00000008	/* show objects without cookies */
+#define FSCACHE_OBJLIST_CONFIG_BUSY	0x00000010	/* show busy objects */
+#define FSCACHE_OBJLIST_CONFIG_IDLE	0x00000020	/* show idle objects */
+#define FSCACHE_OBJLIST_CONFIG_PENDWR	0x00000040	/* show objects with pending writes */
+#define FSCACHE_OBJLIST_CONFIG_NOPENDWR	0x00000080	/* show objects without pending writes */
+#define FSCACHE_OBJLIST_CONFIG_READS	0x00000100	/* show objects with active reads */
+#define FSCACHE_OBJLIST_CONFIG_NOREADS	0x00000200	/* show objects without active reads */
+#define FSCACHE_OBJLIST_CONFIG_EVENTS	0x00000400	/* show objects with events */
+#define FSCACHE_OBJLIST_CONFIG_NOEVENTS	0x00000800	/* show objects without no events */
+#define FSCACHE_OBJLIST_CONFIG_WORK	0x00001000	/* show objects with slow work */
+#define FSCACHE_OBJLIST_CONFIG_NOWORK	0x00002000	/* show objects without slow work */
+
+	u8		buf[512];	/* key and aux data buffer */
+};
+
+/*
+ * Add an object to the object list
+ * - we use the address of the fscache_object structure as the key into the
+ *   tree
+ */
+void fscache_objlist_add(struct fscache_object *obj)
+{
+	struct fscache_object *xobj;
+	struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL;
+
+	write_lock(&fscache_object_list_lock);
+
+	while (*p) {
+		parent = *p;
+		xobj = rb_entry(parent, struct fscache_object, objlist_link);
+
+		if (obj < xobj)
+			p = &(*p)->rb_left;
+		else if (obj > xobj)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&obj->objlist_link, parent, p);
+	rb_insert_color(&obj->objlist_link, &fscache_object_list);
+
+	write_unlock(&fscache_object_list_lock);
+}
+
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @object: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ */
+void fscache_object_destroy(struct fscache_object *obj)
+{
+	write_lock(&fscache_object_list_lock);
+
+	BUG_ON(RB_EMPTY_ROOT(&fscache_object_list));
+	rb_erase(&obj->objlist_link, &fscache_object_list);
+
+	write_unlock(&fscache_object_list_lock);
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+
+/*
+ * find the object in the tree on or after the specified index
+ */
+static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
+{
+	struct fscache_object *pobj, *obj = NULL, *minobj = NULL;
+	struct rb_node *p;
+	unsigned long pos;
+
+	if (*_pos >= (unsigned long) ERR_PTR(-ENOENT))
+		return NULL;
+	pos = *_pos;
+
+	/* banners (can't represent line 0 by pos 0 as that would involve
+	 * returning a NULL pointer) */
+	if (pos == 0)
+		return (struct fscache_object *) ++(*_pos);
+	if (pos < 3)
+		return (struct fscache_object *)pos;
+
+	pobj = (struct fscache_object *)pos;
+	p = fscache_object_list.rb_node;
+	while (p) {
+		obj = rb_entry(p, struct fscache_object, objlist_link);
+		if (pobj < obj) {
+			if (!minobj || minobj > obj)
+				minobj = obj;
+			p = p->rb_left;
+		} else if (pobj > obj) {
+			p = p->rb_right;
+		} else {
+			minobj = obj;
+			break;
+		}
+		obj = NULL;
+	}
+
+	if (!minobj)
+		*_pos = (unsigned long) ERR_PTR(-ENOENT);
+	else if (minobj != obj)
+		*_pos = (unsigned long) minobj;
+	return minobj;
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos)
+	__acquires(&fscache_object_list_lock)
+{
+	read_lock(&fscache_object_list_lock);
+	return fscache_objlist_lookup(_pos);
+}
+
+/*
+ * move to the next line
+ */
+static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+	(*_pos)++;
+	return fscache_objlist_lookup(_pos);
+}
+
+/*
+ * clean up after reading
+ */
+static void fscache_objlist_stop(struct seq_file *m, void *v)
+	__releases(&fscache_object_list_lock)
+{
+	read_unlock(&fscache_object_list_lock);
+}
+
+/*
+ * display an object
+ */
+static int fscache_objlist_show(struct seq_file *m, void *v)
+{
+	struct fscache_objlist_data *data = m->private;
+	struct fscache_object *obj = v;
+	unsigned long config = data->config;
+	uint16_t keylen, auxlen;
+	char _type[3], *type;
+	bool no_cookie;
+	u8 *buf = data->buf, *p;
+
+	if ((unsigned long) v == 1) {
+		seq_puts(m, "OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS"
+			 " EM EV F S"
+			 " | NETFS_COOKIE_DEF TY FL NETFS_DATA");
+		if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
+			      FSCACHE_OBJLIST_CONFIG_AUX))
+			seq_puts(m, "       ");
+		if (config & FSCACHE_OBJLIST_CONFIG_KEY)
+			seq_puts(m, "OBJECT_KEY");
+		if ((config & (FSCACHE_OBJLIST_CONFIG_KEY |
+			       FSCACHE_OBJLIST_CONFIG_AUX)) ==
+		    (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX))
+			seq_puts(m, ", ");
+		if (config & FSCACHE_OBJLIST_CONFIG_AUX)
+			seq_puts(m, "AUX_DATA");
+		seq_puts(m, "\n");
+		return 0;
+	}
+
+	if ((unsigned long) v == 2) {
+		seq_puts(m, "======== ======== ==== ===== === === === == ====="
+			 " == == = ="
+			 " | ================ == == ================");
+		if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
+			      FSCACHE_OBJLIST_CONFIG_AUX))
+			seq_puts(m, " ================");
+		seq_puts(m, "\n");
+		return 0;
+	}
+
+	/* filter out any unwanted objects */
+#define FILTER(criterion, _yes, _no)					\
+	do {								\
+		unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes;	\
+		unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no;	\
+		if (criterion) {					\
+			if (!(config & yes))				\
+				return 0;				\
+		} else {						\
+			if (!(config & no))				\
+				return 0;				\
+		}							\
+	} while(0)
+
+	if (~config) {
+		FILTER(obj->cookie,
+		       COOKIE, NOCOOKIE);
+		FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
+		       obj->n_ops != 0 ||
+		       obj->n_obj_ops != 0 ||
+		       obj->flags ||
+		       !list_empty(&obj->dependents),
+		       BUSY, IDLE);
+		FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags),
+		       PENDWR, NOPENDWR);
+		FILTER(atomic_read(&obj->n_reads),
+		       READS, NOREADS);
+		FILTER(obj->events & obj->event_mask,
+		       EVENTS, NOEVENTS);
+		FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW),
+		       WORK, NOWORK);
+	}
+
+	seq_printf(m,
+		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ",
+		   obj->debug_id,
+		   obj->parent ? obj->parent->debug_id : -1,
+		   fscache_object_states_short[obj->state],
+		   obj->n_children,
+		   obj->n_ops,
+		   obj->n_obj_ops,
+		   obj->n_in_progress,
+		   obj->n_exclusive,
+		   atomic_read(&obj->n_reads),
+		   obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
+		   obj->events,
+		   obj->flags,
+		   obj->work.flags);
+
+	no_cookie = true;
+	keylen = auxlen = 0;
+	if (obj->cookie) {
+		spin_lock(&obj->lock);
+		if (obj->cookie) {
+			switch (obj->cookie->def->type) {
+			case 0:
+				type = "IX";
+				break;
+			case 1:
+				type = "DT";
+				break;
+			default:
+				sprintf(_type, "%02u",
+					obj->cookie->def->type);
+				type = _type;
+				break;
+			}
+
+			seq_printf(m, "%-16s %s %2lx %16p",
+				   obj->cookie->def->name,
+				   type,
+				   obj->cookie->flags,
+				   obj->cookie->netfs_data);
+
+			if (obj->cookie->def->get_key &&
+			    config & FSCACHE_OBJLIST_CONFIG_KEY)
+				keylen = obj->cookie->def->get_key(
+					obj->cookie->netfs_data,
+					buf, 400);
+
+			if (obj->cookie->def->get_aux &&
+			    config & FSCACHE_OBJLIST_CONFIG_AUX)
+				auxlen = obj->cookie->def->get_aux(
+					obj->cookie->netfs_data,
+					buf + keylen, 512 - keylen);
+
+			no_cookie = false;
+		}
+		spin_unlock(&obj->lock);
+
+		if (!no_cookie && (keylen > 0 || auxlen > 0)) {
+			seq_printf(m, " ");
+			for (p = buf; keylen > 0; keylen--)
+				seq_printf(m, "%02x", *p++);
+			if (auxlen > 0) {
+				if (config & FSCACHE_OBJLIST_CONFIG_KEY)
+					seq_printf(m, ", ");
+				for (; auxlen > 0; auxlen--)
+					seq_printf(m, "%02x", *p++);
+			}
+		}
+	}
+
+	if (no_cookie)
+		seq_printf(m, "<no_cookie>\n");
+	else
+		seq_printf(m, "\n");
+	return 0;
+}
+
+static const struct seq_operations fscache_objlist_ops = {
+	.start		= fscache_objlist_start,
+	.stop		= fscache_objlist_stop,
+	.next		= fscache_objlist_next,
+	.show		= fscache_objlist_show,
+};
+
+/*
+ * get the configuration for filtering the list
+ */
+static void fscache_objlist_config(struct fscache_objlist_data *data)
+{
+#ifdef CONFIG_KEYS
+	struct user_key_payload *confkey;
+	unsigned long config;
+	struct key *key;
+	const char *buf;
+	int len;
+
+	key = request_key(&key_type_user, "fscache:objlist", NULL);
+	if (IS_ERR(key))
+		goto no_config;
+
+	config = 0;
+	rcu_read_lock();
+
+	confkey = key->payload.data;
+	buf = confkey->data;
+
+	for (len = confkey->datalen - 1; len >= 0; len--) {
+		switch (buf[len]) {
+		case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY;		break;
+		case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX;		break;
+		case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE;	break;
+		case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE;	break;
+		case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY;	break;
+		case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE;	break;
+		case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR;	break;
+		case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR;	break;
+		case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS;	break;
+		case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS;	break;
+		case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK;	break;
+		case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK;	break;
+		}
+	}
+
+	rcu_read_unlock();
+	key_put(key);
+
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK;
+
+	data->config = config;
+	return;
+
+no_config:
+#endif
+	data->config = ULONG_MAX;
+}
+
+/*
+ * open "/proc/fs/fscache/objects" to provide a list of active objects
+ * - can be configured by a user-defined key added to the caller's keyrings
+ */
+static int fscache_objlist_open(struct inode *inode, struct file *file)
+{
+	struct fscache_objlist_data *data;
+	struct seq_file *m;
+	int ret;
+
+	ret = seq_open(file, &fscache_objlist_ops);
+	if (ret < 0)
+		return ret;
+
+	m = file->private_data;
+
+	/* buffer for key extraction */
+	data = kmalloc(sizeof(struct fscache_objlist_data), GFP_KERNEL);
+	if (!data) {
+		seq_release(inode, file);
+		return -ENOMEM;
+	}
+
+	/* get the configuration key */
+	fscache_objlist_config(data);
+
+	m->private = data;
+	return 0;
+}
+
+/*
+ * clean up on close
+ */
+static int fscache_objlist_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	kfree(m->private);
+	m->private = NULL;
+	return seq_release(inode, file);
+}
+
+const struct file_operations fscache_objlist_fops = {
+	.owner		= THIS_MODULE,
+	.open		= fscache_objlist_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= fscache_objlist_release,
+};
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 392a41b1b79..e513ac599c8 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,9 +14,10 @@
 
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 
-const char *fscache_object_states[] = {
+const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
 	[FSCACHE_OBJECT_INIT]		= "OBJECT_INIT",
 	[FSCACHE_OBJECT_LOOKING_UP]	= "OBJECT_LOOKING_UP",
 	[FSCACHE_OBJECT_CREATING]	= "OBJECT_CREATING",
@@ -33,9 +34,28 @@ const char *fscache_object_states[] = {
 };
 EXPORT_SYMBOL(fscache_object_states);
 
+const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
+	[FSCACHE_OBJECT_INIT]		= "INIT",
+	[FSCACHE_OBJECT_LOOKING_UP]	= "LOOK",
+	[FSCACHE_OBJECT_CREATING]	= "CRTN",
+	[FSCACHE_OBJECT_AVAILABLE]	= "AVBL",
+	[FSCACHE_OBJECT_ACTIVE]		= "ACTV",
+	[FSCACHE_OBJECT_UPDATING]	= "UPDT",
+	[FSCACHE_OBJECT_DYING]		= "DYNG",
+	[FSCACHE_OBJECT_LC_DYING]	= "LCDY",
+	[FSCACHE_OBJECT_ABORT_INIT]	= "ABTI",
+	[FSCACHE_OBJECT_RELEASING]	= "RELS",
+	[FSCACHE_OBJECT_RECYCLING]	= "RCYC",
+	[FSCACHE_OBJECT_WITHDRAWING]	= "WTHD",
+	[FSCACHE_OBJECT_DEAD]		= "DEAD",
+};
+
 static void fscache_object_slow_work_put_ref(struct slow_work *);
 static int  fscache_object_slow_work_get_ref(struct slow_work *);
 static void fscache_object_slow_work_execute(struct slow_work *);
+#ifdef CONFIG_SLOW_WORK_PROC
+static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
+#endif
 static void fscache_initialise_object(struct fscache_object *);
 static void fscache_lookup_object(struct fscache_object *);
 static void fscache_object_available(struct fscache_object *);
@@ -45,9 +65,13 @@ static void fscache_enqueue_dependents(struct fscache_object *);
 static void fscache_dequeue_object(struct fscache_object *);
 
 const struct slow_work_ops fscache_object_slow_work_ops = {
+	.owner		= THIS_MODULE,
 	.get_ref	= fscache_object_slow_work_get_ref,
 	.put_ref	= fscache_object_slow_work_put_ref,
 	.execute	= fscache_object_slow_work_execute,
+#ifdef CONFIG_SLOW_WORK_PROC
+	.desc		= fscache_object_slow_work_desc,
+#endif
 };
 EXPORT_SYMBOL(fscache_object_slow_work_ops);
 
@@ -81,6 +105,7 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
 static void fscache_object_state_machine(struct fscache_object *object)
 {
 	enum fscache_object_state new_state;
+	struct fscache_cookie *cookie;
 
 	ASSERT(object != NULL);
 
@@ -120,20 +145,31 @@ static void fscache_object_state_machine(struct fscache_object *object)
 	case FSCACHE_OBJECT_UPDATING:
 		clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
 		fscache_stat(&fscache_n_updates_run);
+		fscache_stat(&fscache_n_cop_update_object);
 		object->cache->ops->update_object(object);
+		fscache_stat_d(&fscache_n_cop_update_object);
 		goto active_transit;
 
 		/* handle an object dying during lookup or creation */
 	case FSCACHE_OBJECT_LC_DYING:
 		object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
+		fscache_stat(&fscache_n_cop_lookup_complete);
 		object->cache->ops->lookup_complete(object);
+		fscache_stat_d(&fscache_n_cop_lookup_complete);
 
 		spin_lock(&object->lock);
 		object->state = FSCACHE_OBJECT_DYING;
-		if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
-				       &object->cookie->flags))
-			wake_up_bit(&object->cookie->flags,
-				    FSCACHE_COOKIE_CREATING);
+		cookie = object->cookie;
+		if (cookie) {
+			if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
+					       &cookie->flags))
+				wake_up_bit(&cookie->flags,
+					    FSCACHE_COOKIE_LOOKING_UP);
+			if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
+					       &cookie->flags))
+				wake_up_bit(&cookie->flags,
+					    FSCACHE_COOKIE_CREATING);
+		}
 		spin_unlock(&object->lock);
 
 		fscache_done_parent_op(object);
@@ -165,6 +201,7 @@ static void fscache_object_state_machine(struct fscache_object *object)
 		}
 		spin_unlock(&object->lock);
 		fscache_enqueue_dependents(object);
+		fscache_start_operations(object);
 		goto terminal_transit;
 
 		/* handle an abort during initialisation */
@@ -316,14 +353,29 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
 
 	_enter("{OBJ%x}", object->debug_id);
 
-	clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-
 	start = jiffies;
 	fscache_object_state_machine(object);
 	fscache_hist(fscache_objs_histogram, start);
 	if (object->events & object->event_mask)
 		fscache_enqueue_object(object);
+	clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+}
+
+/*
+ * describe an object for slow-work debugging
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+static void fscache_object_slow_work_desc(struct slow_work *work,
+					  struct seq_file *m)
+{
+	struct fscache_object *object =
+		container_of(work, struct fscache_object, work);
+
+	seq_printf(m, "FSC: OBJ%x: %s",
+		   object->debug_id,
+		   fscache_object_states_short[object->state]);
 }
+#endif
 
 /*
  * initialise an object
@@ -376,7 +428,9 @@ static void fscache_initialise_object(struct fscache_object *object)
 			 * binding on to us, so we need to make sure we don't
 			 * add ourself to the list multiple times */
 			if (list_empty(&object->dep_link)) {
+				fscache_stat(&fscache_n_cop_grab_object);
 				object->cache->ops->grab_object(object);
+				fscache_stat_d(&fscache_n_cop_grab_object);
 				list_add(&object->dep_link,
 					 &parent->dependents);
 
@@ -414,6 +468,7 @@ static void fscache_lookup_object(struct fscache_object *object)
 {
 	struct fscache_cookie *cookie = object->cookie;
 	struct fscache_object *parent;
+	int ret;
 
 	_enter("");
 
@@ -438,11 +493,20 @@ static void fscache_lookup_object(struct fscache_object *object)
 	       object->cache->tag->name);
 
 	fscache_stat(&fscache_n_object_lookups);
-	object->cache->ops->lookup_object(object);
+	fscache_stat(&fscache_n_cop_lookup_object);
+	ret = object->cache->ops->lookup_object(object);
+	fscache_stat_d(&fscache_n_cop_lookup_object);
 
 	if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
 		set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
 
+	if (ret == -ETIMEDOUT) {
+		/* probably stuck behind another object, so move this one to
+		 * the back of the queue */
+		fscache_stat(&fscache_n_object_lookups_timed_out);
+		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+	}
+
 	_leave("");
 }
 
@@ -546,7 +610,8 @@ static void fscache_object_available(struct fscache_object *object)
 
 	spin_lock(&object->lock);
 
-	if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
+	if (object->cookie &&
+	    test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
 		wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
 
 	fscache_done_parent_op(object);
@@ -562,7 +627,9 @@ static void fscache_object_available(struct fscache_object *object)
 	}
 	spin_unlock(&object->lock);
 
+	fscache_stat(&fscache_n_cop_lookup_complete);
 	object->cache->ops->lookup_complete(object);
+	fscache_stat_d(&fscache_n_cop_lookup_complete);
 	fscache_enqueue_dependents(object);
 
 	fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
@@ -581,11 +648,16 @@ static void fscache_drop_object(struct fscache_object *object)
 
 	_enter("{OBJ%x,%d}", object->debug_id, object->n_children);
 
+	ASSERTCMP(object->cookie, ==, NULL);
+	ASSERT(hlist_unhashed(&object->cookie_link));
+
 	spin_lock(&cache->object_list_lock);
 	list_del_init(&object->cache_link);
 	spin_unlock(&cache->object_list_lock);
 
+	fscache_stat(&fscache_n_cop_drop_object);
 	cache->ops->drop_object(object);
+	fscache_stat_d(&fscache_n_cop_drop_object);
 
 	if (parent) {
 		_debug("release parent OBJ%x {%d}",
@@ -600,7 +672,9 @@ static void fscache_drop_object(struct fscache_object *object)
 	}
 
 	/* this just shifts the object release to the slow work processor */
+	fscache_stat(&fscache_n_cop_put_object);
 	object->cache->ops->put_object(object);
+	fscache_stat_d(&fscache_n_cop_put_object);
 
 	_leave("");
 }
@@ -690,8 +764,12 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work)
 {
 	struct fscache_object *object =
 		container_of(work, struct fscache_object, work);
+	int ret;
 
-	return object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
+	fscache_stat(&fscache_n_cop_grab_object);
+	ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
+	fscache_stat_d(&fscache_n_cop_grab_object);
+	return ret;
 }
 
 /*
@@ -702,7 +780,9 @@ static void fscache_object_slow_work_put_ref(struct slow_work *work)
 	struct fscache_object *object =
 		container_of(work, struct fscache_object, work);
 
-	return object->cache->ops->put_object(object);
+	fscache_stat(&fscache_n_cop_put_object);
+	object->cache->ops->put_object(object);
+	fscache_stat_d(&fscache_n_cop_put_object);
 }
 
 /*
@@ -739,7 +819,9 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
 
 		/* sort onto appropriate lists */
 		fscache_enqueue_object(dep);
+		fscache_stat(&fscache_n_cop_put_object);
 		dep->cache->ops->put_object(dep);
+		fscache_stat_d(&fscache_n_cop_put_object);
 
 		if (!list_empty(&object->dependents))
 			cond_resched_lock(&object->lock);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index e7f8d53b8b6..313e79a1426 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -13,6 +13,7 @@
 
 #define FSCACHE_DEBUG_LEVEL OPERATION
 #include <linux/module.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 
 atomic_t fscache_op_debug_id;
@@ -31,32 +32,33 @@ void fscache_enqueue_operation(struct fscache_operation *op)
 	_enter("{OBJ%x OP%x,%u}",
 	       op->object->debug_id, op->debug_id, atomic_read(&op->usage));
 
+	fscache_set_op_state(op, "EnQ");
+
+	ASSERT(list_empty(&op->pend_link));
 	ASSERT(op->processor != NULL);
 	ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
 	ASSERTCMP(atomic_read(&op->usage), >, 0);
 
-	if (list_empty(&op->pend_link)) {
-		switch (op->flags & FSCACHE_OP_TYPE) {
-		case FSCACHE_OP_FAST:
-			_debug("queue fast");
-			atomic_inc(&op->usage);
-			if (!schedule_work(&op->fast_work))
-				fscache_put_operation(op);
-			break;
-		case FSCACHE_OP_SLOW:
-			_debug("queue slow");
-			slow_work_enqueue(&op->slow_work);
-			break;
-		case FSCACHE_OP_MYTHREAD:
-			_debug("queue for caller's attention");
-			break;
-		default:
-			printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
-			       op->flags);
-			BUG();
-			break;
-		}
-		fscache_stat(&fscache_n_op_enqueue);
+	fscache_stat(&fscache_n_op_enqueue);
+	switch (op->flags & FSCACHE_OP_TYPE) {
+	case FSCACHE_OP_FAST:
+		_debug("queue fast");
+		atomic_inc(&op->usage);
+		if (!schedule_work(&op->fast_work))
+			fscache_put_operation(op);
+		break;
+	case FSCACHE_OP_SLOW:
+		_debug("queue slow");
+		slow_work_enqueue(&op->slow_work);
+		break;
+	case FSCACHE_OP_MYTHREAD:
+		_debug("queue for caller's attention");
+		break;
+	default:
+		printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
+		       op->flags);
+		BUG();
+		break;
 	}
 }
 EXPORT_SYMBOL(fscache_enqueue_operation);
@@ -67,6 +69,8 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
 static void fscache_run_op(struct fscache_object *object,
 			   struct fscache_operation *op)
 {
+	fscache_set_op_state(op, "Run");
+
 	object->n_in_progress++;
 	if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
 		wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -87,9 +91,12 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 
 	_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
 
+	fscache_set_op_state(op, "SubmitX");
+
 	spin_lock(&object->lock);
 	ASSERTCMP(object->n_ops, >=, object->n_in_progress);
 	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+	ASSERT(list_empty(&op->pend_link));
 
 	ret = -ENOBUFS;
 	if (fscache_object_is_active(object)) {
@@ -190,9 +197,12 @@ int fscache_submit_op(struct fscache_object *object,
 
 	ASSERTCMP(atomic_read(&op->usage), >, 0);
 
+	fscache_set_op_state(op, "Submit");
+
 	spin_lock(&object->lock);
 	ASSERTCMP(object->n_ops, >=, object->n_in_progress);
 	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+	ASSERT(list_empty(&op->pend_link));
 
 	ostate = object->state;
 	smp_rmb();
@@ -222,6 +232,11 @@ int fscache_submit_op(struct fscache_object *object,
 		list_add_tail(&op->pend_link, &object->pending_ops);
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
+	} else if (object->state == FSCACHE_OBJECT_DYING ||
+		   object->state == FSCACHE_OBJECT_LC_DYING ||
+		   object->state == FSCACHE_OBJECT_WITHDRAWING) {
+		fscache_stat(&fscache_n_op_rejected);
+		ret = -ENOBUFS;
 	} else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
 		fscache_report_unexpected_submission(object, op, ostate);
 		ASSERT(!fscache_object_is_active(object));
@@ -264,12 +279,7 @@ void fscache_start_operations(struct fscache_object *object)
 			stop = true;
 		}
 		list_del_init(&op->pend_link);
-		object->n_in_progress++;
-
-		if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
-			wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
-		if (op->processor)
-			fscache_enqueue_operation(op);
+		fscache_run_op(object, op);
 
 		/* the pending queue was holding a ref on the object */
 		fscache_put_operation(op);
@@ -282,6 +292,36 @@ void fscache_start_operations(struct fscache_object *object)
 }
 
 /*
+ * cancel an operation that's pending on an object
+ */
+int fscache_cancel_op(struct fscache_operation *op)
+{
+	struct fscache_object *object = op->object;
+	int ret;
+
+	_enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
+
+	spin_lock(&object->lock);
+
+	ret = -EBUSY;
+	if (!list_empty(&op->pend_link)) {
+		fscache_stat(&fscache_n_op_cancelled);
+		list_del_init(&op->pend_link);
+		object->n_ops--;
+		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+			object->n_exclusive--;
+		if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+			wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+		fscache_put_operation(op);
+		ret = 0;
+	}
+
+	spin_unlock(&object->lock);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
  * release an operation
  * - queues pending ops if this is the last in-progress op
  */
@@ -298,6 +338,8 @@ void fscache_put_operation(struct fscache_operation *op)
 	if (!atomic_dec_and_test(&op->usage))
 		return;
 
+	fscache_set_op_state(op, "Put");
+
 	_debug("PUT OP");
 	if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
 		BUG();
@@ -311,6 +353,9 @@ void fscache_put_operation(struct fscache_operation *op)
 
 	object = op->object;
 
+	if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+		atomic_dec(&object->n_reads);
+
 	/* now... we may get called with the object spinlock held, so we
 	 * complete the cleanup here only if we can immediately acquire the
 	 * lock, and defer it otherwise */
@@ -452,8 +497,27 @@ static void fscache_op_execute(struct slow_work *work)
 	_leave("");
 }
 
+/*
+ * describe an operation for slow-work debugging
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
+{
+	struct fscache_operation *op =
+		container_of(work, struct fscache_operation, slow_work);
+
+	seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx",
+		   op->object->debug_id, op->debug_id,
+		   op->name, op->state, op->flags);
+}
+#endif
+
 const struct slow_work_ops fscache_op_slow_work_ops = {
+	.owner		= THIS_MODULE,
 	.get_ref	= fscache_op_get_ref,
 	.put_ref	= fscache_op_put_ref,
 	.execute	= fscache_op_execute,
+#ifdef CONFIG_SLOW_WORK_PROC
+	.desc		= fscache_op_desc,
+#endif
 };
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 2568e0eb644..c598ea4c4e7 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -43,18 +43,102 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
 EXPORT_SYMBOL(__fscache_wait_on_page_write);
 
 /*
- * note that a page has finished being written to the cache
+ * decide whether a page can be released, possibly by cancelling a store to it
+ * - we're allowed to sleep if __GFP_WAIT is flagged
  */
-static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
+bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
+				  struct page *page,
+				  gfp_t gfp)
 {
 	struct page *xpage;
+	void *val;
+
+	_enter("%p,%p,%x", cookie, page, gfp);
+
+	rcu_read_lock();
+	val = radix_tree_lookup(&cookie->stores, page->index);
+	if (!val) {
+		rcu_read_unlock();
+		fscache_stat(&fscache_n_store_vmscan_not_storing);
+		__fscache_uncache_page(cookie, page);
+		return true;
+	}
+
+	/* see if the page is actually undergoing storage - if so we can't get
+	 * rid of it till the cache has finished with it */
+	if (radix_tree_tag_get(&cookie->stores, page->index,
+			       FSCACHE_COOKIE_STORING_TAG)) {
+		rcu_read_unlock();
+		goto page_busy;
+	}
+
+	/* the page is pending storage, so we attempt to cancel the store and
+	 * discard the store request so that the page can be reclaimed */
+	spin_lock(&cookie->stores_lock);
+	rcu_read_unlock();
+
+	if (radix_tree_tag_get(&cookie->stores, page->index,
+			       FSCACHE_COOKIE_STORING_TAG)) {
+		/* the page started to undergo storage whilst we were looking,
+		 * so now we can only wait or return */
+		spin_unlock(&cookie->stores_lock);
+		goto page_busy;
+	}
 
-	spin_lock(&cookie->lock);
 	xpage = radix_tree_delete(&cookie->stores, page->index);
-	spin_unlock(&cookie->lock);
-	ASSERT(xpage != NULL);
+	spin_unlock(&cookie->stores_lock);
+
+	if (xpage) {
+		fscache_stat(&fscache_n_store_vmscan_cancelled);
+		fscache_stat(&fscache_n_store_radix_deletes);
+		ASSERTCMP(xpage, ==, page);
+	} else {
+		fscache_stat(&fscache_n_store_vmscan_gone);
+	}
 
 	wake_up_bit(&cookie->flags, 0);
+	if (xpage)
+		page_cache_release(xpage);
+	__fscache_uncache_page(cookie, page);
+	return true;
+
+page_busy:
+	/* we might want to wait here, but that could deadlock the allocator as
+	 * the slow-work threads writing to the cache may all end up sleeping
+	 * on memory allocation */
+	fscache_stat(&fscache_n_store_vmscan_busy);
+	return false;
+}
+EXPORT_SYMBOL(__fscache_maybe_release_page);
+
+/*
+ * note that a page has finished being written to the cache
+ */
+static void fscache_end_page_write(struct fscache_object *object,
+				   struct page *page)
+{
+	struct fscache_cookie *cookie;
+	struct page *xpage = NULL;
+
+	spin_lock(&object->lock);
+	cookie = object->cookie;
+	if (cookie) {
+		/* delete the page from the tree if it is now no longer
+		 * pending */
+		spin_lock(&cookie->stores_lock);
+		radix_tree_tag_clear(&cookie->stores, page->index,
+				     FSCACHE_COOKIE_STORING_TAG);
+		if (!radix_tree_tag_get(&cookie->stores, page->index,
+					FSCACHE_COOKIE_PENDING_TAG)) {
+			fscache_stat(&fscache_n_store_radix_deletes);
+			xpage = radix_tree_delete(&cookie->stores, page->index);
+		}
+		spin_unlock(&cookie->stores_lock);
+		wake_up_bit(&cookie->flags, 0);
+	}
+	spin_unlock(&object->lock);
+	if (xpage)
+		page_cache_release(xpage);
 }
 
 /*
@@ -63,14 +147,21 @@ static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *p
 static void fscache_attr_changed_op(struct fscache_operation *op)
 {
 	struct fscache_object *object = op->object;
+	int ret;
 
 	_enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
 
 	fscache_stat(&fscache_n_attr_changed_calls);
 
-	if (fscache_object_is_active(object) &&
-	    object->cache->ops->attr_changed(object) < 0)
-		fscache_abort_object(object);
+	if (fscache_object_is_active(object)) {
+		fscache_set_op_state(op, "CallFS");
+		fscache_stat(&fscache_n_cop_attr_changed);
+		ret = object->cache->ops->attr_changed(object);
+		fscache_stat_d(&fscache_n_cop_attr_changed);
+		fscache_set_op_state(op, "Done");
+		if (ret < 0)
+			fscache_abort_object(object);
+	}
 
 	_leave("");
 }
@@ -99,6 +190,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 	fscache_operation_init(op, NULL);
 	fscache_operation_init_slow(op, fscache_attr_changed_op);
 	op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
+	fscache_set_op_name(op, "Attr");
 
 	spin_lock(&cookie->lock);
 
@@ -184,6 +276,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 	op->start_time	= jiffies;
 	INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
 	INIT_LIST_HEAD(&op->to_do);
+	fscache_set_op_name(&op->op, "Retr");
 	return op;
 }
 
@@ -221,6 +314,43 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
 }
 
 /*
+ * wait for an object to become active (or dead)
+ */
+static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
+						 struct fscache_retrieval *op,
+						 atomic_t *stat_op_waits,
+						 atomic_t *stat_object_dead)
+{
+	int ret;
+
+	if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags))
+		goto check_if_dead;
+
+	_debug(">>> WT");
+	fscache_stat(stat_op_waits);
+	if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+			fscache_wait_bit_interruptible,
+			TASK_INTERRUPTIBLE) < 0) {
+		ret = fscache_cancel_op(&op->op);
+		if (ret == 0)
+			return -ERESTARTSYS;
+
+		/* it's been removed from the pending queue by another party,
+		 * so we should get to run shortly */
+		wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+	}
+	_debug("<<< GO");
+
+check_if_dead:
+	if (unlikely(fscache_object_is_dead(object))) {
+		fscache_stat(stat_object_dead);
+		return -ENOBUFS;
+	}
+	return 0;
+}
+
+/*
  * read a page from the cache or allocate a block in which to store it
  * - we return:
  *   -ENOMEM	- out of memory, nothing done
@@ -257,6 +387,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 		_leave(" = -ENOMEM");
 		return -ENOMEM;
 	}
+	fscache_set_op_name(&op->op, "RetrRA1");
 
 	spin_lock(&cookie->lock);
 
@@ -267,6 +398,9 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 
 	ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
 
+	atomic_inc(&object->n_reads);
+	set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+
 	if (fscache_submit_op(object, &op->op) < 0)
 		goto nobufs_unlock;
 	spin_unlock(&cookie->lock);
@@ -279,23 +413,27 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 
 	/* we wait for the operation to become active, and then process it
 	 * *here*, in this thread, and not in the thread pool */
-	if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
-		_debug(">>> WT");
-		fscache_stat(&fscache_n_retrieval_op_waits);
-		wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
-			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-		_debug("<<< GO");
-	}
+	ret = fscache_wait_for_retrieval_activation(
+		object, op,
+		__fscache_stat(&fscache_n_retrieval_op_waits),
+		__fscache_stat(&fscache_n_retrievals_object_dead));
+	if (ret < 0)
+		goto error;
 
 	/* ask the cache to honour the operation */
 	if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
+		fscache_stat(&fscache_n_cop_allocate_page);
 		ret = object->cache->ops->allocate_page(op, page, gfp);
+		fscache_stat_d(&fscache_n_cop_allocate_page);
 		if (ret == 0)
 			ret = -ENODATA;
 	} else {
+		fscache_stat(&fscache_n_cop_read_or_alloc_page);
 		ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
+		fscache_stat_d(&fscache_n_cop_read_or_alloc_page);
 	}
 
+error:
 	if (ret == -ENOMEM)
 		fscache_stat(&fscache_n_retrievals_nomem);
 	else if (ret == -ERESTARTSYS)
@@ -347,7 +485,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 				  void *context,
 				  gfp_t gfp)
 {
-	fscache_pages_retrieval_func_t func;
 	struct fscache_retrieval *op;
 	struct fscache_object *object;
 	int ret;
@@ -369,6 +506,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 	op = fscache_alloc_retrieval(mapping, end_io_func, context);
 	if (!op)
 		return -ENOMEM;
+	fscache_set_op_name(&op->op, "RetrRAN");
 
 	spin_lock(&cookie->lock);
 
@@ -377,6 +515,9 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
+	atomic_inc(&object->n_reads);
+	set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+
 	if (fscache_submit_op(object, &op->op) < 0)
 		goto nobufs_unlock;
 	spin_unlock(&cookie->lock);
@@ -389,21 +530,27 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 
 	/* we wait for the operation to become active, and then process it
 	 * *here*, in this thread, and not in the thread pool */
-	if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
-		_debug(">>> WT");
-		fscache_stat(&fscache_n_retrieval_op_waits);
-		wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
-			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-		_debug("<<< GO");
-	}
+	ret = fscache_wait_for_retrieval_activation(
+		object, op,
+		__fscache_stat(&fscache_n_retrieval_op_waits),
+		__fscache_stat(&fscache_n_retrievals_object_dead));
+	if (ret < 0)
+		goto error;
 
 	/* ask the cache to honour the operation */
-	if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags))
-		func = object->cache->ops->allocate_pages;
-	else
-		func = object->cache->ops->read_or_alloc_pages;
-	ret = func(op, pages, nr_pages, gfp);
+	if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
+		fscache_stat(&fscache_n_cop_allocate_pages);
+		ret = object->cache->ops->allocate_pages(
+			op, pages, nr_pages, gfp);
+		fscache_stat_d(&fscache_n_cop_allocate_pages);
+	} else {
+		fscache_stat(&fscache_n_cop_read_or_alloc_pages);
+		ret = object->cache->ops->read_or_alloc_pages(
+			op, pages, nr_pages, gfp);
+		fscache_stat_d(&fscache_n_cop_read_or_alloc_pages);
+	}
 
+error:
 	if (ret == -ENOMEM)
 		fscache_stat(&fscache_n_retrievals_nomem);
 	else if (ret == -ERESTARTSYS)
@@ -461,6 +608,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 	op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
 	if (!op)
 		return -ENOMEM;
+	fscache_set_op_name(&op->op, "RetrAL1");
 
 	spin_lock(&cookie->lock);
 
@@ -475,18 +623,22 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 
 	fscache_stat(&fscache_n_alloc_ops);
 
-	if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
-		_debug(">>> WT");
-		fscache_stat(&fscache_n_alloc_op_waits);
-		wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
-			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-		_debug("<<< GO");
-	}
+	ret = fscache_wait_for_retrieval_activation(
+		object, op,
+		__fscache_stat(&fscache_n_alloc_op_waits),
+		__fscache_stat(&fscache_n_allocs_object_dead));
+	if (ret < 0)
+		goto error;
 
 	/* ask the cache to honour the operation */
+	fscache_stat(&fscache_n_cop_allocate_page);
 	ret = object->cache->ops->allocate_page(op, page, gfp);
+	fscache_stat_d(&fscache_n_cop_allocate_page);
 
-	if (ret < 0)
+error:
+	if (ret == -ERESTARTSYS)
+		fscache_stat(&fscache_n_allocs_intr);
+	else if (ret < 0)
 		fscache_stat(&fscache_n_allocs_nobufs);
 	else
 		fscache_stat(&fscache_n_allocs_ok);
@@ -521,7 +673,7 @@ static void fscache_write_op(struct fscache_operation *_op)
 	struct fscache_storage *op =
 		container_of(_op, struct fscache_storage, op);
 	struct fscache_object *object = op->op.object;
-	struct fscache_cookie *cookie = object->cookie;
+	struct fscache_cookie *cookie;
 	struct page *page;
 	unsigned n;
 	void *results[1];
@@ -529,16 +681,19 @@ static void fscache_write_op(struct fscache_operation *_op)
 
 	_enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
 
-	spin_lock(&cookie->lock);
+	fscache_set_op_state(&op->op, "GetPage");
+
 	spin_lock(&object->lock);
+	cookie = object->cookie;
 
-	if (!fscache_object_is_active(object)) {
+	if (!fscache_object_is_active(object) || !cookie) {
 		spin_unlock(&object->lock);
-		spin_unlock(&cookie->lock);
 		_leave("");
 		return;
 	}
 
+	spin_lock(&cookie->stores_lock);
+
 	fscache_stat(&fscache_n_store_calls);
 
 	/* find a page to store */
@@ -549,23 +704,35 @@ static void fscache_write_op(struct fscache_operation *_op)
 		goto superseded;
 	page = results[0];
 	_debug("gang %d [%lx]", n, page->index);
-	if (page->index > op->store_limit)
+	if (page->index > op->store_limit) {
+		fscache_stat(&fscache_n_store_pages_over_limit);
 		goto superseded;
+	}
 
-	radix_tree_tag_clear(&cookie->stores, page->index,
-			     FSCACHE_COOKIE_PENDING_TAG);
+	if (page) {
+		radix_tree_tag_set(&cookie->stores, page->index,
+				   FSCACHE_COOKIE_STORING_TAG);
+		radix_tree_tag_clear(&cookie->stores, page->index,
+				     FSCACHE_COOKIE_PENDING_TAG);
+	}
 
+	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
-	spin_unlock(&cookie->lock);
 
 	if (page) {
+		fscache_set_op_state(&op->op, "Store");
+		fscache_stat(&fscache_n_store_pages);
+		fscache_stat(&fscache_n_cop_write_page);
 		ret = object->cache->ops->write_page(op, page);
-		fscache_end_page_write(cookie, page);
-		page_cache_release(page);
-		if (ret < 0)
+		fscache_stat_d(&fscache_n_cop_write_page);
+		fscache_set_op_state(&op->op, "EndWrite");
+		fscache_end_page_write(object, page);
+		if (ret < 0) {
+			fscache_set_op_state(&op->op, "Abort");
 			fscache_abort_object(object);
-		else
+		} else {
 			fscache_enqueue_operation(&op->op);
+		}
 	}
 
 	_leave("");
@@ -575,9 +742,9 @@ superseded:
 	/* this writer is going away and there aren't any more things to
 	 * write */
 	_debug("cease");
+	spin_unlock(&cookie->stores_lock);
 	clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
 	spin_unlock(&object->lock);
-	spin_unlock(&cookie->lock);
 	_leave("");
 }
 
@@ -634,6 +801,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	fscache_operation_init(&op->op, fscache_release_write_op);
 	fscache_operation_init_slow(&op->op, fscache_write_op);
 	op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
+	fscache_set_op_name(&op->op, "Write1");
 
 	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
 	if (ret < 0)
@@ -652,6 +820,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	/* add the page to the pending-storage radix tree on the backing
 	 * object */
 	spin_lock(&object->lock);
+	spin_lock(&cookie->stores_lock);
 
 	_debug("store limit %llx", (unsigned long long) object->store_limit);
 
@@ -672,6 +841,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
 		goto already_pending;
 
+	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 
 	op->op.debug_id	= atomic_inc_return(&fscache_op_debug_id);
@@ -693,6 +863,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 already_queued:
 	fscache_stat(&fscache_n_stores_again);
 already_pending:
+	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 	spin_unlock(&cookie->lock);
 	radix_tree_preload_end();
@@ -702,7 +873,9 @@ already_pending:
 	return 0;
 
 submit_failed:
+	spin_lock(&cookie->stores_lock);
 	radix_tree_delete(&cookie->stores, page->index);
+	spin_unlock(&cookie->stores_lock);
 	page_cache_release(page);
 	ret = -ENOBUFS;
 	goto nobufs;
@@ -763,7 +936,9 @@ void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
 	if (TestClearPageFsCache(page) &&
 	    object->cache->ops->uncache_page) {
 		/* the cache backend releases the cookie lock */
+		fscache_stat(&fscache_n_cop_uncache_page);
 		object->cache->ops->uncache_page(object, page);
+		fscache_stat_d(&fscache_n_cop_uncache_page);
 		goto done;
 	}
 
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
index beeab44bc31..1d9e4951a59 100644
--- a/fs/fscache/proc.c
+++ b/fs/fscache/proc.c
@@ -37,10 +37,20 @@ int __init fscache_proc_init(void)
 		goto error_histogram;
 #endif
 
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+	if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL,
+			 &fscache_objlist_fops))
+		goto error_objects;
+#endif
+
 	_leave(" = 0");
 	return 0;
 
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+error_objects:
+#endif
 #ifdef CONFIG_FSCACHE_HISTOGRAM
+	remove_proc_entry("fs/fscache/histogram", NULL);
 error_histogram:
 #endif
 #ifdef CONFIG_FSCACHE_STATS
@@ -58,6 +68,9 @@ error_dir:
  */
 void fscache_proc_cleanup(void)
 {
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+	remove_proc_entry("fs/fscache/objects", NULL);
+#endif
 #ifdef CONFIG_FSCACHE_HISTOGRAM
 	remove_proc_entry("fs/fscache/histogram", NULL);
 #endif
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 65deb99e756..46435f3aae6 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -25,6 +25,8 @@ atomic_t fscache_n_op_requeue;
 atomic_t fscache_n_op_deferred_release;
 atomic_t fscache_n_op_release;
 atomic_t fscache_n_op_gc;
+atomic_t fscache_n_op_cancelled;
+atomic_t fscache_n_op_rejected;
 
 atomic_t fscache_n_attr_changed;
 atomic_t fscache_n_attr_changed_ok;
@@ -36,6 +38,8 @@ atomic_t fscache_n_allocs;
 atomic_t fscache_n_allocs_ok;
 atomic_t fscache_n_allocs_wait;
 atomic_t fscache_n_allocs_nobufs;
+atomic_t fscache_n_allocs_intr;
+atomic_t fscache_n_allocs_object_dead;
 atomic_t fscache_n_alloc_ops;
 atomic_t fscache_n_alloc_op_waits;
 
@@ -46,6 +50,7 @@ atomic_t fscache_n_retrievals_nodata;
 atomic_t fscache_n_retrievals_nobufs;
 atomic_t fscache_n_retrievals_intr;
 atomic_t fscache_n_retrievals_nomem;
+atomic_t fscache_n_retrievals_object_dead;
 atomic_t fscache_n_retrieval_ops;
 atomic_t fscache_n_retrieval_op_waits;
 
@@ -56,6 +61,14 @@ atomic_t fscache_n_stores_nobufs;
 atomic_t fscache_n_stores_oom;
 atomic_t fscache_n_store_ops;
 atomic_t fscache_n_store_calls;
+atomic_t fscache_n_store_pages;
+atomic_t fscache_n_store_radix_deletes;
+atomic_t fscache_n_store_pages_over_limit;
+
+atomic_t fscache_n_store_vmscan_not_storing;
+atomic_t fscache_n_store_vmscan_gone;
+atomic_t fscache_n_store_vmscan_busy;
+atomic_t fscache_n_store_vmscan_cancelled;
 
 atomic_t fscache_n_marks;
 atomic_t fscache_n_uncaches;
@@ -74,6 +87,7 @@ atomic_t fscache_n_updates_run;
 atomic_t fscache_n_relinquishes;
 atomic_t fscache_n_relinquishes_null;
 atomic_t fscache_n_relinquishes_waitcrt;
+atomic_t fscache_n_relinquishes_retire;
 
 atomic_t fscache_n_cookie_index;
 atomic_t fscache_n_cookie_data;
@@ -84,6 +98,7 @@ atomic_t fscache_n_object_no_alloc;
 atomic_t fscache_n_object_lookups;
 atomic_t fscache_n_object_lookups_negative;
 atomic_t fscache_n_object_lookups_positive;
+atomic_t fscache_n_object_lookups_timed_out;
 atomic_t fscache_n_object_created;
 atomic_t fscache_n_object_avail;
 atomic_t fscache_n_object_dead;
@@ -93,6 +108,23 @@ atomic_t fscache_n_checkaux_okay;
 atomic_t fscache_n_checkaux_update;
 atomic_t fscache_n_checkaux_obsolete;
 
+atomic_t fscache_n_cop_alloc_object;
+atomic_t fscache_n_cop_lookup_object;
+atomic_t fscache_n_cop_lookup_complete;
+atomic_t fscache_n_cop_grab_object;
+atomic_t fscache_n_cop_update_object;
+atomic_t fscache_n_cop_drop_object;
+atomic_t fscache_n_cop_put_object;
+atomic_t fscache_n_cop_sync_cache;
+atomic_t fscache_n_cop_attr_changed;
+atomic_t fscache_n_cop_read_or_alloc_page;
+atomic_t fscache_n_cop_read_or_alloc_pages;
+atomic_t fscache_n_cop_allocate_page;
+atomic_t fscache_n_cop_allocate_pages;
+atomic_t fscache_n_cop_write_page;
+atomic_t fscache_n_cop_uncache_page;
+atomic_t fscache_n_cop_dissociate_pages;
+
 /*
  * display the general statistics
  */
@@ -129,10 +161,11 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_acquires_nobufs),
 		   atomic_read(&fscache_n_acquires_oom));
 
-	seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
+	seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n",
 		   atomic_read(&fscache_n_object_lookups),
 		   atomic_read(&fscache_n_object_lookups_negative),
 		   atomic_read(&fscache_n_object_lookups_positive),
+		   atomic_read(&fscache_n_object_lookups_timed_out),
 		   atomic_read(&fscache_n_object_created));
 
 	seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
@@ -140,10 +173,11 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_updates_null),
 		   atomic_read(&fscache_n_updates_run));
 
-	seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n",
+	seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n",
 		   atomic_read(&fscache_n_relinquishes),
 		   atomic_read(&fscache_n_relinquishes_null),
-		   atomic_read(&fscache_n_relinquishes_waitcrt));
+		   atomic_read(&fscache_n_relinquishes_waitcrt),
+		   atomic_read(&fscache_n_relinquishes_retire));
 
 	seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
 		   atomic_read(&fscache_n_attr_changed),
@@ -152,14 +186,16 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_attr_changed_nomem),
 		   atomic_read(&fscache_n_attr_changed_calls));
 
-	seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n",
+	seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n",
 		   atomic_read(&fscache_n_allocs),
 		   atomic_read(&fscache_n_allocs_ok),
 		   atomic_read(&fscache_n_allocs_wait),
-		   atomic_read(&fscache_n_allocs_nobufs));
-	seq_printf(m, "Allocs : ops=%u owt=%u\n",
+		   atomic_read(&fscache_n_allocs_nobufs),
+		   atomic_read(&fscache_n_allocs_intr));
+	seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n",
 		   atomic_read(&fscache_n_alloc_ops),
-		   atomic_read(&fscache_n_alloc_op_waits));
+		   atomic_read(&fscache_n_alloc_op_waits),
+		   atomic_read(&fscache_n_allocs_object_dead));
 
 	seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
 		   " int=%u oom=%u\n",
@@ -170,9 +206,10 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_retrievals_nobufs),
 		   atomic_read(&fscache_n_retrievals_intr),
 		   atomic_read(&fscache_n_retrievals_nomem));
-	seq_printf(m, "Retrvls: ops=%u owt=%u\n",
+	seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n",
 		   atomic_read(&fscache_n_retrieval_ops),
-		   atomic_read(&fscache_n_retrieval_op_waits));
+		   atomic_read(&fscache_n_retrieval_op_waits),
+		   atomic_read(&fscache_n_retrievals_object_dead));
 
 	seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
 		   atomic_read(&fscache_n_stores),
@@ -180,18 +217,49 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_stores_again),
 		   atomic_read(&fscache_n_stores_nobufs),
 		   atomic_read(&fscache_n_stores_oom));
-	seq_printf(m, "Stores : ops=%u run=%u\n",
+	seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n",
 		   atomic_read(&fscache_n_store_ops),
-		   atomic_read(&fscache_n_store_calls));
+		   atomic_read(&fscache_n_store_calls),
+		   atomic_read(&fscache_n_store_pages),
+		   atomic_read(&fscache_n_store_radix_deletes),
+		   atomic_read(&fscache_n_store_pages_over_limit));
 
-	seq_printf(m, "Ops    : pend=%u run=%u enq=%u\n",
+	seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n",
+		   atomic_read(&fscache_n_store_vmscan_not_storing),
+		   atomic_read(&fscache_n_store_vmscan_gone),
+		   atomic_read(&fscache_n_store_vmscan_busy),
+		   atomic_read(&fscache_n_store_vmscan_cancelled));
+
+	seq_printf(m, "Ops    : pend=%u run=%u enq=%u can=%u rej=%u\n",
 		   atomic_read(&fscache_n_op_pend),
 		   atomic_read(&fscache_n_op_run),
-		   atomic_read(&fscache_n_op_enqueue));
+		   atomic_read(&fscache_n_op_enqueue),
+		   atomic_read(&fscache_n_op_cancelled),
+		   atomic_read(&fscache_n_op_rejected));
 	seq_printf(m, "Ops    : dfr=%u rel=%u gc=%u\n",
 		   atomic_read(&fscache_n_op_deferred_release),
 		   atomic_read(&fscache_n_op_release),
 		   atomic_read(&fscache_n_op_gc));
+
+	seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n",
+		   atomic_read(&fscache_n_cop_alloc_object),
+		   atomic_read(&fscache_n_cop_lookup_object),
+		   atomic_read(&fscache_n_cop_lookup_complete),
+		   atomic_read(&fscache_n_cop_grab_object));
+	seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n",
+		   atomic_read(&fscache_n_cop_update_object),
+		   atomic_read(&fscache_n_cop_drop_object),
+		   atomic_read(&fscache_n_cop_put_object),
+		   atomic_read(&fscache_n_cop_attr_changed),
+		   atomic_read(&fscache_n_cop_sync_cache));
+	seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n",
+		   atomic_read(&fscache_n_cop_read_or_alloc_page),
+		   atomic_read(&fscache_n_cop_read_or_alloc_pages),
+		   atomic_read(&fscache_n_cop_allocate_page),
+		   atomic_read(&fscache_n_cop_allocate_pages),
+		   atomic_read(&fscache_n_cop_write_page),
+		   atomic_read(&fscache_n_cop_uncache_page),
+		   atomic_read(&fscache_n_cop_dissociate_pages));
 	return 0;
 }
 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8ada78aade5..4787ae6c5c1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -385,6 +385,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (fc->no_create)
 		return -ENOSYS;
 
+	if (flags & O_DIRECT)
+		return -EINVAL;
+
 	forget_req = fuse_get_req(fc);
 	if (IS_ERR(forget_req))
 		return PTR_ERR(forget_req);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 5971359d209..b192c661caa 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,6 +8,9 @@ config GFS2_FS
 	select FS_POSIX_ACL
 	select CRC32
 	select SLOW_WORK
+	select QUOTA
+	select QUOTACTL
+	select FS_JOURNAL_INFO
 	help
 	  A cluster filesystem.
 
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3fc4e3ac7d8..3eb1ea84617 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -12,6 +12,7 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
+#include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/gfs2_ondisk.h>
@@ -26,108 +27,44 @@
 #include "trans.h"
 #include "util.h"
 
-#define ACL_ACCESS 1
-#define ACL_DEFAULT 0
-
-int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
-			  struct gfs2_ea_request *er, int *remove, mode_t *mode)
+static const char *gfs2_acl_name(int type)
 {
-	struct posix_acl *acl;
-	int error;
-
-	error = gfs2_acl_validate_remove(ip, access);
-	if (error)
-		return error;
-
-	if (!er->er_data)
-		return -EINVAL;
-
-	acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (!acl) {
-		*remove = 1;
-		return 0;
-	}
-
-	error = posix_acl_valid(acl);
-	if (error)
-		goto out;
-
-	if (access) {
-		error = posix_acl_equiv_mode(acl, mode);
-		if (!error)
-			*remove = 1;
-		else if (error > 0)
-			error = 0;
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		return GFS2_POSIX_ACL_ACCESS;
+	case ACL_TYPE_DEFAULT:
+		return GFS2_POSIX_ACL_DEFAULT;
 	}
-
-out:
-	posix_acl_release(acl);
-	return error;
-}
-
-int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
-{
-	if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
-		return -EOPNOTSUPP;
-	if (!is_owner_or_cap(&ip->i_inode))
-		return -EPERM;
-	if (S_ISLNK(ip->i_inode.i_mode))
-		return -EOPNOTSUPP;
-	if (!access && !S_ISDIR(ip->i_inode.i_mode))
-		return -EACCES;
-
-	return 0;
+	return NULL;
 }
 
-static int acl_get(struct gfs2_inode *ip, const char *name,
-		   struct posix_acl **acl, struct gfs2_ea_location *el,
-		   char **datap, unsigned int *lenp)
+static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
 {
+	struct posix_acl *acl;
+	const char *name;
 	char *data;
-	unsigned int len;
-	int error;
-
-	el->el_bh = NULL;
+	int len;
 
 	if (!ip->i_eattr)
-		return 0;
-
-	error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
-	if (error)
-		return error;
-	if (!el->el_ea)
-		return 0;
-	if (!GFS2_EA_DATA_LEN(el->el_ea))
-		goto out;
+		return NULL;
 
-	len = GFS2_EA_DATA_LEN(el->el_ea);
-	data = kmalloc(len, GFP_NOFS);
-	error = -ENOMEM;
-	if (!data)
-		goto out;
+	acl = get_cached_acl(&ip->i_inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
 
-	error = gfs2_ea_get_copy(ip, el, data, len);
-	if (error < 0)
-		goto out_kfree;
-	error = 0;
+	name = gfs2_acl_name(type);
+	if (name == NULL)
+		return ERR_PTR(-EINVAL);
 
-	if (acl) {
-		*acl = posix_acl_from_xattr(data, len);
-		if (IS_ERR(*acl))
-			error = PTR_ERR(*acl);
-	}
+	len = gfs2_xattr_acl_get(ip, name, &data);
+	if (len < 0)
+		return ERR_PTR(len);
+	if (len == 0)
+		return NULL;
 
-out_kfree:
-	if (error || !datap) {
-		kfree(data);
-	} else {
-		*datap = data;
-		*lenp = len;
-	}
-out:
-	return error;
+	acl = posix_acl_from_xattr(data, len);
+	kfree(data);
+	return acl;
 }
 
 /**
@@ -140,14 +77,12 @@ out:
 
 int gfs2_check_acl(struct inode *inode, int mask)
 {
-	struct gfs2_ea_location el;
-	struct posix_acl *acl = NULL;
+	struct posix_acl *acl;
 	int error;
 
-	error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL);
-	brelse(el.el_bh);
-	if (error)
-		return error;
+	acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
 
 	if (acl) {
 		error = posix_acl_permission(inode, acl, mask);
@@ -158,57 +93,75 @@ int gfs2_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-static int munge_mode(struct gfs2_inode *ip, mode_t mode)
+static int gfs2_set_mode(struct inode *inode, mode_t mode)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head *dibh;
-	int error;
+	int error = 0;
 
-	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
-	if (error)
-		return error;
+	if (mode != inode->i_mode) {
+		struct iattr iattr;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		gfs2_assert_withdraw(sdp,
-				(ip->i_inode.i_mode & S_IFMT) == (mode & S_IFMT));
-		ip->i_inode.i_mode = mode;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
+		iattr.ia_valid = ATTR_MODE;
+		iattr.ia_mode = mode;
+
+		error = gfs2_setattr_simple(GFS2_I(inode), &iattr);
 	}
 
-	gfs2_trans_end(sdp);
+	return error;
+}
+
+static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
+{
+	int error;
+	int len;
+	char *data;
+	const char *name = gfs2_acl_name(type);
 
-	return 0;
+	BUG_ON(name == NULL);
+	len = posix_acl_to_xattr(acl, NULL, 0);
+	if (len == 0)
+		return 0;
+	data = kmalloc(len, GFP_NOFS);
+	if (data == NULL)
+		return -ENOMEM;
+	error = posix_acl_to_xattr(acl, data, len);
+	if (error < 0)
+		goto out;
+	error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, data, len, 0);
+	if (!error)
+		set_cached_acl(inode, type, acl);
+out:
+	kfree(data);
+	return error;
 }
 
-int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
+int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
 {
-	struct gfs2_ea_location el;
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
-	struct posix_acl *acl = NULL, *clone;
-	mode_t mode = ip->i_inode.i_mode;
-	char *data = NULL;
-	unsigned int len;
-	int error;
+	struct posix_acl *acl, *clone;
+	mode_t mode = inode->i_mode;
+	int error = 0;
 
 	if (!sdp->sd_args.ar_posix_acl)
 		return 0;
-	if (S_ISLNK(ip->i_inode.i_mode))
+	if (S_ISLNK(inode->i_mode))
 		return 0;
 
-	error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len);
-	brelse(el.el_bh);
-	if (error)
-		return error;
+	acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
 	if (!acl) {
 		mode &= ~current_umask();
-		if (mode != ip->i_inode.i_mode)
-			error = munge_mode(ip, mode);
+		if (mode != inode->i_mode)
+			error = gfs2_set_mode(inode, mode);
 		return error;
 	}
 
+	if (S_ISDIR(inode->i_mode)) {
+		error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl);
+		if (error)
+			goto out;
+	}
+
 	clone = posix_acl_clone(acl, GFP_NOFS);
 	error = -ENOMEM;
 	if (!clone)
@@ -216,43 +169,32 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	posix_acl_release(acl);
 	acl = clone;
 
-	if (S_ISDIR(ip->i_inode.i_mode)) {
-		error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
-				       GFS2_POSIX_ACL_DEFAULT, data, len, 0);
-		if (error)
-			goto out;
-	}
-
 	error = posix_acl_create_masq(acl, &mode);
 	if (error < 0)
 		goto out;
 	if (error == 0)
 		goto munge;
 
-	posix_acl_to_xattr(acl, data, len);
-	error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
-			       GFS2_POSIX_ACL_ACCESS, data, len, 0);
+	error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl);
 	if (error)
 		goto out;
 munge:
-	error = munge_mode(ip, mode);
+	error = gfs2_set_mode(inode, mode);
 out:
 	posix_acl_release(acl);
-	kfree(data);
 	return error;
 }
 
 int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
 {
-	struct posix_acl *acl = NULL, *clone;
-	struct gfs2_ea_location el;
+	struct posix_acl *acl, *clone;
 	char *data;
 	unsigned int len;
 	int error;
 
-	error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len);
-	if (error)
-		goto out_brelse;
+	acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
 	if (!acl)
 		return gfs2_setattr_simple(ip, attr);
 
@@ -265,15 +207,134 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
 
 	error = posix_acl_chmod_masq(acl, attr->ia_mode);
 	if (!error) {
+		len = posix_acl_to_xattr(acl, NULL, 0);
+		data = kmalloc(len, GFP_NOFS);
+		error = -ENOMEM;
+		if (data == NULL)
+			goto out;
 		posix_acl_to_xattr(acl, data, len);
-		error = gfs2_ea_acl_chmod(ip, &el, attr, data);
+		error = gfs2_xattr_acl_chmod(ip, attr, data);
+		kfree(data);
+		set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl);
 	}
 
 out:
 	posix_acl_release(acl);
-	kfree(data);
-out_brelse:
-	brelse(el.el_bh);
 	return error;
 }
 
+static int gfs2_acl_type(const char *name)
+{
+	if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0)
+		return ACL_TYPE_ACCESS;
+	if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0)
+		return ACL_TYPE_DEFAULT;
+	return -EINVAL;
+}
+
+static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+				 void *buffer, size_t size)
+{
+	struct posix_acl *acl;
+	int type;
+	int error;
+
+	type = gfs2_acl_type(name);
+	if (type < 0)
+		return type;
+
+	acl = gfs2_acl_get(GFS2_I(inode), type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+
+	error = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+				 const void *value, size_t size, int flags)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct posix_acl *acl = NULL;
+	int error = 0, type;
+
+	if (!sdp->sd_args.ar_posix_acl)
+		return -EOPNOTSUPP;
+
+	type = gfs2_acl_type(name);
+	if (type < 0)
+		return type;
+	if (flags & XATTR_CREATE)
+		return -EINVAL;
+	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+		return value ? -EACCES : 0;
+	if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+		return -EPERM;
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	if (!value)
+		goto set_acl;
+
+	acl = posix_acl_from_xattr(value, size);
+	if (!acl) {
+		/*
+		 * acl_set_file(3) may request that we set default ACLs with
+		 * zero length -- defend (gracefully) against that here.
+		 */
+		goto out;
+	}
+	if (IS_ERR(acl)) {
+		error = PTR_ERR(acl);
+		goto out;
+	}
+
+	error = posix_acl_valid(acl);
+	if (error)
+		goto out_release;
+
+	error = -EINVAL;
+	if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
+		goto out_release;
+
+	if (type == ACL_TYPE_ACCESS) {
+		mode_t mode = inode->i_mode;
+		error = posix_acl_equiv_mode(acl, &mode);
+
+		if (error <= 0) {
+			posix_acl_release(acl);
+			acl = NULL;
+
+			if (error < 0)
+				return error;
+		}
+
+		error = gfs2_set_mode(inode, mode);
+		if (error)
+			goto out_release;
+	}
+
+set_acl:
+	error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, 0);
+	if (!error) {
+		if (acl)
+			set_cached_acl(inode, type, acl);
+		else
+			forget_cached_acl(inode, type);
+	}
+out_release:
+	posix_acl_release(acl);
+out:
+	return error;
+}
+
+struct xattr_handler gfs2_xattr_system_handler = {
+	.prefix = XATTR_SYSTEM_PREFIX,
+	.get    = gfs2_xattr_system_get,
+	.set    = gfs2_xattr_system_set,
+};
+
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 6751930bfb6..9306a2e6620 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,26 +13,12 @@
 #include "incore.h"
 
 #define GFS2_POSIX_ACL_ACCESS		"posix_acl_access"
-#define GFS2_POSIX_ACL_ACCESS_LEN	16
 #define GFS2_POSIX_ACL_DEFAULT		"posix_acl_default"
-#define GFS2_POSIX_ACL_DEFAULT_LEN	17
+#define GFS2_ACL_MAX_ENTRIES		25
 
-#define GFS2_ACL_IS_ACCESS(name, len) \
-         ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
-         !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
-
-#define GFS2_ACL_IS_DEFAULT(name, len) \
-         ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
-         !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
-
-struct gfs2_ea_request;
-
-int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
-			  struct gfs2_ea_request *er,
-			  int *remove, mode_t *mode);
-int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
-int gfs2_check_acl(struct inode *inode, int mask);
-int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
-int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+extern int gfs2_check_acl(struct inode *inode, int mask);
+extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
+extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+extern struct xattr_handler gfs2_xattr_system_handler;
 
 #endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 694b5d48f03..7b8da941526 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -269,7 +269,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
 	unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
 	unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int i;
 	int ret;
 
@@ -313,11 +312,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 
 		if (ret || (--(wbc->nr_to_write) <= 0))
 			ret = 1;
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
-			wbc->encountered_congestion = 1;
-			ret = 1;
-		}
-
 	}
 	gfs2_trans_end(sdp);
 	return ret;
@@ -338,7 +332,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 static int gfs2_write_cache_jdata(struct address_space *mapping,
 				  struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int ret = 0;
 	int done = 0;
 	struct pagevec pvec;
@@ -348,11 +341,6 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
 	int scanned = 0;
 	int range_whole = 0;
 
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		return 0;
-	}
-
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
@@ -819,8 +807,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 		mark_inode_dirty(inode);
 	}
 
-	if (inode == sdp->sd_rindex)
+	if (inode == sdp->sd_rindex) {
 		adjust_fs_space(inode);
+		ip->i_gh.gh_flags |= GL_NOCACHE;
+	}
 
 	brelse(dibh);
 	gfs2_trans_end(sdp);
@@ -889,8 +879,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 		mark_inode_dirty(inode);
 	}
 
-	if (inode == sdp->sd_rindex)
+	if (inode == sdp->sd_rindex) {
 		adjust_fs_space(inode);
+		ip->i_gh.gh_flags |= GL_NOCACHE;
+	}
 
 	brelse(dibh);
 	gfs2_trans_end(sdp);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 297d7e5ceba..25fddc100f1 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -525,38 +525,6 @@ consist_inode:
 	return ERR_PTR(-EIO);
 }
 
-
-/**
- * dirent_first - Return the first dirent
- * @dip: the directory
- * @bh: The buffer
- * @dent: Pointer to list of dirents
- *
- * return first dirent whether bh points to leaf or stuffed dinode
- *
- * Returns: IS_LEAF, IS_DINODE, or -errno
- */
-
-static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
-			struct gfs2_dirent **dent)
-{
-	struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
-
-	if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
-		if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
-			return -EIO;
-		*dent = (struct gfs2_dirent *)(bh->b_data +
-					       sizeof(struct gfs2_leaf));
-		return IS_LEAF;
-	} else {
-		if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
-			return -EIO;
-		*dent = (struct gfs2_dirent *)(bh->b_data +
-					       sizeof(struct gfs2_dinode));
-		return IS_DINODE;
-	}
-}
-
 static int dirent_check_reclen(struct gfs2_inode *dip,
 			       const struct gfs2_dirent *d, const void *end_p)
 {
@@ -1006,7 +974,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	divider = (start + half_len) << (32 - dip->i_depth);
 
 	/*  Copy the entries  */
-	dirent_first(dip, obh, &dent);
+	dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf));
 
 	do {
 		next = dent;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8b674b1f3a5..f455a03a09e 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -241,15 +241,14 @@ int gfs2_glock_put(struct gfs2_glock *gl)
 	int rv = 0;
 
 	write_lock(gl_lock_addr(gl->gl_hash));
-	if (atomic_dec_and_test(&gl->gl_ref)) {
+	if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
 		hlist_del(&gl->gl_list);
-		write_unlock(gl_lock_addr(gl->gl_hash));
-		spin_lock(&lru_lock);
 		if (!list_empty(&gl->gl_lru)) {
 			list_del_init(&gl->gl_lru);
 			atomic_dec(&lru_count);
 		}
 		spin_unlock(&lru_lock);
+		write_unlock(gl_lock_addr(gl->gl_hash));
 		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
 		glock_free(gl);
 		rv = 1;
@@ -513,7 +512,6 @@ retry:
 			GLOCK_BUG_ON(gl, 1);
 		}
 		spin_unlock(&gl->gl_spin);
-		gfs2_glock_put(gl);
 		return;
 	}
 
@@ -524,8 +522,6 @@ retry:
 		if (glops->go_xmote_bh) {
 			spin_unlock(&gl->gl_spin);
 			rv = glops->go_xmote_bh(gl, gh);
-			if (rv == -EAGAIN)
-				return;
 			spin_lock(&gl->gl_spin);
 			if (rv) {
 				do_error(gl, rv);
@@ -540,7 +536,6 @@ out:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
 out_locked:
 	spin_unlock(&gl->gl_spin);
-	gfs2_glock_put(gl);
 }
 
 static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
@@ -600,7 +595,6 @@ __acquires(&gl->gl_spin)
 
 	if (!(ret & LM_OUT_ASYNC)) {
 		finish_xmote(gl, ret);
-		gfs2_glock_hold(gl);
 		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 			gfs2_glock_put(gl);
 	} else {
@@ -672,12 +666,17 @@ out:
 	return;
 
 out_sched:
+	clear_bit(GLF_LOCK, &gl->gl_flags);
+	smp_mb__after_clear_bit();
 	gfs2_glock_hold(gl);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 		gfs2_glock_put_nolock(gl);
+	return;
+
 out_unlock:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
-	goto out;
+	smp_mb__after_clear_bit();
+	return;
 }
 
 static void delete_work_func(struct work_struct *work)
@@ -707,9 +706,12 @@ static void glock_work_func(struct work_struct *work)
 {
 	unsigned long delay = 0;
 	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+	int drop_ref = 0;
 
-	if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+	if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
 		finish_xmote(gl, gl->gl_reply);
+		drop_ref = 1;
+	}
 	down_read(&gfs2_umount_flush_sem);
 	spin_lock(&gl->gl_spin);
 	if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -727,6 +729,8 @@ static void glock_work_func(struct work_struct *work)
 	if (!delay ||
 	    queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
 		gfs2_glock_put(gl);
+	if (drop_ref)
+		gfs2_glock_put(gl);
 }
 
 /**
@@ -1361,10 +1365,6 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 		list_del_init(&gl->gl_lru);
 		atomic_dec(&lru_count);
 
-		/* Check if glock is about to be freed */
-		if (atomic_read(&gl->gl_ref) == 0)
-			continue;
-
 		/* Test for being demotable */
 		if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 			gfs2_glock_hold(gl);
@@ -1375,10 +1375,11 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 				handle_callback(gl, LM_ST_UNLOCKED, 0);
 				nr--;
 			}
+			clear_bit(GLF_LOCK, &gl->gl_flags);
+			smp_mb__after_clear_bit();
 			if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 				gfs2_glock_put_nolock(gl);
 			spin_unlock(&gl->gl_spin);
-			clear_bit(GLF_LOCK, &gl->gl_flags);
 			spin_lock(&lru_lock);
 			continue;
 		}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c609894ec0d..13f0bd22813 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -180,15 +180,6 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
 	return gl->gl_state == LM_ST_SHARED;
 }
 
-static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
-{
-	int ret;
-	spin_lock(&gl->gl_spin);
-	ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
-	spin_unlock(&gl->gl_spin);
-	return ret;
-}
-
 int gfs2_glock_get(struct gfs2_sbd *sdp,
 		   u64 number, const struct gfs2_glock_operations *glops,
 		   int create, struct gfs2_glock **glp);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 6985eef06c3..78554acc060 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/bio.h>
+#include <linux/posix_acl.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -184,8 +185,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 	if (flags & DIO_METADATA) {
 		struct address_space *mapping = gl->gl_aspace->i_mapping;
 		truncate_inode_pages(mapping, 0);
-		if (ip)
+		if (ip) {
 			set_bit(GIF_INVALID, &ip->i_flags);
+			forget_all_cached_acls(&ip->i_inode);
+		}
 	}
 
 	if (ip == GFS2_I(gl->gl_sbd->sd_rindex))
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 6edb423f90b..4792200978c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -429,7 +429,11 @@ struct gfs2_args {
 	unsigned int ar_meta:1;			/* mount metafs */
 	unsigned int ar_discard:1;		/* discard requests */
 	unsigned int ar_errors:2;               /* errors=withdraw | panic */
+	unsigned int ar_nobarrier:1;            /* do not send barriers */
 	int ar_commit;				/* Commit interval */
+	int ar_statfs_quantum;			/* The fast statfs interval */
+	int ar_quota_quantum;			/* The quota interval */
+	int ar_statfs_percent;			/* The % change to force sync */
 };
 
 struct gfs2_tune {
@@ -558,6 +562,7 @@ struct gfs2_sbd {
 	spinlock_t sd_statfs_spin;
 	struct gfs2_statfs_change_host sd_statfs_master;
 	struct gfs2_statfs_change_host sd_statfs_local;
+	int sd_statfs_force_sync;
 
 	/* Resource group stuff */
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index fb15d3b1f40..26ba2a4c4a2 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -871,7 +871,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (error)
 		goto fail_gunlock2;
 
-	error = gfs2_acl_create(dip, GFS2_I(inode));
+	error = gfs2_acl_create(dip, inode);
 	if (error)
 		goto fail_gunlock2;
 
@@ -947,9 +947,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 
 	str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
 	str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
-	str->di_header.__pad0 = 0;
 	str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
-	str->di_header.__pad1 = 0;
 	str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
 	str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
 	str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 13c6237c5f6..4511b08fc45 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -596,7 +596,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	memset(lh, 0, sizeof(struct gfs2_log_header));
 	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
 	lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+	lh->lh_header.__pad0 = cpu_to_be64(0);
 	lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+	lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
 	lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
 	lh->lh_flags = cpu_to_be32(flags);
 	lh->lh_tail = cpu_to_be32(tail);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 9969ff062c5..de97632ba32 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -132,6 +132,7 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
 static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 {
 	struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+	struct gfs2_meta_header *mh;
 	struct gfs2_trans *tr;
 
 	lock_buffer(bd->bd_bh);
@@ -148,6 +149,9 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
 	gfs2_meta_check(sdp, bd->bd_bh);
 	gfs2_pin(sdp, bd->bd_bh);
+	mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
+	mh->__pad0 = cpu_to_be64(0);
+	mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
 	sdp->sd_log_num_buf++;
 	list_add(&le->le_list, &sdp->sd_log_le_buf);
 	tr->tr_num_buf_new++;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index eacd78a5d08..5b31f7741a8 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -114,7 +114,7 @@ static int __init init_gfs2_fs(void)
 	if (error)
 		goto fail_unregister;
 
-	error = slow_work_register_user();
+	error = slow_work_register_user(THIS_MODULE);
 	if (error)
 		goto fail_slow;
 
@@ -163,7 +163,7 @@ static void __exit exit_gfs2_fs(void)
 	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
-	slow_work_unregister_user();
+	slow_work_unregister_user(THIS_MODULE);
 
 	kmem_cache_destroy(gfs2_quotad_cachep);
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 52fb6c04898..edfee24f363 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -18,6 +18,7 @@
 #include <linux/mount.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/slow-work.h>
+#include <linux/quotaops.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -62,13 +63,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 	gt->gt_quota_warn_period = 10;
 	gt->gt_quota_scale_num = 1;
 	gt->gt_quota_scale_den = 1;
-	gt->gt_quota_quantum = 60;
 	gt->gt_new_files_jdata = 0;
 	gt->gt_max_readahead = 1 << 18;
 	gt->gt_stall_secs = 600;
 	gt->gt_complain_secs = 10;
-	gt->gt_statfs_quantum = 30;
-	gt->gt_statfs_slow = 0;
 }
 
 static struct gfs2_sbd *init_sbd(struct super_block *sb)
@@ -1114,7 +1112,7 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp)
  * Returns: errno
  */
 
-static int fill_super(struct super_block *sb, void *data, int silent)
+static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent)
 {
 	struct gfs2_sbd *sdp;
 	struct gfs2_holder mount_gh;
@@ -1125,17 +1123,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
 		return -ENOMEM;
 	}
-
-	sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
-	sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
-	sdp->sd_args.ar_commit = 60;
-	sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
-
-	error = gfs2_mount_args(sdp, &sdp->sd_args, data);
-	if (error) {
-		printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
-		goto fail;
-	}
+	sdp->sd_args = *args;
 
 	if (sdp->sd_args.ar_spectator) {
                 sb->s_flags |= MS_RDONLY;
@@ -1143,11 +1131,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	}
 	if (sdp->sd_args.ar_posix_acl)
 		sb->s_flags |= MS_POSIXACL;
+	if (sdp->sd_args.ar_nobarrier)
+		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
 
 	sb->s_magic = GFS2_MAGIC;
 	sb->s_op = &gfs2_super_ops;
 	sb->s_export_op = &gfs2_export_ops;
 	sb->s_xattr = gfs2_xattr_handlers;
+	sb->s_qcop = &gfs2_quotactl_ops;
+	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
 	sb->s_time_gran = 1;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
@@ -1160,6 +1152,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
 
 	sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+	sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
+	if (sdp->sd_args.ar_statfs_quantum) {
+		sdp->sd_tune.gt_statfs_slow = 0;
+		sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum;
+	}
+	else {
+		sdp->sd_tune.gt_statfs_slow = 1;
+		sdp->sd_tune.gt_statfs_quantum = 30;
+	}
 
 	error = init_names(sdp, silent);
 	if (error)
@@ -1243,18 +1244,127 @@ fail:
 	return error;
 }
 
-static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data, struct vfsmount *mnt)
+static int set_gfs2_super(struct super_block *s, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
+	s->s_bdev = data;
+	s->s_dev = s->s_bdev->bd_dev;
+
+	/*
+	 * We set the bdi here to the queue backing, file systems can
+	 * overwrite this in ->fill_super()
+	 */
+	s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
+	return 0;
 }
 
-static int test_meta_super(struct super_block *s, void *ptr)
+static int test_gfs2_super(struct super_block *s, void *ptr)
 {
 	struct block_device *bdev = ptr;
 	return (bdev == s->s_bdev);
 }
 
+/**
+ * gfs2_get_sb - Get the GFS2 superblock
+ * @fs_type: The GFS2 filesystem type
+ * @flags: Mount flags
+ * @dev_name: The name of the device
+ * @data: The mount arguments
+ * @mnt: The vfsmnt for this mount
+ *
+ * Q. Why not use get_sb_bdev() ?
+ * A. We need to select one of two root directories to mount, independent
+ *    of whether this is the initial, or subsequent, mount of this sb
+ *
+ * Returns: 0 or -ve on error
+ */
+
+static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	struct block_device *bdev;
+	struct super_block *s;
+	fmode_t mode = FMODE_READ;
+	int error;
+	struct gfs2_args args;
+	struct gfs2_sbd *sdp;
+
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+
+	/*
+	 * once the super is inserted into the list by sget, s_umount
+	 * will protect the lockfs code from trying to start a snapshot
+	 * while we are mounting
+	 */
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (bdev->bd_fsfreeze_count > 0) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		error = -EBUSY;
+		goto error_bdev;
+	}
+	s = sget(fs_type, test_gfs2_super, set_gfs2_super, bdev);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	error = PTR_ERR(s);
+	if (IS_ERR(s))
+		goto error_bdev;
+
+	memset(&args, 0, sizeof(args));
+	args.ar_quota = GFS2_QUOTA_DEFAULT;
+	args.ar_data = GFS2_DATA_DEFAULT;
+	args.ar_commit = 60;
+	args.ar_statfs_quantum = 30;
+	args.ar_quota_quantum = 60;
+	args.ar_errors = GFS2_ERRORS_DEFAULT;
+
+	error = gfs2_mount_args(&args, data);
+	if (error) {
+		printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
+		if (s->s_root)
+			goto error_super;
+		deactivate_locked_super(s);
+		return error;
+	}
+
+	if (s->s_root) {
+		error = -EBUSY;
+		if ((flags ^ s->s_flags) & MS_RDONLY)
+			goto error_super;
+		close_bdev_exclusive(bdev, mode);
+	} else {
+		char b[BDEVNAME_SIZE];
+
+		s->s_flags = flags;
+		s->s_mode = mode;
+		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+		sb_set_blocksize(s, block_size(bdev));
+		error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
+		if (error) {
+			deactivate_locked_super(s);
+			return error;
+		}
+		s->s_flags |= MS_ACTIVE;
+		bdev->bd_super = s;
+	}
+
+	sdp = s->s_fs_info;
+	mnt->mnt_sb = s;
+	if (args.ar_meta)
+		mnt->mnt_root = dget(sdp->sd_master_dir);
+	else
+		mnt->mnt_root = dget(sdp->sd_root_dir);
+	return 0;
+
+error_super:
+	deactivate_locked_super(s);
+error_bdev:
+	close_bdev_exclusive(bdev, mode);
+	return error;
+}
+
 static int set_meta_super(struct super_block *s, void *ptr)
 {
 	return -EINVAL;
@@ -1274,13 +1384,17 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 		       dev_name, error);
 		return error;
 	}
-	s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
+	s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
 		 path.dentry->d_inode->i_sb->s_bdev);
 	path_put(&path);
 	if (IS_ERR(s)) {
 		printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
 		return PTR_ERR(s);
 	}
+	if ((flags ^ s->s_flags) & MS_RDONLY) {
+		deactivate_locked_super(s);
+		return -EBUSY;
+	}
 	sdp = s->s_fs_info;
 	mnt->mnt_sb = s;
 	mnt->mnt_root = dget(sdp->sd_master_dir);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 2e9b9326bfc..e3bf6eab875 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -15,7 +15,7 @@
  * fuzziness in the current usage value of IDs that are being used on different
  * nodes in the cluster simultaneously.  So, it is possible for a user on
  * multiple nodes to overrun their quota, but that overrun is controlable.
- * Since quota tags are part of transactions, there is no need to a quota check
+ * Since quota tags are part of transactions, there is no need for a quota check
  * program to be run on node crashes or anything like that.
  *
  * There are couple of knobs that let the administrator manage the quota
@@ -47,6 +47,8 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/quota.h>
+#include <linux/dqblk_xfs.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -65,13 +67,6 @@
 #define QUOTA_USER 1
 #define QUOTA_GROUP 0
 
-struct gfs2_quota_host {
-	u64 qu_limit;
-	u64 qu_warn;
-	s64 qu_value;
-	u32 qu_ll_next;
-};
-
 struct gfs2_quota_change_host {
 	u64 qc_change;
 	u32 qc_flags; /* GFS2_QCF_... */
@@ -164,7 +159,7 @@ fail:
 	return error;
 }
 
-static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
 		  struct gfs2_quota_data **qdp)
 {
 	struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
@@ -202,7 +197,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
 
 		spin_unlock(&qd_lru_lock);
 
-		if (qd || !create) {
+		if (qd) {
 			if (new_qd) {
 				gfs2_glock_put(new_qd->qd_gl);
 				kmem_cache_free(gfs2_quotad_cachep, new_qd);
@@ -461,12 +456,12 @@ static void qd_unlock(struct gfs2_quota_data *qd)
 	qd_put(qd);
 }
 
-static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id,
 		    struct gfs2_quota_data **qdp)
 {
 	int error;
 
-	error = qd_get(sdp, user, id, create, qdp);
+	error = qd_get(sdp, user, id, qdp);
 	if (error)
 		return error;
 
@@ -508,20 +503,20 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
 	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
 		return 0;
 
-	error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, CREATE, qd);
+	error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd);
 	if (error)
 		goto out;
 	al->al_qd_num++;
 	qd++;
 
-	error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, CREATE, qd);
+	error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd);
 	if (error)
 		goto out;
 	al->al_qd_num++;
 	qd++;
 
 	if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) {
-		error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
+		error = qdsb_get(sdp, QUOTA_USER, uid, qd);
 		if (error)
 			goto out;
 		al->al_qd_num++;
@@ -529,7 +524,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
 	}
 
 	if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) {
-		error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
+		error = qdsb_get(sdp, QUOTA_GROUP, gid, qd);
 		if (error)
 			goto out;
 		al->al_qd_num++;
@@ -617,48 +612,36 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
 	mutex_unlock(&sdp->sd_quota_mutex);
 }
 
-static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
-{
-	const struct gfs2_quota *str = buf;
-
-	qu->qu_limit = be64_to_cpu(str->qu_limit);
-	qu->qu_warn = be64_to_cpu(str->qu_warn);
-	qu->qu_value = be64_to_cpu(str->qu_value);
-	qu->qu_ll_next = be32_to_cpu(str->qu_ll_next);
-}
-
-static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
-{
-	struct gfs2_quota *str = buf;
-
-	str->qu_limit = cpu_to_be64(qu->qu_limit);
-	str->qu_warn = cpu_to_be64(qu->qu_warn);
-	str->qu_value = cpu_to_be64(qu->qu_value);
-	str->qu_ll_next = cpu_to_be32(qu->qu_ll_next);
-	memset(&str->qu_reserved, 0, sizeof(str->qu_reserved));
-}
-
 /**
- * gfs2_adjust_quota
+ * gfs2_adjust_quota - adjust record of current block usage
+ * @ip: The quota inode
+ * @loc: Offset of the entry in the quota file
+ * @change: The amount of usage change to record
+ * @qd: The quota data
+ * @fdq: The updated limits to record
  *
  * This function was mostly borrowed from gfs2_block_truncate_page which was
  * in turn mostly borrowed from ext3
+ *
+ * Returns: 0 or -ve on error
  */
+
 static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
-			     s64 change, struct gfs2_quota_data *qd)
+			     s64 change, struct gfs2_quota_data *qd,
+			     struct fs_disk_quota *fdq)
 {
 	struct inode *inode = &ip->i_inode;
 	struct address_space *mapping = inode->i_mapping;
 	unsigned long index = loc >> PAGE_CACHE_SHIFT;
 	unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
 	unsigned blocksize, iblock, pos;
-	struct buffer_head *bh;
+	struct buffer_head *bh, *dibh;
 	struct page *page;
 	void *kaddr;
-	char *ptr;
-	struct gfs2_quota_host qp;
+	struct gfs2_quota *qp;
 	s64 value;
 	int err = -EIO;
+	u64 size;
 
 	if (gfs2_is_stuffed(ip))
 		gfs2_unstuff_dinode(ip, NULL);
@@ -700,18 +683,38 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	gfs2_trans_add_bh(ip->i_gl, bh, 0);
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	ptr = kaddr + offset;
-	gfs2_quota_in(&qp, ptr);
-	qp.qu_value += change;
-	value = qp.qu_value;
-	gfs2_quota_out(&qp, ptr);
+	qp = kaddr + offset;
+	value = (s64)be64_to_cpu(qp->qu_value) + change;
+	qp->qu_value = cpu_to_be64(value);
+	qd->qd_qb.qb_value = qp->qu_value;
+	if (fdq) {
+		if (fdq->d_fieldmask & FS_DQ_BSOFT) {
+			qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+			qd->qd_qb.qb_warn = qp->qu_warn;
+		}
+		if (fdq->d_fieldmask & FS_DQ_BHARD) {
+			qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+			qd->qd_qb.qb_limit = qp->qu_limit;
+		}
+	}
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_USER0);
-	err = 0;
-	qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
-	qd->qd_qb.qb_value = cpu_to_be64(value);
-	((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_magic = cpu_to_be32(GFS2_MAGIC);
-	((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_value = cpu_to_be64(value);
+
+	err = gfs2_meta_inode_buffer(ip, &dibh);
+	if (err)
+		goto unlock;
+
+	size = loc + sizeof(struct gfs2_quota);
+	if (size > inode->i_size) {
+		ip->i_disksize = size;
+		i_size_write(inode, size);
+	}
+	inode->i_mtime = inode->i_atime = CURRENT_TIME;
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+	mark_inode_dirty(inode);
+
 unlock:
 	unlock_page(page);
 	page_cache_release(page);
@@ -739,9 +742,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 		return -ENOMEM;
 
 	sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
+	mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA);
 	for (qx = 0; qx < num_qd; qx++) {
-		error = gfs2_glock_nq_init(qda[qx]->qd_gl,
-					   LM_ST_EXCLUSIVE,
+		error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
 					   GL_NOCACHE, &ghs[qx]);
 		if (error)
 			goto out;
@@ -795,9 +798,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 	for (x = 0; x < num_qd; x++) {
 		qd = qda[x];
 		offset = qd2offset(qd);
-		error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
-					  (struct gfs2_quota_data *)
-					  qd);
+		error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL);
 		if (error)
 			goto out_end_trans;
 
@@ -817,21 +818,44 @@ out_gunlock:
 out:
 	while (qx--)
 		gfs2_glock_dq_uninit(&ghs[qx]);
+	mutex_unlock(&ip->i_inode.i_mutex);
 	kfree(ghs);
 	gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
 	return error;
 }
 
+static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
+{
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+	struct gfs2_quota q;
+	struct gfs2_quota_lvb *qlvb;
+	loff_t pos;
+	int error;
+
+	memset(&q, 0, sizeof(struct gfs2_quota));
+	pos = qd2offset(qd);
+	error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q));
+	if (error < 0)
+		return error;
+
+	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+	qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
+	qlvb->__pad = 0;
+	qlvb->qb_limit = q.qu_limit;
+	qlvb->qb_warn = q.qu_warn;
+	qlvb->qb_value = q.qu_value;
+	qd->qd_qb = *qlvb;
+
+	return 0;
+}
+
 static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
 		    struct gfs2_holder *q_gh)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
 	struct gfs2_holder i_gh;
-	struct gfs2_quota_host q;
-	char buf[sizeof(struct gfs2_quota)];
 	int error;
-	struct gfs2_quota_lvb *qlvb;
 
 restart:
 	error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
@@ -841,11 +865,9 @@ restart:
 	qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
 
 	if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
-		loff_t pos;
 		gfs2_glock_dq_uninit(q_gh);
-		error = gfs2_glock_nq_init(qd->qd_gl,
-					   LM_ST_EXCLUSIVE, GL_NOCACHE,
-					   q_gh);
+		error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE,
+					   GL_NOCACHE, q_gh);
 		if (error)
 			return error;
 
@@ -853,29 +875,14 @@ restart:
 		if (error)
 			goto fail;
 
-		memset(buf, 0, sizeof(struct gfs2_quota));
-		pos = qd2offset(qd);
-		error = gfs2_internal_read(ip, NULL, buf, &pos,
-					   sizeof(struct gfs2_quota));
-		if (error < 0)
+		error = update_qd(sdp, qd);
+		if (error)
 			goto fail_gunlock;
 
 		gfs2_glock_dq_uninit(&i_gh);
-
-		gfs2_quota_in(&q, buf);
-		qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
-		qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
-		qlvb->__pad = 0;
-		qlvb->qb_limit = cpu_to_be64(q.qu_limit);
-		qlvb->qb_warn = cpu_to_be64(q.qu_warn);
-		qlvb->qb_value = cpu_to_be64(q.qu_value);
-		qd->qd_qb = *qlvb;
-
-		if (gfs2_glock_is_blocking(qd->qd_gl)) {
-			gfs2_glock_dq_uninit(q_gh);
-			force_refresh = 0;
-			goto restart;
-		}
+		gfs2_glock_dq_uninit(q_gh);
+		force_refresh = 0;
+		goto restart;
 	}
 
 	return 0;
@@ -995,7 +1002,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
 
-	printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n",
+	printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
 	       sdp->sd_fsname, type,
 	       (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
 	       qd->qd_id);
@@ -1032,6 +1039,10 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
 
 		if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
 			print_message(qd, "exceeded");
+			quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
+					   USRQUOTA : GRPQUOTA, qd->qd_id,
+					   sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
+
 			error = -EDQUOT;
 			break;
 		} else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
@@ -1039,6 +1050,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
 			   time_after_eq(jiffies, qd->qd_last_warn +
 					 gfs2_tune_get(sdp,
 						gt_quota_warn_period) * HZ)) {
+			quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
+					   USRQUOTA : GRPQUOTA, qd->qd_id,
+					   sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
 			error = print_message(qd, "warning");
 			qd->qd_last_warn = jiffies;
 		}
@@ -1069,8 +1083,9 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
 	}
 }
 
-int gfs2_quota_sync(struct gfs2_sbd *sdp)
+int gfs2_quota_sync(struct super_block *sb, int type)
 {
+	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_quota_data **qda;
 	unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
 	unsigned int num_qd;
@@ -1118,7 +1133,7 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
 	struct gfs2_holder q_gh;
 	int error;
 
-	error = qd_get(sdp, user, id, CREATE, &qd);
+	error = qd_get(sdp, user, id, &qd);
 	if (error)
 		return error;
 
@@ -1127,7 +1142,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
 		gfs2_glock_dq_uninit(&q_gh);
 
 	qd_put(qd);
-
 	return error;
 }
 
@@ -1298,12 +1312,12 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
 }
 
 static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
-			       int (*fxn)(struct gfs2_sbd *sdp),
+			       int (*fxn)(struct super_block *sb, int type),
 			       unsigned long t, unsigned long *timeo,
 			       unsigned int *new_timeo)
 {
 	if (t >= *timeo) {
-		int error = fxn(sdp);
+		int error = fxn(sdp->sd_vfs, 0);
 		quotad_error(sdp, msg, error);
 		*timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
 	} else {
@@ -1330,6 +1344,14 @@ static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
 	}
 }
 
+void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) {
+	if (!sdp->sd_statfs_force_sync) {
+		sdp->sd_statfs_force_sync = 1;
+		wake_up(&sdp->sd_quota_wait);
+	}
+}
+
+
 /**
  * gfs2_quotad - Write cached quota changes into the quota file
  * @sdp: Pointer to GFS2 superblock
@@ -1349,8 +1371,15 @@ int gfs2_quotad(void *data)
 	while (!kthread_should_stop()) {
 
 		/* Update the master statfs file */
-		quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
-				   &statfs_timeo, &tune->gt_statfs_quantum);
+		if (sdp->sd_statfs_force_sync) {
+			int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
+			quotad_error(sdp, "statfs", error);
+			statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+		}
+		else
+			quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+				   	   &statfs_timeo,
+					   &tune->gt_statfs_quantum);
 
 		/* Update quota file */
 		quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
@@ -1367,7 +1396,7 @@ int gfs2_quotad(void *data)
 		spin_lock(&sdp->sd_trunc_lock);
 		empty = list_empty(&sdp->sd_trunc_list);
 		spin_unlock(&sdp->sd_trunc_lock);
-		if (empty)
+		if (empty && !sdp->sd_statfs_force_sync)
 			t -= schedule_timeout(t);
 		else
 			t = 0;
@@ -1377,3 +1406,181 @@ int gfs2_quotad(void *data)
 	return 0;
 }
 
+static int gfs2_quota_get_xstate(struct super_block *sb,
+				 struct fs_quota_stat *fqs)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+
+	memset(fqs, 0, sizeof(struct fs_quota_stat));
+	fqs->qs_version = FS_QSTAT_VERSION;
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON)
+		fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
+	else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT)
+		fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+	if (sdp->sd_quota_inode) {
+		fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
+		fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
+	}
+	fqs->qs_uquota.qfs_nextents = 1; /* unsupported */
+	fqs->qs_gquota = fqs->qs_uquota; /* its the same inode in both cases */
+	fqs->qs_incoredqs = atomic_read(&qd_lru_count);
+	return 0;
+}
+
+static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id,
+			   struct fs_disk_quota *fdq)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_quota_lvb *qlvb;
+	struct gfs2_quota_data *qd;
+	struct gfs2_holder q_gh;
+	int error;
+
+	memset(fdq, 0, sizeof(struct fs_disk_quota));
+
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+		return -ESRCH; /* Crazy XFS error code */
+
+	if (type == USRQUOTA)
+		type = QUOTA_USER;
+	else if (type == GRPQUOTA)
+		type = QUOTA_GROUP;
+	else
+		return -EINVAL;
+
+	error = qd_get(sdp, type, id, &qd);
+	if (error)
+		return error;
+	error = do_glock(qd, FORCE, &q_gh);
+	if (error)
+		goto out;
+
+	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+	fdq->d_version = FS_DQUOT_VERSION;
+	fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+	fdq->d_id = id;
+	fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
+	fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
+	fdq->d_bcount = be64_to_cpu(qlvb->qb_value);
+
+	gfs2_glock_dq_uninit(&q_gh);
+out:
+	qd_put(qd);
+	return error;
+}
+
+/* GFS2 only supports a subset of the XFS fields */
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+
+static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
+			   struct fs_disk_quota *fdq)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+	struct gfs2_quota_data *qd;
+	struct gfs2_holder q_gh, i_gh;
+	unsigned int data_blocks, ind_blocks;
+	unsigned int blocks = 0;
+	int alloc_required;
+	struct gfs2_alloc *al;
+	loff_t offset;
+	int error;
+
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+		return -ESRCH; /* Crazy XFS error code */
+
+	switch(type) {
+	case USRQUOTA:
+		type = QUOTA_USER;
+		if (fdq->d_flags != XFS_USER_QUOTA)
+			return -EINVAL;
+		break;
+	case GRPQUOTA:
+		type = QUOTA_GROUP;
+		if (fdq->d_flags != XFS_GROUP_QUOTA)
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (fdq->d_fieldmask & ~GFS2_FIELDMASK)
+		return -EINVAL;
+	if (fdq->d_id != id)
+		return -EINVAL;
+
+	error = qd_get(sdp, type, id, &qd);
+	if (error)
+		return error;
+
+	mutex_lock(&ip->i_inode.i_mutex);
+	error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh);
+	if (error)
+		goto out_put;
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (error)
+		goto out_q;
+
+	/* Check for existing entry, if none then alloc new blocks */
+	error = update_qd(sdp, qd);
+	if (error)
+		goto out_i;
+
+	/* If nothing has changed, this is a no-op */
+	if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
+	    (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn)))
+		fdq->d_fieldmask ^= FS_DQ_BSOFT;
+	if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
+	    (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit)))
+		fdq->d_fieldmask ^= FS_DQ_BHARD;
+	if (fdq->d_fieldmask == 0)
+		goto out_i;
+
+	offset = qd2offset(qd);
+	error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota),
+					  &alloc_required);
+	if (error)
+		goto out_i;
+	if (alloc_required) {
+		al = gfs2_alloc_get(ip);
+		if (al == NULL)
+			goto out_i;
+		gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
+				       &data_blocks, &ind_blocks);
+		blocks = al->al_requested = 1 + data_blocks + ind_blocks;
+		error = gfs2_inplace_reserve(ip);
+		if (error)
+			goto out_alloc;
+	}
+
+	error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
+	if (error)
+		goto out_release;
+
+	/* Apply changes */
+	error = gfs2_adjust_quota(ip, offset, 0, qd, fdq);
+
+	gfs2_trans_end(sdp);
+out_release:
+	if (alloc_required) {
+		gfs2_inplace_release(ip);
+out_alloc:
+		gfs2_alloc_put(ip);
+	}
+out_i:
+	gfs2_glock_dq_uninit(&i_gh);
+out_q:
+	gfs2_glock_dq_uninit(&q_gh);
+out_put:
+	mutex_unlock(&ip->i_inode.i_mutex);
+	qd_put(qd);
+	return error;
+}
+
+const struct quotactl_ops gfs2_quotactl_ops = {
+	.quota_sync     = gfs2_quota_sync,
+	.get_xstate     = gfs2_quota_get_xstate,
+	.get_xquota	= gfs2_xquota_get,
+	.set_xquota	= gfs2_xquota_set,
+};
+
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 0fa5fa63d0e..e271fa07ad0 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -25,13 +25,15 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
 extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
 			      u32 uid, u32 gid);
 
-extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
+extern int gfs2_quota_sync(struct super_block *sb, int type);
 extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
 
 extern int gfs2_quota_init(struct gfs2_sbd *sdp);
 extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
 extern int gfs2_quotad(void *data);
 
+extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
+
 static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
@@ -50,5 +52,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 }
 
 extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern const struct quotactl_ops gfs2_quotactl_ops;
 
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 59d2695509d..4b9bece3d43 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -7,6 +7,7 @@
  * of the GNU General Public License version 2.
  */
 
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
@@ -409,7 +410,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
 	memset(lh, 0, sizeof(struct gfs2_log_header));
 	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
 	lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+	lh->lh_header.__pad0 = cpu_to_be64(0);
 	lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+	lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
 	lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
 	lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
 	lh->lh_blkno = cpu_to_be32(lblock);
@@ -593,6 +596,7 @@ fail:
 }
 
 struct slow_work_ops gfs2_recover_ops = {
+	.owner	 = THIS_MODULE,
 	.get_ref = gfs2_recover_get_ref,
 	.put_ref = gfs2_recover_put_ref,
 	.execute = gfs2_recover_work,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8f1cfb02a6c..0608f490c29 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1710,11 +1710,16 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
 {
 	struct gfs2_rgrpd *rgd;
 	struct gfs2_holder ri_gh, rgd_gh;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
+	int ri_locked = 0;
 	int error;
 
-	error = gfs2_rindex_hold(sdp, &ri_gh);
-	if (error)
-		goto fail;
+	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
+		error = gfs2_rindex_hold(sdp, &ri_gh);
+		if (error)
+			goto fail;
+		ri_locked = 1;
+	}
 
 	error = -EINVAL;
 	rgd = gfs2_blk2rgrpd(sdp, no_addr);
@@ -1730,7 +1735,8 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
 
 	gfs2_glock_dq_uninit(&rgd_gh);
 fail_rindex:
-	gfs2_glock_dq_uninit(&ri_gh);
+	if (ri_locked)
+		gfs2_glock_dq_uninit(&ri_gh);
 fail:
 	return error;
 }
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0ec3ec672de..c282ad41f3d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -70,6 +70,11 @@ enum {
 	Opt_commit,
 	Opt_err_withdraw,
 	Opt_err_panic,
+	Opt_statfs_quantum,
+	Opt_statfs_percent,
+	Opt_quota_quantum,
+	Opt_barrier,
+	Opt_nobarrier,
 	Opt_error,
 };
 
@@ -101,18 +106,23 @@ static const match_table_t tokens = {
 	{Opt_commit, "commit=%d"},
 	{Opt_err_withdraw, "errors=withdraw"},
 	{Opt_err_panic, "errors=panic"},
+	{Opt_statfs_quantum, "statfs_quantum=%d"},
+	{Opt_statfs_percent, "statfs_percent=%d"},
+	{Opt_quota_quantum, "quota_quantum=%d"},
+	{Opt_barrier, "barrier"},
+	{Opt_nobarrier, "nobarrier"},
 	{Opt_error, NULL}
 };
 
 /**
  * gfs2_mount_args - Parse mount options
- * @sdp:
- * @data:
+ * @args: The structure into which the parsed options will be written
+ * @options: The options to parse
  *
  * Return: errno
  */
 
-int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
+int gfs2_mount_args(struct gfs2_args *args, char *options)
 {
 	char *o;
 	int token;
@@ -157,7 +167,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 			break;
 		case Opt_debug:
 			if (args->ar_errors == GFS2_ERRORS_PANIC) {
-				fs_info(sdp, "-o debug and -o errors=panic "
+				printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
 				       "are mutually exclusive.\n");
 				return -EINVAL;
 			}
@@ -210,7 +220,29 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 		case Opt_commit:
 			rv = match_int(&tmp[0], &args->ar_commit);
 			if (rv || args->ar_commit <= 0) {
-				fs_info(sdp, "commit mount option requires a positive numeric argument\n");
+				printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n");
+				return rv ? rv : -EINVAL;
+			}
+			break;
+		case Opt_statfs_quantum:
+			rv = match_int(&tmp[0], &args->ar_statfs_quantum);
+			if (rv || args->ar_statfs_quantum < 0) {
+				printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n");
+				return rv ? rv : -EINVAL;
+			}
+			break;
+		case Opt_quota_quantum:
+			rv = match_int(&tmp[0], &args->ar_quota_quantum);
+			if (rv || args->ar_quota_quantum <= 0) {
+				printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n");
+				return rv ? rv : -EINVAL;
+			}
+			break;
+		case Opt_statfs_percent:
+			rv = match_int(&tmp[0], &args->ar_statfs_percent);
+			if (rv || args->ar_statfs_percent < 0 ||
+			    args->ar_statfs_percent > 100) {
+				printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n");
 				return rv ? rv : -EINVAL;
 			}
 			break;
@@ -219,15 +251,21 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 			break;
 		case Opt_err_panic:
 			if (args->ar_debug) {
-				fs_info(sdp, "-o debug and -o errors=panic "
+				printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
 					"are mutually exclusive.\n");
 				return -EINVAL;
 			}
 			args->ar_errors = GFS2_ERRORS_PANIC;
 			break;
+		case Opt_barrier:
+			args->ar_nobarrier = 0;
+			break;
+		case Opt_nobarrier:
+			args->ar_nobarrier = 1;
+			break;
 		case Opt_error:
 		default:
-			fs_info(sdp, "invalid mount option: %s\n", o);
+			printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o);
 			return -EINVAL;
 		}
 	}
@@ -442,7 +480,10 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 {
 	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
 	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
 	struct buffer_head *l_bh;
+	s64 x, y;
+	int need_sync = 0;
 	int error;
 
 	error = gfs2_meta_inode_buffer(l_ip, &l_bh);
@@ -456,9 +497,17 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 	l_sc->sc_free += free;
 	l_sc->sc_dinodes += dinodes;
 	gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
+	if (sdp->sd_args.ar_statfs_percent) {
+		x = 100 * l_sc->sc_free;
+		y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent;
+		if (x >= y || x <= -y)
+			need_sync = 1;
+	}
 	spin_unlock(&sdp->sd_statfs_spin);
 
 	brelse(l_bh);
+	if (need_sync)
+		gfs2_wake_up_statfs(sdp);
 }
 
 void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
@@ -484,8 +533,9 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
 	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
 }
 
-int gfs2_statfs_sync(struct gfs2_sbd *sdp)
+int gfs2_statfs_sync(struct super_block *sb, int type)
 {
+	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
 	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
@@ -521,6 +571,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
 		goto out_bh2;
 
 	update_statfs(sdp, m_bh, l_bh);
+	sdp->sd_statfs_force_sync = 0;
 
 	gfs2_trans_end(sdp);
 
@@ -712,8 +763,8 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	int error;
 
 	flush_workqueue(gfs2_delete_workqueue);
-	gfs2_quota_sync(sdp);
-	gfs2_statfs_sync(sdp);
+	gfs2_quota_sync(sdp->sd_vfs, 0);
+	gfs2_statfs_sync(sdp->sd_vfs, 0);
 
 	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
 				   &t_gh);
@@ -1061,8 +1112,13 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 
 	spin_lock(&gt->gt_spin);
 	args.ar_commit = gt->gt_log_flush_secs;
+	args.ar_quota_quantum = gt->gt_quota_quantum;
+	if (gt->gt_statfs_slow)
+		args.ar_statfs_quantum = 0;
+	else
+		args.ar_statfs_quantum = gt->gt_statfs_quantum;
 	spin_unlock(&gt->gt_spin);
-	error = gfs2_mount_args(sdp, &args, data);
+	error = gfs2_mount_args(&args, data);
 	if (error)
 		return error;
 
@@ -1097,8 +1153,21 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 		sb->s_flags |= MS_POSIXACL;
 	else
 		sb->s_flags &= ~MS_POSIXACL;
+	if (sdp->sd_args.ar_nobarrier)
+		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+	else
+		clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
 	spin_lock(&gt->gt_spin);
 	gt->gt_log_flush_secs = args.ar_commit;
+	gt->gt_quota_quantum = args.ar_quota_quantum;
+	if (args.ar_statfs_quantum) {
+		gt->gt_statfs_slow = 0;
+		gt->gt_statfs_quantum = args.ar_statfs_quantum;
+	}
+	else {
+		gt->gt_statfs_slow = 1;
+		gt->gt_statfs_quantum = 30;
+	}
 	spin_unlock(&gt->gt_spin);
 
 	gfs2_online_uevent(sdp);
@@ -1179,7 +1248,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 {
 	struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
 	struct gfs2_args *args = &sdp->sd_args;
-	int lfsecs;
+	int val;
 
 	if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
 		seq_printf(s, ",meta");
@@ -1240,9 +1309,17 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	}
 	if (args->ar_discard)
 		seq_printf(s, ",discard");
-	lfsecs = sdp->sd_tune.gt_log_flush_secs;
-	if (lfsecs != 60)
-		seq_printf(s, ",commit=%d", lfsecs);
+	val = sdp->sd_tune.gt_log_flush_secs;
+	if (val != 60)
+		seq_printf(s, ",commit=%d", val);
+	val = sdp->sd_tune.gt_statfs_quantum;
+	if (val != 30)
+		seq_printf(s, ",statfs_quantum=%d", val);
+	val = sdp->sd_tune.gt_quota_quantum;
+	if (val != 60)
+		seq_printf(s, ",quota_quantum=%d", val);
+	if (args->ar_statfs_percent)
+		seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent);
 	if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
 		const char *state;
 
@@ -1259,6 +1336,9 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		}
 		seq_printf(s, ",errors=%s", state);
 	}
+	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
+		seq_printf(s, ",nobarrier");
+
 	return 0;
 }
 
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 235db368288..3df60f2d84e 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -27,7 +27,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 
 extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
 
-extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
+extern int gfs2_mount_args(struct gfs2_args *args, char *data);
 
 extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
 extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
@@ -44,7 +44,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
 				  const void *buf);
 extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
 			  struct buffer_head *l_bh);
-extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
+extern int gfs2_statfs_sync(struct super_block *sb, int type);
 
 extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
 extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 446329728d5..0dc34621f6a 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -85,11 +85,7 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
 	buf[0] = '\0';
 	if (!gfs2_uuid_valid(uuid))
 		return 0;
-	return snprintf(buf, PAGE_SIZE, "%02X%02X%02X%02X-%02X%02X-"
-			"%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
-			uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5],
-			uuid[6], uuid[7], uuid[8], uuid[9], uuid[10], uuid[11],
-			uuid[12], uuid[13], uuid[14], uuid[15]);
+	return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid);
 }
 
 static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
@@ -158,7 +154,7 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
 	if (simple_strtol(buf, NULL, 0) != 1)
 		return -EINVAL;
 
-	gfs2_statfs_sync(sdp);
+	gfs2_statfs_sync(sdp->sd_vfs, 0);
 	return len;
 }
 
@@ -171,13 +167,14 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
 	if (simple_strtol(buf, NULL, 0) != 1)
 		return -EINVAL;
 
-	gfs2_quota_sync(sdp);
+	gfs2_quota_sync(sdp->sd_vfs, 0);
 	return len;
 }
 
 static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
 					size_t len)
 {
+	int error;
 	u32 id;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -185,13 +182,14 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
 
 	id = simple_strtoul(buf, NULL, 0);
 
-	gfs2_quota_refresh(sdp, 1, id);
-	return len;
+	error = gfs2_quota_refresh(sdp, 1, id);
+	return error ? error : len;
 }
 
 static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
 					 size_t len)
 {
+	int error;
 	u32 id;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -199,8 +197,8 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
 
 	id = simple_strtoul(buf, NULL, 0);
 
-	gfs2_quota_refresh(sdp, 0, id);
-	return len;
+	error = gfs2_quota_refresh(sdp, 0, id);
+	return error ? error : len;
 }
 
 static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
@@ -573,14 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
 	if (!sdp->sd_args.ar_spectator)
 		add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
-	if (gfs2_uuid_valid(uuid)) {
-		add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
-			       "%02X%02X-%02X%02X%02X%02X%02X%02X",
-			       uuid[0], uuid[1], uuid[2], uuid[3], uuid[4],
-			       uuid[5], uuid[6], uuid[7], uuid[8], uuid[9],
-			       uuid[10], uuid[11], uuid[12], uuid[13],
-			       uuid[14], uuid[15]);
-	}
+	if (gfs2_uuid_valid(uuid))
+		add_uevent_var(env, "UUID=%pUB", uuid);
 	return 0;
 }
 
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 8a0f8ef6ee2..912f5cbc474 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -186,8 +186,8 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
 	return 0;
 }
 
-int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
-		 struct gfs2_ea_location *el)
+static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+			struct gfs2_ea_location *el)
 {
 	struct ea_find ef;
 	int error;
@@ -516,8 +516,8 @@ out:
 	return error;
 }
 
-int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-		     char *data, size_t size)
+static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+			    char *data, size_t size)
 {
 	int ret;
 	size_t len = GFS2_EA_DATA_LEN(el->el_ea);
@@ -534,6 +534,36 @@ int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
 	return len;
 }
 
+int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata)
+{
+	struct gfs2_ea_location el;
+	int error;
+	int len;
+	char *data;
+
+	error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		goto out;
+	if (!GFS2_EA_DATA_LEN(el.el_ea))
+		goto out;
+
+	len = GFS2_EA_DATA_LEN(el.el_ea);
+	data = kmalloc(len, GFP_NOFS);
+	error = -ENOMEM;
+	if (data == NULL)
+		goto out;
+
+	error = gfs2_ea_get_copy(ip, &el, data, len);
+	if (error == 0)
+		error = len;
+	*ppdata = data;
+out:
+	brelse(el.el_bh);
+	return error;
+}
+
 /**
  * gfs2_xattr_get - Get a GFS2 extended attribute
  * @inode: The inode
@@ -1259,22 +1289,26 @@ fail:
 	return error;
 }
 
-int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-		      struct iattr *attr, char *data)
+int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
+	struct gfs2_ea_location el;
 	struct buffer_head *dibh;
 	int error;
 
-	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+	error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
+	if (error)
+		return error;
+
+	if (GFS2_EA_IS_STUFFED(el.el_ea)) {
 		error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
 		if (error)
 			return error;
 
-		gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
-		memcpy(GFS2_EA2DATA(el->el_ea), data,
-		       GFS2_EA_DATA_LEN(el->el_ea));
+		gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1);
+		memcpy(GFS2_EA2DATA(el.el_ea), data,
+		       GFS2_EA_DATA_LEN(el.el_ea));
 	} else
-		error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
+		error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
 
 	if (error)
 		return error;
@@ -1507,18 +1541,6 @@ static int gfs2_xattr_user_set(struct inode *inode, const char *name,
 	return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
 }
 
-static int gfs2_xattr_system_get(struct inode *inode, const char *name,
-				 void *buffer, size_t size)
-{
-	return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
-}
-
-static int gfs2_xattr_system_set(struct inode *inode, const char *name,
-				 const void *value, size_t size, int flags)
-{
-	return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
-}
-
 static int gfs2_xattr_security_get(struct inode *inode, const char *name,
 				   void *buffer, size_t size)
 {
@@ -1543,12 +1565,6 @@ static struct xattr_handler gfs2_xattr_security_handler = {
 	.set    = gfs2_xattr_security_set,
 };
 
-static struct xattr_handler gfs2_xattr_system_handler = {
-	.prefix = XATTR_SYSTEM_PREFIX,
-	.get    = gfs2_xattr_system_get,
-	.set    = gfs2_xattr_system_set,
-};
-
 struct xattr_handler *gfs2_xattr_handlers[] = {
 	&gfs2_xattr_user_handler,
 	&gfs2_xattr_security_handler,
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index cbdfd774373..8d6ae5813c4 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -62,11 +62,7 @@ extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
 
 /* Exported to acl.c */
 
-extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
-			struct gfs2_ea_location *el);
-extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-			    char *data, size_t size);
-extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-			     struct iattr *attr, char *data);
+extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
+extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
 
 #endif /* __EATTR_DOT_H__ */
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 6d98f116ca0..424b0337f52 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -289,6 +289,10 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
 	err = hfs_brec_find(&src_fd);
 	if (err)
 		goto out;
+	if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) {
+		err = -EIO;
+		goto out;
+	}
 
 	hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset,
 			    src_fd.entrylength);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 7c69b98a2e4..2b3b8611b41 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -79,6 +79,11 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		filp->f_pos++;
 		/* fall through */
 	case 1:
+		if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
+			err = -EIO;
+			goto out;
+		}
+
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
 		if (entry.type != HFS_CDR_THD) {
 			printk(KERN_ERR "hfs: bad catalog folder thread\n");
@@ -109,6 +114,12 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			err = -EIO;
 			goto out;
 		}
+
+		if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
+			err = -EIO;
+			goto out;
+		}
+
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
 		type = entry.type;
 		len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index f7fcbe49da7..5ed7252b7b2 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -409,8 +409,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	/* try to get the root inode */
 	hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
 	res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd);
-	if (!res)
+	if (!res) {
+		if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
+			res =  -EIO;
+			goto bail;
+		}
 		hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength);
+	}
 	if (res) {
 		hfs_find_exit(&fd);
 		goto bail_no_root;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f2feaa06bf2..cadc4ce4865 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -14,6 +14,7 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/bitmap.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
@@ -115,15 +116,13 @@ static void hpfs_put_super(struct super_block *s)
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
 {
 	struct quad_buffer_head qbh;
-	unsigned *bits;
-	unsigned i, count;
-	if (!(bits = hpfs_map_4sectors(s, secno, &qbh, 4))) return 0;
-	count = 0;
-	for (i = 0; i < 2048 / sizeof(unsigned); i++) {
-		unsigned b; 
-		if (!bits[i]) continue;
-		for (b = bits[i]; b; b>>=1) count += b & 1;
-	}
+	unsigned long *bits;
+	unsigned count;
+
+	bits = hpfs_map_4sectors(s, secno, &qbh, 4);
+	if (!bits)
+		return 0;
+	count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
 	hpfs_brelse4(&qbh);
 	return count;
 }
diff --git a/fs/inode.c b/fs/inode.c
index 4d8e3be5597..06c1f02de61 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,7 +18,6 @@
 #include <linux/hash.h>
 #include <linux/swap.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
@@ -157,11 +156,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 
 	if (security_inode_alloc(inode))
 		goto out;
-
-	/* allocate and initialize an i_integrity */
-	if (ima_inode_alloc(inode))
-		goto out_free_security;
-
 	spin_lock_init(&inode->i_lock);
 	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
 
@@ -201,9 +195,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 #endif
 
 	return 0;
-
-out_free_security:
-	security_inode_free(inode);
 out:
 	return -ENOMEM;
 }
@@ -235,7 +226,6 @@ static struct inode *alloc_inode(struct super_block *sb)
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
-	ima_inode_free(inode);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
 #ifdef CONFIG_FS_POSIX_ACL
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index defb932eee9..0b3fa7974fa 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -36,286 +36,323 @@ static void *zisofs_zlib_workspace;
 static DEFINE_MUTEX(zisofs_zlib_lock);
 
 /*
- * When decompressing, we typically obtain more than one page
- * per reference.  We inject the additional pages into the page
- * cache as a form of readahead.
+ * Read data of @inode from @block_start to @block_end and uncompress
+ * to one zisofs block. Store the data in the @pages array with @pcount
+ * entries. Start storing at offset @poffset of the first page.
  */
-static int zisofs_readpage(struct file *file, struct page *page)
+static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
+				      loff_t block_end, int pcount,
+				      struct page **pages, unsigned poffset,
+				      int *errp)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
-	unsigned int maxpage, xpage, fpage, blockindex;
-	unsigned long offset;
-	unsigned long blockptr, blockendptr, cstart, cend, csize;
-	struct buffer_head *bh, *ptrbh[2];
-	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
-	unsigned int bufshift = ISOFS_BUFFER_BITS(inode);
-	unsigned long bufmask  = bufsize - 1;
-	int err = -EIO;
-	int i;
-	unsigned int header_size = ISOFS_I(inode)->i_format_parm[0];
 	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
-	/* unsigned long zisofs_block_size = 1UL << zisofs_block_shift; */
-	unsigned int zisofs_block_page_shift = zisofs_block_shift-PAGE_CACHE_SHIFT;
-	unsigned long zisofs_block_pages = 1UL << zisofs_block_page_shift;
-	unsigned long zisofs_block_page_mask = zisofs_block_pages-1;
-	struct page *pages[zisofs_block_pages];
-	unsigned long index = page->index;
-	int indexblocks;
-
-	/* We have already been given one page, this is the one
-	   we must do. */
-	xpage = index & zisofs_block_page_mask;
-	pages[xpage] = page;
- 
-	/* The remaining pages need to be allocated and inserted */
-	offset = index & ~zisofs_block_page_mask;
-	blockindex = offset >> zisofs_block_page_shift;
-	maxpage = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
-	/*
-	 * If this page is wholly outside i_size we just return zero;
-	 * do_generic_file_read() will handle this for us
-	 */
-	if (page->index >= maxpage) {
-		SetPageUptodate(page);
-		unlock_page(page);
+	unsigned int bufsize = ISOFS_BUFFER_SIZE(inode);
+	unsigned int bufshift = ISOFS_BUFFER_BITS(inode);
+	unsigned int bufmask = bufsize - 1;
+	int i, block_size = block_end - block_start;
+	z_stream stream = { .total_out = 0,
+			    .avail_in = 0,
+			    .avail_out = 0, };
+	int zerr;
+	int needblocks = (block_size + (block_start & bufmask) + bufmask)
+				>> bufshift;
+	int haveblocks;
+	blkcnt_t blocknum;
+	struct buffer_head *bhs[needblocks + 1];
+	int curbh, curpage;
+
+	if (block_size > deflateBound(1UL << zisofs_block_shift)) {
+		*errp = -EIO;
 		return 0;
 	}
-
-	maxpage = min(zisofs_block_pages, maxpage-offset);
-
-	for ( i = 0 ; i < maxpage ; i++, offset++ ) {
-		if ( i != xpage ) {
-			pages[i] = grab_cache_page_nowait(mapping, offset);
-		}
-		page = pages[i];
-		if ( page ) {
-			ClearPageError(page);
-			kmap(page);
+	/* Empty block? */
+	if (block_size == 0) {
+		for ( i = 0 ; i < pcount ; i++ ) {
+			if (!pages[i])
+				continue;
+			memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE);
+			flush_dcache_page(pages[i]);
+			SetPageUptodate(pages[i]);
 		}
+		return ((loff_t)pcount) << PAGE_CACHE_SHIFT;
 	}
 
-	/* This is the last page filled, plus one; used in case of abort. */
-	fpage = 0;
+	/* Because zlib is not thread-safe, do all the I/O at the top. */
+	blocknum = block_start >> bufshift;
+	memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *));
+	haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
+	ll_rw_block(READ, haveblocks, bhs);
 
-	/* Find the pointer to this specific chunk */
-	/* Note: we're not using isonum_731() here because the data is known aligned */
-	/* Note: header_size is in 32-bit words (4 bytes) */
-	blockptr = (header_size + blockindex) << 2;
-	blockendptr = blockptr + 4;
+	curbh = 0;
+	curpage = 0;
+	/*
+	 * First block is special since it may be fractional.  We also wait for
+	 * it before grabbing the zlib mutex; odds are that the subsequent
+	 * blocks are going to come in in short order so we don't hold the zlib
+	 * mutex longer than necessary.
+	 */
 
-	indexblocks = ((blockptr^blockendptr) >> bufshift) ? 2 : 1;
-	ptrbh[0] = ptrbh[1] = NULL;
+	if (!bhs[0])
+		goto b_eio;
 
-	if ( isofs_get_blocks(inode, blockptr >> bufshift, ptrbh, indexblocks) != indexblocks ) {
-		if ( ptrbh[0] ) brelse(ptrbh[0]);
-		printk(KERN_DEBUG "zisofs: Null buffer on reading block table, inode = %lu, block = %lu\n",
-		       inode->i_ino, blockptr >> bufshift);
-		goto eio;
-	}
-	ll_rw_block(READ, indexblocks, ptrbh);
-
-	bh = ptrbh[0];
-	if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
-		printk(KERN_DEBUG "zisofs: Failed to read block table, inode = %lu, block = %lu\n",
-		       inode->i_ino, blockptr >> bufshift);
-		if ( ptrbh[1] )
-			brelse(ptrbh[1]);
-		goto eio;
-	}
-	cstart = le32_to_cpu(*(__le32 *)(bh->b_data + (blockptr & bufmask)));
-
-	if ( indexblocks == 2 ) {
-		/* We just crossed a block boundary.  Switch to the next block */
-		brelse(bh);
-		bh = ptrbh[1];
-		if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
-			printk(KERN_DEBUG "zisofs: Failed to read block table, inode = %lu, block = %lu\n",
-			       inode->i_ino, blockendptr >> bufshift);
-			goto eio;
-		}
+	wait_on_buffer(bhs[0]);
+	if (!buffer_uptodate(bhs[0])) {
+		*errp = -EIO;
+		goto b_eio;
 	}
-	cend = le32_to_cpu(*(__le32 *)(bh->b_data + (blockendptr & bufmask)));
-	brelse(bh);
 
-	if (cstart > cend)
-		goto eio;
+	stream.workspace = zisofs_zlib_workspace;
+	mutex_lock(&zisofs_zlib_lock);
 		
-	csize = cend-cstart;
-
-	if (csize > deflateBound(1UL << zisofs_block_shift))
-		goto eio;
-
-	/* Now page[] contains an array of pages, any of which can be NULL,
-	   and the locks on which we hold.  We should now read the data and
-	   release the pages.  If the pages are NULL the decompressed data
-	   for that particular page should be discarded. */
-	
-	if ( csize == 0 ) {
-		/* This data block is empty. */
-
-		for ( fpage = 0 ; fpage < maxpage ; fpage++ ) {
-			if ( (page = pages[fpage]) != NULL ) {
-				memset(page_address(page), 0, PAGE_CACHE_SIZE);
-				
-				flush_dcache_page(page);
-				SetPageUptodate(page);
-				kunmap(page);
-				unlock_page(page);
-				if ( fpage == xpage )
-					err = 0; /* The critical page */
-				else
-					page_cache_release(page);
+	zerr = zlib_inflateInit(&stream);
+	if (zerr != Z_OK) {
+		if (zerr == Z_MEM_ERROR)
+			*errp = -ENOMEM;
+		else
+			*errp = -EIO;
+		printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n",
+			       zerr);
+		goto z_eio;
+	}
+
+	while (curpage < pcount && curbh < haveblocks &&
+	       zerr != Z_STREAM_END) {
+		if (!stream.avail_out) {
+			if (pages[curpage]) {
+				stream.next_out = page_address(pages[curpage])
+						+ poffset;
+				stream.avail_out = PAGE_CACHE_SIZE - poffset;
+				poffset = 0;
+			} else {
+				stream.next_out = (void *)&zisofs_sink_page;
+				stream.avail_out = PAGE_CACHE_SIZE;
 			}
 		}
-	} else {
-		/* This data block is compressed. */
-		z_stream stream;
-		int bail = 0, left_out = -1;
-		int zerr;
-		int needblocks = (csize + (cstart & bufmask) + bufmask) >> bufshift;
-		int haveblocks;
-		struct buffer_head *bhs[needblocks+1];
-		struct buffer_head **bhptr;
-
-		/* Because zlib is not thread-safe, do all the I/O at the top. */
-
-		blockptr = cstart >> bufshift;
-		memset(bhs, 0, (needblocks+1)*sizeof(struct buffer_head *));
-		haveblocks = isofs_get_blocks(inode, blockptr, bhs, needblocks);
-		ll_rw_block(READ, haveblocks, bhs);
-
-		bhptr = &bhs[0];
-		bh = *bhptr++;
-
-		/* First block is special since it may be fractional.
-		   We also wait for it before grabbing the zlib
-		   mutex; odds are that the subsequent blocks are
-		   going to come in in short order so we don't hold
-		   the zlib mutex longer than necessary. */
-
-		if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
-			printk(KERN_DEBUG "zisofs: Hit null buffer, fpage = %d, xpage = %d, csize = %ld\n",
-			       fpage, xpage, csize);
-			goto b_eio;
-		}
-		stream.next_in  = bh->b_data + (cstart & bufmask);
-		stream.avail_in = min(bufsize-(cstart & bufmask), csize);
-		csize -= stream.avail_in;
-
-		stream.workspace = zisofs_zlib_workspace;
-		mutex_lock(&zisofs_zlib_lock);
-		
-		zerr = zlib_inflateInit(&stream);
-		if ( zerr != Z_OK ) {
-			if ( err && zerr == Z_MEM_ERROR )
-				err = -ENOMEM;
-			printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n",
-			       zerr);
-			goto z_eio;
+		if (!stream.avail_in) {
+			wait_on_buffer(bhs[curbh]);
+			if (!buffer_uptodate(bhs[curbh])) {
+				*errp = -EIO;
+				break;
+			}
+			stream.next_in  = bhs[curbh]->b_data +
+						(block_start & bufmask);
+			stream.avail_in = min_t(unsigned, bufsize -
+						(block_start & bufmask),
+						block_size);
+			block_size -= stream.avail_in;
+			block_start = 0;
 		}
 
-		while ( !bail && fpage < maxpage ) {
-			page = pages[fpage];
-			if ( page )
-				stream.next_out = page_address(page);
-			else
-				stream.next_out = (void *)&zisofs_sink_page;
-			stream.avail_out = PAGE_CACHE_SIZE;
-
-			while ( stream.avail_out ) {
-				int ao, ai;
-				if ( stream.avail_in == 0 && left_out ) {
-					if ( !csize ) {
-						printk(KERN_WARNING "zisofs: ZF read beyond end of input\n");
-						bail = 1;
-						break;
-					} else {
-						bh = *bhptr++;
-						if ( !bh ||
-						     (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
-							/* Reached an EIO */
- 							printk(KERN_DEBUG "zisofs: Hit null buffer, fpage = %d, xpage = %d, csize = %ld\n",
-							       fpage, xpage, csize);
-							       
-							bail = 1;
-							break;
-						}
-						stream.next_in = bh->b_data;
-						stream.avail_in = min(csize,bufsize);
-						csize -= stream.avail_in;
-					}
-				}
-				ao = stream.avail_out;  ai = stream.avail_in;
-				zerr = zlib_inflate(&stream, Z_SYNC_FLUSH);
-				left_out = stream.avail_out;
-				if ( zerr == Z_BUF_ERROR && stream.avail_in == 0 )
-					continue;
-				if ( zerr != Z_OK ) {
-					/* EOF, error, or trying to read beyond end of input */
-					if ( err && zerr == Z_MEM_ERROR )
-						err = -ENOMEM;
-					if ( zerr != Z_STREAM_END )
-						printk(KERN_DEBUG "zisofs: zisofs_inflate returned %d, inode = %lu, index = %lu, fpage = %d, xpage = %d, avail_in = %d, avail_out = %d, ai = %d, ao = %d\n",
-						       zerr, inode->i_ino, index,
-						       fpage, xpage,
-						       stream.avail_in, stream.avail_out,
-						       ai, ao);
-					bail = 1;
-					break;
+		while (stream.avail_out && stream.avail_in) {
+			zerr = zlib_inflate(&stream, Z_SYNC_FLUSH);
+			if (zerr == Z_BUF_ERROR && stream.avail_in == 0)
+				break;
+			if (zerr == Z_STREAM_END)
+				break;
+			if (zerr != Z_OK) {
+				/* EOF, error, or trying to read beyond end of input */
+				if (zerr == Z_MEM_ERROR)
+					*errp = -ENOMEM;
+				else {
+					printk(KERN_DEBUG
+					       "zisofs: zisofs_inflate returned"
+					       " %d, inode = %lu,"
+					       " page idx = %d, bh idx = %d,"
+					       " avail_in = %d,"
+					       " avail_out = %d\n",
+					       zerr, inode->i_ino, curpage,
+					       curbh, stream.avail_in,
+					       stream.avail_out);
+					*errp = -EIO;
 				}
+				goto inflate_out;
 			}
+		}
 
-			if ( stream.avail_out && zerr == Z_STREAM_END ) {
-				/* Fractional page written before EOF.  This may
-				   be the last page in the file. */
-				memset(stream.next_out, 0, stream.avail_out);
-				stream.avail_out = 0;
+		if (!stream.avail_out) {
+			/* This page completed */
+			if (pages[curpage]) {
+				flush_dcache_page(pages[curpage]);
+				SetPageUptodate(pages[curpage]);
 			}
+			curpage++;
+		}
+		if (!stream.avail_in)
+			curbh++;
+	}
+inflate_out:
+	zlib_inflateEnd(&stream);
 
-			if ( !stream.avail_out ) {
-				/* This page completed */
-				if ( page ) {
-					flush_dcache_page(page);
-					SetPageUptodate(page);
-					kunmap(page);
-					unlock_page(page);
-					if ( fpage == xpage )
-						err = 0; /* The critical page */
-					else
-						page_cache_release(page);
-				}
-				fpage++;
-			}
+z_eio:
+	mutex_unlock(&zisofs_zlib_lock);
+
+b_eio:
+	for (i = 0; i < haveblocks; i++)
+		brelse(bhs[i]);
+	return stream.total_out;
+}
+
+/*
+ * Uncompress data so that pages[full_page] is fully uptodate and possibly
+ * fills in other pages if we have data for them.
+ */
+static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
+			     struct page **pages)
+{
+	loff_t start_off, end_off;
+	loff_t block_start, block_end;
+	unsigned int header_size = ISOFS_I(inode)->i_format_parm[0];
+	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
+	unsigned int blockptr;
+	loff_t poffset = 0;
+	blkcnt_t cstart_block, cend_block;
+	struct buffer_head *bh;
+	unsigned int blkbits = ISOFS_BUFFER_BITS(inode);
+	unsigned int blksize = 1 << blkbits;
+	int err;
+	loff_t ret;
+
+	BUG_ON(!pages[full_page]);
+
+	/*
+	 * We want to read at least 'full_page' page. Because we have to
+	 * uncompress the whole compression block anyway, fill the surrounding
+	 * pages with the data we have anyway...
+	 */
+	start_off = page_offset(pages[full_page]);
+	end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size);
+
+	cstart_block = start_off >> zisofs_block_shift;
+	cend_block = (end_off + (1 << zisofs_block_shift) - 1)
+			>> zisofs_block_shift;
+
+	WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) !=
+		((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK));
+
+	/* Find the pointer to this specific chunk */
+	/* Note: we're not using isonum_731() here because the data is known aligned */
+	/* Note: header_size is in 32-bit words (4 bytes) */
+	blockptr = (header_size + cstart_block) << 2;
+	bh = isofs_bread(inode, blockptr >> blkbits);
+	if (!bh)
+		return -EIO;
+	block_start = le32_to_cpu(*(__le32 *)
+				(bh->b_data + (blockptr & (blksize - 1))));
+
+	while (cstart_block < cend_block && pcount > 0) {
+		/* Load end of the compressed block in the file */
+		blockptr += 4;
+		/* Traversed to next block? */
+		if (!(blockptr & (blksize - 1))) {
+			brelse(bh);
+
+			bh = isofs_bread(inode, blockptr >> blkbits);
+			if (!bh)
+				return -EIO;
+		}
+		block_end = le32_to_cpu(*(__le32 *)
+				(bh->b_data + (blockptr & (blksize - 1))));
+		if (block_start > block_end) {
+			brelse(bh);
+			return -EIO;
+		}
+		err = 0;
+		ret = zisofs_uncompress_block(inode, block_start, block_end,
+					      pcount, pages, poffset, &err);
+		poffset += ret;
+		pages += poffset >> PAGE_CACHE_SHIFT;
+		pcount -= poffset >> PAGE_CACHE_SHIFT;
+		full_page -= poffset >> PAGE_CACHE_SHIFT;
+		poffset &= ~PAGE_CACHE_MASK;
+
+		if (err) {
+			brelse(bh);
+			/*
+			 * Did we finish reading the page we really wanted
+			 * to read?
+			 */
+			if (full_page < 0)
+				return 0;
+			return err;
 		}
-		zlib_inflateEnd(&stream);
 
-	z_eio:
-		mutex_unlock(&zisofs_zlib_lock);
+		block_start = block_end;
+		cstart_block++;
+	}
+
+	if (poffset && *pages) {
+		memset(page_address(*pages) + poffset, 0,
+		       PAGE_CACHE_SIZE - poffset);
+		flush_dcache_page(*pages);
+		SetPageUptodate(*pages);
+	}
+	return 0;
+}
 
-	b_eio:
-		for ( i = 0 ; i < haveblocks ; i++ ) {
-			if ( bhs[i] )
-				brelse(bhs[i]);
+/*
+ * When decompressing, we typically obtain more than one page
+ * per reference.  We inject the additional pages into the page
+ * cache as a form of readahead.
+ */
+static int zisofs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	int err;
+	int i, pcount, full_page;
+	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
+	unsigned int zisofs_pages_per_cblock =
+		PAGE_CACHE_SHIFT <= zisofs_block_shift ?
+		(1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0;
+	struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)];
+	pgoff_t index = page->index, end_index;
+
+	end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	/*
+	 * If this page is wholly outside i_size we just return zero;
+	 * do_generic_file_read() will handle this for us
+	 */
+	if (index >= end_index) {
+		SetPageUptodate(page);
+		unlock_page(page);
+		return 0;
+	}
+
+	if (PAGE_CACHE_SHIFT <= zisofs_block_shift) {
+		/* We have already been given one page, this is the one
+		   we must do. */
+		full_page = index & (zisofs_pages_per_cblock - 1);
+		pcount = min_t(int, zisofs_pages_per_cblock,
+			end_index - (index & ~(zisofs_pages_per_cblock - 1)));
+		index -= full_page;
+	} else {
+		full_page = 0;
+		pcount = 1;
+	}
+	pages[full_page] = page;
+
+	for (i = 0; i < pcount; i++, index++) {
+		if (i != full_page)
+			pages[i] = grab_cache_page_nowait(mapping, index);
+		if (pages[i]) {
+			ClearPageError(pages[i]);
+			kmap(pages[i]);
 		}
 	}
 
-eio:
+	err = zisofs_fill_pages(inode, full_page, pcount, pages);
 
 	/* Release any residual pages, do not SetPageUptodate */
-	while ( fpage < maxpage ) {
-		page = pages[fpage];
-		if ( page ) {
-			flush_dcache_page(page);
-			if ( fpage == xpage )
-				SetPageError(page);
-			kunmap(page);
-			unlock_page(page);
-			if ( fpage != xpage )
-				page_cache_release(page);
+	for (i = 0; i < pcount; i++) {
+		if (pages[i]) {
+			flush_dcache_page(pages[i]);
+			if (i == full_page && err)
+				SetPageError(pages[i]);
+			kunmap(pages[i]);
+			unlock_page(pages[i]);
+			if (i != full_page)
+				page_cache_release(pages[i]);
 		}
-		fpage++;
 	}			
 
 	/* At this point, err contains 0 or -EIO depending on the "critical" page */
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index c2fb2dd0131..96a685c550f 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -518,8 +518,7 @@ repeat:
 			if (algo == SIG('p', 'z')) {
 				int block_shift =
 					isonum_711(&rr->u.ZF.parms[1]);
-				if (block_shift < PAGE_CACHE_SHIFT
-						|| block_shift > 17) {
+				if (block_shift > 17) {
 					printk(KERN_WARNING "isofs: "
 						"Can't handle ZF block "
 						"size of 2^%d\n",
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
index 4e28beeed15..a8408983abd 100644
--- a/fs/jbd/Kconfig
+++ b/fs/jbd/Kconfig
@@ -1,5 +1,6 @@
 config JBD
 	tristate
+	select FS_JOURNAL_INFO
 	help
 	  This is a generic journalling layer for block devices.  It is
 	  currently used by the ext3 file system, but it could also be
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index f32f346f4b0..0f7d1ceafdf 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -1,6 +1,7 @@
 config JBD2
 	tristate
 	select CRC32
+	select FS_JOURNAL_INFO
 	help
 	  This is a generic journaling layer for block devices that support
 	  both 32-bit and 64-bit block numbers.  It is currently used by
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d4cfd6d2779..6a10238d2c6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -286,7 +286,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 		if (err) {
 			/*
 			 * Because AS_EIO is cleared by
-			 * wait_on_page_writeback_range(), set it again so
+			 * filemap_fdatawait_range(), set it again so
 			 * that user process can get -EIO from fsync().
 			 */
 			set_bit(AS_EIO,
@@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		JBUFFER_TRACE(jh, "ph3: write metadata");
 		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 						      jh, &new_jh, blocknr);
+		if (flags < 0) {
+			jbd2_journal_abort(journal, flags);
+			continue;
+		}
 		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 		wbuf[bufs++] = jh2bh(new_jh);
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fed85388ee8..b7ca3a92a4d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
 EXPORT_SYMBOL(jbd2_journal_ack_err);
 EXPORT_SYMBOL(jbd2_journal_clear_err);
 EXPORT_SYMBOL(jbd2_log_wait_commit);
+EXPORT_SYMBOL(jbd2_log_start_commit);
 EXPORT_SYMBOL(jbd2_journal_start_commit);
 EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
 EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -358,6 +359,10 @@ repeat:
 
 		jbd_unlock_bh_state(bh_in);
 		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
+		if (!tmp) {
+			jbd2_journal_put_journal_head(new_jh);
+			return -ENOMEM;
+		}
 		jbd_lock_bh_state(bh_in);
 		if (jh_in->b_frozen_data) {
 			jbd2_free(tmp, bh_in->b_size);
@@ -1248,6 +1253,13 @@ int jbd2_journal_load(journal_t *journal)
 	if (jbd2_journal_recover(journal))
 		goto recovery_error;
 
+	if (journal->j_failed_commit) {
+		printk(KERN_ERR "JBD2: journal transaction %u on %s "
+		       "is corrupt.\n", journal->j_failed_commit,
+		       journal->j_devname);
+		return -EIO;
+	}
+
 	/* OK, we've finished with the dynamic journal bits:
 	 * reinitialise the dynamic contents of the superblock in memory
 	 * and reset them on disk. */
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index f25e70c1b51..f0294410868 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -177,7 +177,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 		spin_unlock(&jffs2_compressor_list_lock);
 		break;
 	default:
-		printk(KERN_ERR "JFFS2: unknow compression mode.\n");
+		printk(KERN_ERR "JFFS2: unknown compression mode.\n");
 	}
  out:
 	if (ret == JFFS2_COMPR_NONE) {
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 090c556ffed..3b6f2fa12cf 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -700,7 +700,8 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
 	struct jffs2_raw_inode ri;
 	struct jffs2_node_frag *last_frag;
 	union jffs2_device_node dev;
-	char *mdata = NULL, mdatalen = 0;
+	char *mdata = NULL;
+	int mdatalen = 0;
 	uint32_t alloclen, ilen;
 	int ret;
 
diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c
index cfe05c1966a..3f39be1b045 100644
--- a/fs/jffs2/read.c
+++ b/fs/jffs2/read.c
@@ -164,12 +164,15 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 
 	/* XXX FIXME: Where a single physical node actually shows up in two
 	   frags, we read it twice. Don't do that. */
-	/* Now we're pointing at the first frag which overlaps our page */
+	/* Now we're pointing at the first frag which overlaps our page
+	 * (or perhaps is before it, if we've been asked to read off the
+	 * end of the file). */
 	while(offset < end) {
 		D2(printk(KERN_DEBUG "jffs2_read_inode_range: offset %d, end %d\n", offset, end));
-		if (unlikely(!frag || frag->ofs > offset)) {
+		if (unlikely(!frag || frag->ofs > offset ||
+			     frag->ofs + frag->size <= offset)) {
 			uint32_t holesize = end - offset;
-			if (frag) {
+			if (frag && frag->ofs > offset) {
 				D1(printk(KERN_NOTICE "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n", f->inocache->ino, frag->ofs, offset));
 				holesize = min(holesize, frag->ofs - offset);
 			}
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1a80301004b..e22de8397b7 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -931,7 +931,7 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
  * Helper function for jffs2_get_inode_nodes().
  * The function detects whether more data should be read and reads it if yes.
  *
- * Returns: 0 on succes;
+ * Returns: 0 on success;
  * 	    negative error code on failure.
  */
 static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
@@ -1284,7 +1284,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
 				f->target = NULL;
 				mutex_unlock(&f->sem);
 				jffs2_do_clear_inode(c, f);
-				return -ret;
+				return ret;
 			}
 
 			f->target[je32_to_cpu(latest_node->csize)] = '\0';
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 6caf1e1ee26..800171dca53 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -23,7 +23,7 @@
 
 int jffs2_sum_init(struct jffs2_sb_info *c)
 {
-	uint32_t sum_size = max_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE);
+	uint32_t sum_size = min_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE);
 
 	c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
 
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 082e844ab2d..4b107881acd 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -31,7 +31,7 @@
  *   is used to release xattr name/value pair and detach from c->xattrindex.
  * reclaim_xattr_datum(c)
  *   is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
- *   memory usage by cache is over c->xdatum_mem_threshold. Currentry, this threshold 
+ *   memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold 
  *   is hard coded as 32KiB.
  * do_verify_xattr_datum(c, xd)
  *   is used to load the xdatum informations without name/value pair from the medium.
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 2bc7d8aa574..d9b031cf69f 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -755,7 +755,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	 * allocation group.
 	 */
 	if ((blkno & (bmp->db_agsize - 1)) == 0)
-		/* check if the AG is currenly being written to.
+		/* check if the AG is currently being written to.
 		 * if so, call dbNextAG() to find a non-busy
 		 * AG with sufficient free space.
 		 */
@@ -3337,7 +3337,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
 	for (i = 0, n = 0; i < agno; n++) {
 		bmp->db_agfree[n] = 0;	/* init collection point */
 
-		/* coalesce cotiguous k AGs; */
+		/* coalesce contiguous k AGs; */
 		for (j = 0; j < k && i < agno; j++, i++) {
 			/* merge AGi to AGn */
 			bmp->db_agfree[n] += bmp->db_agfree[i];
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 1a54ae14a19..e50cfa3d965 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -371,82 +371,74 @@ EXPORT_SYMBOL_GPL(lockd_down);
 
 static ctl_table nlm_sysctls[] = {
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nlm_grace_period",
 		.data		= &nlm_grace_period,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= (unsigned long *) &nlm_grace_period_min,
 		.extra2		= (unsigned long *) &nlm_grace_period_max,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nlm_timeout",
 		.data		= &nlm_timeout,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= (unsigned long *) &nlm_timeout_min,
 		.extra2		= (unsigned long *) &nlm_timeout_max,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nlm_udpport",
 		.data		= &nlm_udpport,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= (int *) &nlm_port_min,
 		.extra2		= (int *) &nlm_port_max,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nlm_tcpport",
 		.data		= &nlm_tcpport,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= (int *) &nlm_port_min,
 		.extra2		= (int *) &nlm_port_max,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nsm_use_hostnames",
 		.data		= &nsm_use_hostnames,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nsm_local_state",
 		.data		= &nsm_local_state,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static ctl_table nlm_sysctl_dir[] = {
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nfs",
 		.mode		= 0555,
 		.child		= nlm_sysctls,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static ctl_table nlm_sysctl_root[] = {
 	{
-		.ctl_name	= CTL_FS,
 		.procname	= "fs",
 		.mode		= 0555,
 		.child		= nlm_sysctl_dir,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 #endif	/* CONFIG_SYSCTL */
diff --git a/fs/namei.c b/fs/namei.c
index d11f404667e..87f97ba90ad 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1279,28 +1279,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	return __lookup_hash(&this, base, NULL);
 }
 
-/**
- * lookup_one_noperm - bad hack for sysfs
- * @name:	pathname component to lookup
- * @base:	base directory to lookup from
- *
- * This is a variant of lookup_one_len that doesn't perform any permission
- * checks.   It's a horrible hack to work around the braindead sysfs
- * architecture and should not be used anywhere else.
- *
- * DON'T USE THIS FUNCTION EVER, thanks.
- */
-struct dentry *lookup_one_noperm(const char *name, struct dentry *base)
-{
-	int err;
-	struct qstr this;
-
-	err = __lookup_one_len(name, &this, base, strlen(name));
-	if (err)
-		return ERR_PTR(err);
-	return __lookup_hash(&this, base, NULL);
-}
-
 int user_path_at(int dfd, const char __user *name, unsigned flags,
 		 struct path *path)
 {
@@ -1678,6 +1656,15 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	int will_write;
 	int flag = open_to_namei_flags(open_flag);
 
+	/*
+	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
+	 * check for O_DSYNC if the need any syncing at all we enforce it's
+	 * always set instead of having to deal with possibly weird behaviour
+	 * for malicious applications setting only __O_SYNC.
+	 */
+	if (open_flag & __O_SYNC)
+		open_flag |= O_DSYNC;
+
 	if (!acc_mode)
 		acc_mode = MAY_OPEN | ACC_MODE(flag);
 
diff --git a/fs/namespace.c b/fs/namespace.c
index bdc3cb4fd22..7d70d63ceb2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1921,6 +1921,16 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	if (data_page)
 		((char *)data_page)[PAGE_SIZE - 1] = 0;
 
+	/* ... and get the mountpoint */
+	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
+	if (retval)
+		return retval;
+
+	retval = security_sb_mount(dev_name, &path,
+				   type_page, flags, data_page);
+	if (retval)
+		goto dput_out;
+
 	/* Default to relatime unless overriden */
 	if (!(flags & MS_NOATIME))
 		mnt_flags |= MNT_RELATIME;
@@ -1945,16 +1955,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
 		   MS_STRICTATIME);
 
-	/* ... and get the mountpoint */
-	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
-	if (retval)
-		return retval;
-
-	retval = security_sb_mount(dev_name, &path,
-				   type_page, flags, data_page);
-	if (retval)
-		goto dput_out;
-
 	if (flags & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 0d58caf4a6e..ec8f45f12e0 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -835,7 +835,7 @@ static int ncp_ioctl_need_write(unsigned int cmd)
 	case NCP_IOC_SETROOT:
 		return 0;
 	default:
-		/* unkown IOCTL command, assume write */
+		/* unknown IOCTL command, assume write */
 		return 1;
 	}
 }
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 293fa0528a6..73ab220354d 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -78,11 +78,6 @@ nfs4_callback_svc(void *vrqstp)
 
 	set_freezable();
 
-	/*
-	 * FIXME: do we really need to run this under the BKL? If so, please
-	 * add a comment about what it's intended to protect.
-	 */
-	lock_kernel();
 	while (!kthread_should_stop()) {
 		/*
 		 * Listen for a request on the socket
@@ -104,7 +99,6 @@ nfs4_callback_svc(void *vrqstp)
 		preverr = err;
 		svc_process(rqstp);
 	}
-	unlock_kernel();
 	return 0;
 }
 
@@ -160,11 +154,6 @@ nfs41_callback_svc(void *vrqstp)
 
 	set_freezable();
 
-	/*
-	 * FIXME: do we really need to run this under the BKL? If so, please
-	 * add a comment about what it's intended to protect.
-	 */
-	lock_kernel();
 	while (!kthread_should_stop()) {
 		prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
 		spin_lock_bh(&serv->sv_cb_lock);
@@ -183,7 +172,6 @@ nfs41_callback_svc(void *vrqstp)
 		}
 		finish_wait(&serv->sv_cb_waitq, &wq);
 	}
-	unlock_kernel();
 	return 0;
 }
 
@@ -397,6 +385,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
  */
 static struct svc_version *nfs4_callback_version[] = {
 	[1] = &nfs4_callback_version1,
+	[4] = &nfs4_callback_version4,
 };
 
 static struct svc_stat nfs4_callback_stats;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 07baa8254ca..d4036be0b58 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -106,6 +106,19 @@ struct cb_sequenceres {
 extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
 				       struct cb_sequenceres *res);
 
+extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
+					     const nfs4_stateid *stateid);
+
+#define RCA4_TYPE_MASK_RDATA_DLG	0
+#define RCA4_TYPE_MASK_WDATA_DLG	1
+
+struct cb_recallanyargs {
+	struct sockaddr	*craa_addr;
+	uint32_t	craa_objs_to_keep;
+	uint32_t	craa_type_mask;
+};
+
+extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy);
 #endif /* CONFIG_NFS_V4_1 */
 
 extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
@@ -114,8 +127,9 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
 #ifdef CONFIG_NFS_V4
 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
 extern void nfs_callback_down(int minorversion);
+extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
+					    const nfs4_stateid *stateid);
 #endif /* CONFIG_NFS_V4 */
-
 /*
  * nfs41: Callbacks are expected to not cause substantial latency,
  * so we limit their concurrency to 1 by setting up the maximum number
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index b7da1f54da6..defa9b4c470 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -61,6 +61,16 @@ out:
 	return res->status;
 }
 
+static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *)
+{
+#if defined(CONFIG_NFS_V4_1)
+	if (clp->cl_minorversion > 0)
+		return nfs41_validate_delegation_stateid;
+#endif
+	return nfs4_validate_delegation_stateid;
+}
+
+
 __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 {
 	struct nfs_client *clp;
@@ -81,7 +91,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 		inode = nfs_delegation_find_inode(clp, &args->fh);
 		if (inode != NULL) {
 			/* Set up a helper thread to actually return the delegation */
-			switch(nfs_async_inode_return_delegation(inode, &args->stateid)) {
+			switch (nfs_async_inode_return_delegation(inode, &args->stateid,
+								  nfs_validate_delegation_stateid(clp))) {
 				case 0:
 					res = 0;
 					break;
@@ -102,8 +113,31 @@ out:
 	return res;
 }
 
+int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
+{
+	if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data,
+					 sizeof(delegation->stateid.data)) != 0)
+		return 0;
+	return 1;
+}
+
 #if defined(CONFIG_NFS_V4_1)
 
+int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
+{
+	if (delegation == NULL)
+		return 0;
+
+	/* seqid is 4-bytes long */
+	if (((u32 *) &stateid->data)[0] != 0)
+		return 0;
+	if (memcmp(&delegation->stateid.data[4], &stateid->data[4],
+		   sizeof(stateid->data)-4))
+		return 0;
+
+	return 1;
+}
+
 /*
  * Validate the sequenceID sent by the server.
  * Return success if the sequenceID is one more than what we last saw on
@@ -227,4 +261,32 @@ out:
 	return res->csr_status;
 }
 
+unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
+{
+	struct nfs_client *clp;
+	int status;
+	fmode_t flags = 0;
+
+	status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+	clp = nfs_find_client(args->craa_addr, 4);
+	if (clp == NULL)
+		goto out;
+
+	dprintk("NFS: RECALL_ANY callback request from %s\n",
+		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+	if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
+		     &args->craa_type_mask))
+		flags = FMODE_READ;
+	if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
+		     &args->craa_type_mask))
+		flags |= FMODE_WRITE;
+
+	if (flags)
+		nfs_expire_all_delegation_types(clp, flags);
+	status = htonl(NFS4_OK);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 76b0aa0f73b..8e1a2511c8b 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -23,6 +23,7 @@
 #if defined(CONFIG_NFS_V4_1)
 #define CB_OP_SEQUENCE_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ + \
 					4 + 1 + 3)
+#define CB_OP_RECALLANY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 #endif /* CONFIG_NFS_V4_1 */
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
@@ -326,6 +327,25 @@ out_free:
 	goto out;
 }
 
+static unsigned decode_recallany_args(struct svc_rqst *rqstp,
+				      struct xdr_stream *xdr,
+				      struct cb_recallanyargs *args)
+{
+	uint32_t *p;
+
+	args->craa_addr = svc_addr(rqstp);
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_BADXDR);
+	args->craa_objs_to_keep = ntohl(*p++);
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_BADXDR);
+	args->craa_type_mask = ntohl(*p);
+
+	return 0;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
@@ -533,6 +553,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
 	case OP_CB_GETATTR:
 	case OP_CB_RECALL:
 	case OP_CB_SEQUENCE:
+	case OP_CB_RECALL_ANY:
 		*op = &callback_ops[op_nr];
 		break;
 
@@ -540,7 +561,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
 	case OP_CB_NOTIFY_DEVICEID:
 	case OP_CB_NOTIFY:
 	case OP_CB_PUSH_DELEG:
-	case OP_CB_RECALL_ANY:
 	case OP_CB_RECALLABLE_OBJ_AVAIL:
 	case OP_CB_RECALL_SLOT:
 	case OP_CB_WANTS_CANCELLED:
@@ -688,6 +708,11 @@ static struct callback_op callback_ops[] = {
 		.encode_res = (callback_encode_res_t)encode_cb_sequence_res,
 		.res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ,
 	},
+	[OP_CB_RECALL_ANY] = {
+		.process_op = (callback_process_op_t)nfs4_callback_recallany,
+		.decode_args = (callback_decode_arg_t)decode_recallany_args,
+		.res_maxsize = CB_OP_RECALLANY_RES_MAXSZ,
+	},
 #endif /* CONFIG_NFS_V4_1 */
 };
 
@@ -718,3 +743,10 @@ struct svc_version nfs4_callback_version1 = {
 	.vs_dispatch = NULL,
 };
 
+struct svc_version nfs4_callback_version4 = {
+	.vs_vers = 4,
+	.vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
+	.vs_proc = nfs4_callback_procedures1,
+	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
+	.vs_dispatch = NULL,
+};
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 99ea196f071..ee77713ce68 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1260,10 +1260,20 @@ error:
 static void nfs4_session_set_rwsize(struct nfs_server *server)
 {
 #ifdef CONFIG_NFS_V4_1
+	struct nfs4_session *sess;
+	u32 server_resp_sz;
+	u32 server_rqst_sz;
+
 	if (!nfs4_has_session(server->nfs_client))
 		return;
-	server->rsize = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
-	server->wsize = server->nfs_client->cl_session->fc_attrs.max_rqst_sz;
+	sess = server->nfs_client->cl_session;
+	server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead;
+	server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead;
+
+	if (server->rsize > server_resp_sz)
+		server->rsize = server_resp_sz;
+	if (server->wsize > server_rqst_sz)
+		server->wsize = server_rqst_sz;
 #endif /* CONFIG_NFS_V4_1 */
 }
 
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 6dd48a4405b..2563bebc4c6 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -92,7 +92,7 @@ out:
 	return status;
 }
 
-static void nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid)
+static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_open_context *ctx;
@@ -116,10 +116,11 @@ again:
 			err = nfs_delegation_claim_locks(ctx, state);
 		put_nfs_open_context(ctx);
 		if (err != 0)
-			return;
+			return err;
 		goto again;
 	}
 	spin_unlock(&inode->i_lock);
+	return 0;
 }
 
 /*
@@ -261,30 +262,34 @@ static void nfs_msync_inode(struct inode *inode)
 /*
  * Basic procedure for returning a delegation to the server
  */
-static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
+static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	int err;
 
-	nfs_msync_inode(inode);
 	/*
 	 * Guard against new delegated open/lock/unlock calls and against
 	 * state recovery
 	 */
 	down_write(&nfsi->rwsem);
-	nfs_delegation_claim_opens(inode, &delegation->stateid);
+	err = nfs_delegation_claim_opens(inode, &delegation->stateid);
 	up_write(&nfsi->rwsem);
-	nfs_msync_inode(inode);
+	if (err)
+		goto out;
 
-	return nfs_do_return_delegation(inode, delegation, 1);
+	err = nfs_do_return_delegation(inode, delegation, issync);
+out:
+	return err;
 }
 
 /*
  * Return all delegations that have been marked for return
  */
-void nfs_client_return_marked_delegations(struct nfs_client *clp)
+int nfs_client_return_marked_delegations(struct nfs_client *clp)
 {
 	struct nfs_delegation *delegation;
 	struct inode *inode;
+	int err = 0;
 
 restart:
 	rcu_read_lock();
@@ -298,12 +303,18 @@ restart:
 		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
 		spin_unlock(&clp->cl_lock);
 		rcu_read_unlock();
-		if (delegation != NULL)
-			__nfs_inode_return_delegation(inode, delegation);
+		if (delegation != NULL) {
+			filemap_flush(inode->i_mapping);
+			err = __nfs_inode_return_delegation(inode, delegation, 0);
+		}
 		iput(inode);
-		goto restart;
+		if (!err)
+			goto restart;
+		set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+		return err;
 	}
 	rcu_read_unlock();
+	return 0;
 }
 
 /*
@@ -338,8 +349,10 @@ int nfs_inode_return_delegation(struct inode *inode)
 		spin_lock(&clp->cl_lock);
 		delegation = nfs_detach_delegation_locked(nfsi, NULL);
 		spin_unlock(&clp->cl_lock);
-		if (delegation != NULL)
-			err = __nfs_inode_return_delegation(inode, delegation);
+		if (delegation != NULL) {
+			nfs_msync_inode(inode);
+			err = __nfs_inode_return_delegation(inode, delegation, 1);
+		}
 	}
 	return err;
 }
@@ -368,33 +381,47 @@ void nfs_super_return_all_delegations(struct super_block *sb)
 		spin_unlock(&delegation->lock);
 	}
 	rcu_read_unlock();
-	nfs_client_return_marked_delegations(clp);
+	if (nfs_client_return_marked_delegations(clp) != 0)
+		nfs4_schedule_state_manager(clp);
 }
 
-static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
+static
+void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags)
 {
 	struct nfs_delegation *delegation;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
-		set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+		if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
+			continue;
+		if (delegation->type & flags)
+			nfs_mark_return_delegation(clp, delegation);
 	}
 	rcu_read_unlock();
 }
 
+static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
+{
+	nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
+}
+
 static void nfs_delegation_run_state_manager(struct nfs_client *clp)
 {
 	if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
 		nfs4_schedule_state_manager(clp);
 }
 
-void nfs_expire_all_delegations(struct nfs_client *clp)
+void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
 {
-	nfs_client_mark_return_all_delegations(clp);
+	nfs_client_mark_return_all_delegation_types(clp, flags);
 	nfs_delegation_run_state_manager(clp);
 }
 
+void nfs_expire_all_delegations(struct nfs_client *clp)
+{
+	nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
+}
+
 /*
  * Return all delegations following an NFS4ERR_CB_PATH_DOWN error.
  */
@@ -413,8 +440,7 @@ static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *c
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
 		if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
 			continue;
-		set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
-		set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+		nfs_mark_return_delegation(clp, delegation);
 	}
 	rcu_read_unlock();
 }
@@ -428,18 +454,21 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
 /*
  * Asynchronous delegation recall!
  */
-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
+int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid,
+				      int (*validate_stateid)(struct nfs_delegation *delegation,
+							      const nfs4_stateid *stateid))
 {
 	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_delegation *delegation;
 
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data,
-				sizeof(delegation->stateid.data)) != 0) {
+
+	if (!validate_stateid(delegation, stateid)) {
 		rcu_read_unlock();
 		return -ENOENT;
 	}
+
 	nfs_mark_return_delegation(clp, delegation);
 	rcu_read_unlock();
 	nfs_delegation_run_state_manager(clp);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 09f38379517..944b627ec6e 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -34,15 +34,18 @@ enum {
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 int nfs_inode_return_delegation(struct inode *inode);
-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
+int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid,
+				      int (*validate_stateid)(struct nfs_delegation *delegation,
+							      const nfs4_stateid *stateid));
 void nfs_inode_return_delegation_noreclaim(struct inode *inode);
 
 struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
 void nfs_super_return_all_delegations(struct super_block *sb);
 void nfs_expire_all_delegations(struct nfs_client *clp);
+void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
 void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
 void nfs_handle_cb_pathdown(struct nfs_client *clp);
-void nfs_client_return_marked_delegations(struct nfs_client *clp);
+int nfs_client_return_marked_delegations(struct nfs_client *clp);
 
 void nfs_delegation_mark_reclaim(struct nfs_client *clp);
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7cb298525ee..2c5ace4f00a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1579,55 +1579,46 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *dentry = NULL, *rehash = NULL;
 	int error = -EBUSY;
 
-	/*
-	 * To prevent any new references to the target during the rename,
-	 * we unhash the dentry and free the inode in advance.
-	 */
-	if (!d_unhashed(new_dentry)) {
-		d_drop(new_dentry);
-		rehash = new_dentry;
-	}
-
 	dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
 		 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 		 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
 		 atomic_read(&new_dentry->d_count));
 
 	/*
-	 * First check whether the target is busy ... we can't
-	 * safely do _any_ rename if the target is in use.
-	 *
-	 * For files, make a copy of the dentry and then do a 
-	 * silly-rename. If the silly-rename succeeds, the
-	 * copied dentry is hashed and becomes the new target.
+	 * For non-directories, check whether the target is busy and if so,
+	 * make a copy of the dentry and then do a silly-rename. If the
+	 * silly-rename succeeds, the copied dentry is hashed and becomes
+	 * the new target.
 	 */
-	if (!new_inode)
-		goto go_ahead;
-	if (S_ISDIR(new_inode->i_mode)) {
-		error = -EISDIR;
-		if (!S_ISDIR(old_inode->i_mode))
-			goto out;
-	} else if (atomic_read(&new_dentry->d_count) > 2) {
-		int err;
-		/* copy the target dentry's name */
-		dentry = d_alloc(new_dentry->d_parent,
-				 &new_dentry->d_name);
-		if (!dentry)
-			goto out;
+	if (new_inode && !S_ISDIR(new_inode->i_mode)) {
+		/*
+		 * To prevent any new references to the target during the
+		 * rename, we unhash the dentry in advance.
+		 */
+		if (!d_unhashed(new_dentry)) {
+			d_drop(new_dentry);
+			rehash = new_dentry;
+		}
+
+		if (atomic_read(&new_dentry->d_count) > 2) {
+			int err;
+
+			/* copy the target dentry's name */
+			dentry = d_alloc(new_dentry->d_parent,
+					 &new_dentry->d_name);
+			if (!dentry)
+				goto out;
 
-		/* silly-rename the existing target ... */
-		err = nfs_sillyrename(new_dir, new_dentry);
-		if (!err) {
-			new_dentry = rehash = dentry;
+			/* silly-rename the existing target ... */
+			err = nfs_sillyrename(new_dir, new_dentry);
+			if (err)
+				goto out;
+
+			new_dentry = dentry;
 			new_inode = NULL;
-			/* instantiate the replacement target */
-			d_instantiate(new_dentry, NULL);
-		} else if (atomic_read(&new_dentry->d_count) > 1)
-			/* dentry still busy? */
-			goto out;
+		}
 	}
 
-go_ahead:
 	/*
 	 * ... prune child dentries and writebacks if needed.
 	 */
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index f4d54ba97cc..95e1ca765d4 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -146,7 +146,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
 	return 0;
 }
 
-struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
+static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
 		struct nfs_dns_ent *key)
 {
 	struct cache_head *ch;
@@ -159,7 +159,7 @@ struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
 	return container_of(ch, struct nfs_dns_ent, h);
 }
 
-struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
+static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
 		struct nfs_dns_ent *new,
 		struct nfs_dns_ent *key)
 {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f5fdd39e037..6b891328f33 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -581,7 +581,7 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)
 {
 	struct nfs_open_context *ctx;
 
-	if (IS_SYNC(inode) || (filp->f_flags & O_SYNC))
+	if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
 		return 1;
 	ctx = nfs_file_open_context(filp);
 	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
@@ -622,7 +622,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 
 	nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
 	result = generic_file_aio_write(iocb, iov, nr_segs, pos);
-	/* Return error values for O_SYNC and IS_SYNC() */
+	/* Return error values for O_DSYNC and IS_SYNC() */
 	if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
 		int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
 		if (err < 0)
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 70fad69eb95..fa588006588 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -359,17 +359,13 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 
 	BUG_ON(!cookie);
 
-	if (fscache_check_page_write(cookie, page)) {
-		if (!(gfp & __GFP_WAIT))
-			return 0;
-		fscache_wait_on_page_write(cookie, page);
-	}
-
 	if (PageFsCache(page)) {
 		dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
 			 cookie, page, nfsi);
 
-		fscache_uncache_page(cookie, page);
+		if (!fscache_maybe_release_page(cookie, page, gfp))
+			return 0;
+
 		nfs_add_fscache_stats(page->mapping->host,
 				      NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
 	}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e21b1bb9972..29e464d23b3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -30,6 +30,15 @@ static inline int nfs4_has_session(const struct nfs_client *clp)
 	return 0;
 }
 
+static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
+{
+#ifdef CONFIG_NFS_V4_1
+	if (nfs4_has_session(clp))
+		return (clp->cl_session->flags & SESSION4_PERSIST);
+#endif /* CONFIG_NFS_V4_1 */
+	return 0;
+}
+
 struct nfs_clone_mount {
 	const struct super_block *sb;
 	const struct dentry *dentry;
@@ -156,6 +165,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
 
 /* callback_xdr.c */
 extern struct svc_version nfs4_callback_version1;
+extern struct svc_version nfs4_callback_version4;
 
 /* pagelist.c */
 extern int __init nfs_init_nfspagecache(void);
@@ -177,24 +187,14 @@ extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
 extern struct rpc_procinfo nfs3_procedures[];
 extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
 
-/* nfs4proc.c */
-static inline void nfs4_restart_rpc(struct rpc_task *task,
-				    const struct nfs_client *clp)
-{
-#ifdef CONFIG_NFS_V4_1
-	if (nfs4_has_session(clp) &&
-	    test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) {
-		rpc_restart_call_prepare(task);
-		return;
-	}
-#endif /* CONFIG_NFS_V4_1 */
-	rpc_restart_call(task);
-}
-
 /* nfs4xdr.c */
 #ifdef CONFIG_NFS_V4
 extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
 #endif
+#ifdef CONFIG_NFS_V4_1
+extern const u32 nfs41_maxread_overhead;
+extern const u32 nfs41_maxwrite_overhead;
+#endif
 
 /* nfs4proc.c */
 #ifdef CONFIG_NFS_V4
@@ -273,20 +273,6 @@ extern int _nfs4_call_sync_session(struct nfs_server *server,
 				   struct nfs4_sequence_res *res,
 				   int cache_reply);
 
-#ifdef CONFIG_NFS_V4_1
-extern void nfs41_sequence_free_slot(const struct nfs_client *,
-				     struct nfs4_sequence_res *res);
-#endif /* CONFIG_NFS_V4_1 */
-
-static inline void nfs4_sequence_free_slot(const struct nfs_client *clp,
-					   struct nfs4_sequence_res *res)
-{
-#ifdef CONFIG_NFS_V4_1
-	if (nfs4_has_session(clp))
-		nfs41_sequence_free_slot(clp, res);
-#endif /* CONFIG_NFS_V4_1 */
-}
-
 /*
  * Determine the device name as a string
  */
@@ -380,3 +366,15 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
 	return ((unsigned long)len + (unsigned long)base +
 		PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
+
+/*
+ * Helper for restarting RPC calls in the possible presence of NFSv4.1
+ * sessions.
+ */
+static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
+{
+	if (nfs4_has_session(clp))
+		rpc_restart_call_prepare(task);
+	else
+		rpc_restart_call(task);
+}
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index ceda50aad73..46d779abafd 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -25,13 +25,7 @@ struct nfs_iostats {
 static inline void nfs_inc_server_stats(const struct nfs_server *server,
 					enum nfs_stat_eventcounters stat)
 {
-	struct nfs_iostats *iostats;
-	int cpu;
-
-	cpu = get_cpu();
-	iostats = per_cpu_ptr(server->io_stats, cpu);
-	iostats->events[stat]++;
-	put_cpu();
+	this_cpu_inc(server->io_stats->events[stat]);
 }
 
 static inline void nfs_inc_stats(const struct inode *inode,
@@ -44,13 +38,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server,
 					enum nfs_stat_bytecounters stat,
 					unsigned long addend)
 {
-	struct nfs_iostats *iostats;
-	int cpu;
-
-	cpu = get_cpu();
-	iostats = per_cpu_ptr(server->io_stats, cpu);
-	iostats->bytes[stat] += addend;
-	put_cpu();
+	this_cpu_add(server->io_stats->bytes[stat], addend);
 }
 
 static inline void nfs_add_stats(const struct inode *inode,
@@ -65,13 +53,7 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
 					 enum nfs_stat_fscachecounters stat,
 					 unsigned long addend)
 {
-	struct nfs_iostats *iostats;
-	int cpu;
-
-	cpu = get_cpu();
-	iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
-	iostats->fscache[stat] += addend;
-	put_cpu();
+	this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
 }
 #endif
 
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 6ea07a3c75d..7e57b04e401 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,7 +44,8 @@ enum nfs4_client_state {
 	NFS4CLNT_RECLAIM_REBOOT,
 	NFS4CLNT_RECLAIM_NOGRACE,
 	NFS4CLNT_DELEGRETURN,
-	NFS4CLNT_SESSION_SETUP,
+	NFS4CLNT_SESSION_RESET,
+	NFS4CLNT_SESSION_DRAINING,
 };
 
 /*
@@ -180,6 +181,7 @@ struct nfs4_state_recovery_ops {
 	int (*recover_lock)(struct nfs4_state *, struct file_lock *);
 	int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
 	struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
+	int (*reclaim_complete)(struct nfs_client *);
 };
 
 struct nfs4_state_maintenance_ops {
@@ -200,9 +202,11 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
 /* nfs4proc.c */
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
 extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
+extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
@@ -218,9 +222,11 @@ extern int nfs4_setup_sequence(struct nfs_client *clp,
 		int cache_reply, struct rpc_task *task);
 extern void nfs4_destroy_session(struct nfs4_session *session);
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
-extern int nfs4_proc_create_session(struct nfs_client *, int reset);
+extern int nfs4_proc_create_session(struct nfs_client *);
 extern int nfs4_proc_destroy_session(struct nfs4_session *);
 extern int nfs4_init_session(struct nfs_server *server);
+extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
+		struct nfs_fsinfo *fsinfo);
 #else /* CONFIG_NFS_v4_1 */
 static inline int nfs4_setup_sequence(struct nfs_client *clp,
 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
@@ -267,6 +273,7 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
 extern void nfs4_schedule_state_recovery(struct nfs_client *);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
+extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
@@ -287,6 +294,7 @@ struct nfs4_mount_data;
 
 /* callback_xdr.c */
 extern struct svc_version nfs4_callback_version1;
+extern struct svc_version nfs4_callback_version4;
 
 #else
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 741a562177f..9f5f11ecfd9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -270,11 +270,18 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 		case -NFS4ERR_SEQ_MISORDERED:
 			dprintk("%s ERROR: %d Reset session\n", __func__,
 				errorcode);
-			set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+			nfs4_schedule_state_recovery(clp);
 			exception->retry = 1;
-			/* FALLTHROUGH */
+			break;
 #endif /* !defined(CONFIG_NFS_V4_1) */
 		case -NFS4ERR_FILE_OPEN:
+			if (exception->timeout > HZ) {
+				/* We have retried a decent amount, time to
+				 * fail
+				 */
+				ret = -EBUSY;
+				break;
+			}
 		case -NFS4ERR_GRACE:
 		case -NFS4ERR_DELAY:
 			ret = nfs4_delay(server->client, &exception->timeout);
@@ -311,48 +318,54 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
  * so we need to scan down from highest_used_slotid to 0 looking for the now
  * highest slotid in use.
  * If none found, highest_used_slotid is set to -1.
+ *
+ * Must be called while holding tbl->slot_tbl_lock
  */
 static void
 nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
 {
 	int slotid = free_slotid;
 
-	spin_lock(&tbl->slot_tbl_lock);
 	/* clear used bit in bitmap */
 	__clear_bit(slotid, tbl->used_slots);
 
 	/* update highest_used_slotid when it is freed */
 	if (slotid == tbl->highest_used_slotid) {
 		slotid = find_last_bit(tbl->used_slots, tbl->max_slots);
-		if (slotid >= 0 && slotid < tbl->max_slots)
+		if (slotid < tbl->max_slots)
 			tbl->highest_used_slotid = slotid;
 		else
 			tbl->highest_used_slotid = -1;
 	}
-	rpc_wake_up_next(&tbl->slot_tbl_waitq);
-	spin_unlock(&tbl->slot_tbl_lock);
 	dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__,
 		free_slotid, tbl->highest_used_slotid);
 }
 
-void nfs41_sequence_free_slot(const struct nfs_client *clp,
+static void nfs41_sequence_free_slot(const struct nfs_client *clp,
 			      struct nfs4_sequence_res *res)
 {
 	struct nfs4_slot_table *tbl;
 
-	if (!nfs4_has_session(clp)) {
-		dprintk("%s: No session\n", __func__);
-		return;
-	}
 	tbl = &clp->cl_session->fc_slot_table;
 	if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
-		dprintk("%s: No slot\n", __func__);
 		/* just wake up the next guy waiting since
 		 * we may have not consumed a slot after all */
-		rpc_wake_up_next(&tbl->slot_tbl_waitq);
+		dprintk("%s: No slot\n", __func__);
 		return;
 	}
+
+	spin_lock(&tbl->slot_tbl_lock);
 	nfs4_free_slot(tbl, res->sr_slotid);
+
+	/* Signal state manager thread if session is drained */
+	if (test_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) {
+		if (tbl->highest_used_slotid == -1) {
+			dprintk("%s COMPLETE: Session Drained\n", __func__);
+			complete(&clp->cl_session->complete);
+		}
+	} else
+		rpc_wake_up_next(&tbl->slot_tbl_waitq);
+	spin_unlock(&tbl->slot_tbl_lock);
 	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
 }
 
@@ -377,10 +390,10 @@ static void nfs41_sequence_done(struct nfs_client *clp,
 	if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
 		goto out;
 
-	tbl = &clp->cl_session->fc_slot_table;
-	slot = tbl->slots + res->sr_slotid;
-
+	/* Check the SEQUENCE operation status */
 	if (res->sr_status == 0) {
+		tbl = &clp->cl_session->fc_slot_table;
+		slot = tbl->slots + res->sr_slotid;
 		/* Update the slot's sequence and clientid lease timer */
 		++slot->seq_nr;
 		timestamp = res->sr_renewal_time;
@@ -388,7 +401,8 @@ static void nfs41_sequence_done(struct nfs_client *clp,
 		if (time_before(clp->cl_last_renewal, timestamp))
 			clp->cl_last_renewal = timestamp;
 		spin_unlock(&clp->cl_lock);
-		return;
+		/* Check sequence flags */
+		nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
 	}
 out:
 	/* The session may be reset by one of the error handlers. */
@@ -429,24 +443,6 @@ out:
 	return ret_id;
 }
 
-static int nfs4_recover_session(struct nfs4_session *session)
-{
-	struct nfs_client *clp = session->clp;
-	unsigned int loop;
-	int ret;
-
-	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
-		ret = nfs4_wait_clnt_recover(clp);
-		if (ret != 0)
-			break;
-		if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
-			break;
-		nfs4_schedule_state_manager(clp);
-		ret = -EIO;
-	}
-	return ret;
-}
-
 static int nfs41_setup_sequence(struct nfs4_session *session,
 				struct nfs4_sequence_args *args,
 				struct nfs4_sequence_res *res,
@@ -455,7 +451,6 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 {
 	struct nfs4_slot *slot;
 	struct nfs4_slot_table *tbl;
-	int status = 0;
 	u8 slotid;
 
 	dprintk("--> %s\n", __func__);
@@ -468,21 +463,15 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 	tbl = &session->fc_slot_table;
 
 	spin_lock(&tbl->slot_tbl_lock);
-	if (test_bit(NFS4CLNT_SESSION_SETUP, &session->clp->cl_state)) {
-		if (tbl->highest_used_slotid != -1) {
-			rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
-			spin_unlock(&tbl->slot_tbl_lock);
-			dprintk("<-- %s: Session reset: draining\n", __func__);
-			return -EAGAIN;
-		}
-
-		/* The slot table is empty; start the reset thread */
-		dprintk("%s Session Reset\n", __func__);
+	if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state)) {
+		/*
+		 * The state manager will wait until the slot table is empty.
+		 * Schedule the reset thread
+		 */
+		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
 		spin_unlock(&tbl->slot_tbl_lock);
-		status = nfs4_recover_session(session);
-		if (status)
-			return status;
-		spin_lock(&tbl->slot_tbl_lock);
+		dprintk("%s Schedule Session Reset\n", __func__);
+		return -EAGAIN;
 	}
 
 	slotid = nfs4_find_slot(tbl, task);
@@ -527,7 +516,7 @@ int nfs4_setup_sequence(struct nfs_client *clp,
 		goto out;
 	ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply,
 				   task);
-	if (ret != -EAGAIN) {
+	if (ret && ret != -EAGAIN) {
 		/* terminate rpc task */
 		task->tk_status = ret;
 		task->tk_action = NULL;
@@ -561,7 +550,6 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
 	struct nfs41_call_sync_data *data = calldata;
 
 	nfs41_sequence_done(data->clp, data->seq_res, task->tk_status);
-	nfs41_sequence_free_slot(data->clp, data->seq_res);
 }
 
 struct rpc_call_ops nfs41_call_sync_ops = {
@@ -637,15 +625,6 @@ static void nfs4_sequence_done(const struct nfs_server *server,
 #endif /* CONFIG_NFS_V4_1 */
 }
 
-/* no restart, therefore free slot here */
-static void nfs4_sequence_done_free_slot(const struct nfs_server *server,
-					 struct nfs4_sequence_res *res,
-					 int rpc_status)
-{
-	nfs4_sequence_done(server, res, rpc_status);
-	nfs4_sequence_free_slot(server->nfs_client, res);
-}
-
 static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 {
 	struct nfs_inode *nfsi = NFS_I(dir);
@@ -720,9 +699,15 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->o_arg.bitmask = server->attr_bitmask;
 	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
 	if (flags & O_EXCL) {
-		u32 *s = (u32 *) p->o_arg.u.verifier.data;
-		s[0] = jiffies;
-		s[1] = current->pid;
+		if (nfs4_has_persistent_session(server->nfs_client)) {
+			/* GUARDED */
+			p->o_arg.u.attrs = &p->attrs;
+			memcpy(&p->attrs, attrs, sizeof(p->attrs));
+		} else { /* EXCLUSIVE4_1 */
+			u32 *s = (u32 *) p->o_arg.u.verifier.data;
+			s[0] = jiffies;
+			s[1] = current->pid;
+		}
 	} else if (flags & O_CREAT) {
 		p->o_arg.u.attrs = &p->attrs;
 		memcpy(&p->attrs, attrs, sizeof(p->attrs));
@@ -776,13 +761,16 @@ static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode
 		goto out;
 	switch (mode & (FMODE_READ|FMODE_WRITE)) {
 		case FMODE_READ:
-			ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0;
+			ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0
+				&& state->n_rdonly != 0;
 			break;
 		case FMODE_WRITE:
-			ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0;
+			ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0
+				&& state->n_wronly != 0;
 			break;
 		case FMODE_READ|FMODE_WRITE:
-			ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
+			ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0
+				&& state->n_rdwr != 0;
 	}
 out:
 	return ret;
@@ -1183,6 +1171,14 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 			case -ENOENT:
 			case -ESTALE:
 				goto out;
+			case -NFS4ERR_BADSESSION:
+			case -NFS4ERR_BADSLOT:
+			case -NFS4ERR_BAD_HIGH_SLOT:
+			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+			case -NFS4ERR_DEADSESSION:
+				nfs4_schedule_state_recovery(
+					server->nfs_client);
+				goto out;
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_EXPIRED:
@@ -1336,8 +1332,8 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
 
 	data->rpc_status = task->tk_status;
 
-	nfs4_sequence_done_free_slot(data->o_arg.server, &data->o_res.seq_res,
-				     task->tk_status);
+	nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res,
+			task->tk_status);
 
 	if (RPC_ASSASSINATED(task))
 		return;
@@ -1488,7 +1484,7 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s
 	return ret;
 }
 
-static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
+static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	struct nfs4_exception exception = { };
@@ -1496,10 +1492,16 @@ static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4
 
 	do {
 		err = _nfs4_open_expired(ctx, state);
-		if (err != -NFS4ERR_DELAY)
-			break;
-		nfs4_handle_exception(server, err, &exception);
+		switch (err) {
+		default:
+			goto out;
+		case -NFS4ERR_GRACE:
+		case -NFS4ERR_DELAY:
+			nfs4_handle_exception(server, err, &exception);
+			err = 0;
+		}
 	} while (exception.retry);
+out:
 	return err;
 }
 
@@ -1712,6 +1714,18 @@ static void nfs4_free_closedata(void *data)
 	kfree(calldata);
 }
 
+static void nfs4_close_clear_stateid_flags(struct nfs4_state *state,
+		fmode_t fmode)
+{
+	spin_lock(&state->owner->so_lock);
+	if (!(fmode & FMODE_READ))
+		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+	if (!(fmode & FMODE_WRITE))
+		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+	clear_bit(NFS_O_RDWR_STATE, &state->flags);
+	spin_unlock(&state->owner->so_lock);
+}
+
 static void nfs4_close_done(struct rpc_task *task, void *data)
 {
 	struct nfs4_closedata *calldata = data;
@@ -1728,6 +1742,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 		case 0:
 			nfs_set_open_stateid(state, &calldata->res.stateid, 0);
 			renew_lease(server, calldata->timestamp);
+			nfs4_close_clear_stateid_flags(state,
+					calldata->arg.fmode);
 			break;
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_OLD_STATEID:
@@ -1737,11 +1753,10 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 				break;
 		default:
 			if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
-				nfs4_restart_rpc(task, server->nfs_client);
+				nfs_restart_rpc(task, server->nfs_client);
 				return;
 			}
 	}
-	nfs4_sequence_free_slot(server->nfs_client, &calldata->res.seq_res);
 	nfs_refresh_inode(calldata->inode, calldata->res.fattr);
 }
 
@@ -1749,38 +1764,39 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs4_closedata *calldata = data;
 	struct nfs4_state *state = calldata->state;
-	int clear_rd, clear_wr, clear_rdwr;
+	int call_close = 0;
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		return;
 
-	clear_rd = clear_wr = clear_rdwr = 0;
+	task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+	calldata->arg.fmode = FMODE_READ|FMODE_WRITE;
 	spin_lock(&state->owner->so_lock);
 	/* Calculate the change in open mode */
 	if (state->n_rdwr == 0) {
 		if (state->n_rdonly == 0) {
-			clear_rd |= test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags);
-			clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
+			calldata->arg.fmode &= ~FMODE_READ;
 		}
 		if (state->n_wronly == 0) {
-			clear_wr |= test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags);
-			clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
+			calldata->arg.fmode &= ~FMODE_WRITE;
 		}
 	}
 	spin_unlock(&state->owner->so_lock);
-	if (!clear_rd && !clear_wr && !clear_rdwr) {
+
+	if (!call_close) {
 		/* Note: exit _without_ calling nfs4_close_done */
 		task->tk_action = NULL;
 		return;
 	}
+
+	if (calldata->arg.fmode == 0)
+		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+
 	nfs_fattr_init(calldata->res.fattr);
-	if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
-		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-		calldata->arg.fmode = FMODE_READ;
-	} else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
-		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-		calldata->arg.fmode = FMODE_WRITE;
-	}
 	calldata->timestamp = jiffies;
 	if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client,
 				&calldata->arg.seq_args, &calldata->res.seq_res,
@@ -1981,7 +1997,7 @@ out_drop:
 	return 0;
 }
 
-void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
+static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 {
 	if (ctx->state == NULL)
 		return;
@@ -2532,7 +2548,6 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	nfs4_sequence_done(res->server, &res->seq_res, task->tk_status);
 	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
 		return 0;
-	nfs4_sequence_free_slot(res->server->nfs_client, &res->seq_res);
 	update_changeattr(dir, &res->cinfo);
 	nfs_post_op_update_inode(dir, &res->dir_attr);
 	return 1;
@@ -2971,11 +2986,10 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 
 	dprintk("--> %s\n", __func__);
 
-	/* nfs4_sequence_free_slot called in the read rpc_call_done */
 	nfs4_sequence_done(server, &data->res.seq_res, task->tk_status);
 
 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
-		nfs4_restart_rpc(task, server->nfs_client);
+		nfs_restart_rpc(task, server->nfs_client);
 		return -EAGAIN;
 	}
 
@@ -2995,12 +3009,11 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 	
-	/* slot is freed in nfs_writeback_done */
 	nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
 			   task->tk_status);
 
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
-		nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
 		return -EAGAIN;
 	}
 	if (task->tk_status >= 0) {
@@ -3028,11 +3041,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 	nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
 			   task->tk_status);
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
-		nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
 		return -EAGAIN;
 	}
-	nfs4_sequence_free_slot(NFS_SERVER(inode)->nfs_client,
-				&data->res.seq_res);
 	nfs_refresh_inode(inode, data->res.fattr);
 	return 0;
 }
@@ -3350,7 +3361,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_SEQ_MISORDERED:
 			dprintk("%s ERROR %d, Reset session\n", __func__,
 				task->tk_status);
-			set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+			nfs4_schedule_state_recovery(clp);
 			task->tk_status = 0;
 			return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
@@ -3483,12 +3494,23 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_delegreturndata *data = calldata;
 
-	nfs4_sequence_done_free_slot(data->res.server, &data->res.seq_res,
-				     task->tk_status);
+	nfs4_sequence_done(data->res.server, &data->res.seq_res,
+			task->tk_status);
 
-	data->rpc_status = task->tk_status;
-	if (data->rpc_status == 0)
+	switch (task->tk_status) {
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_EXPIRED:
+	case 0:
 		renew_lease(data->res.server, data->timestamp);
+		break;
+	default:
+		if (nfs4_async_handle_error(task, data->res.server, NULL) ==
+				-EAGAIN) {
+			nfs_restart_rpc(task, data->res.server->nfs_client);
+			return;
+		}
+	}
+	data->rpc_status = task->tk_status;
 }
 
 static void nfs4_delegreturn_release(void *calldata)
@@ -3741,11 +3763,9 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 			break;
 		default:
 			if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
-				nfs4_restart_rpc(task,
-						calldata->server->nfs_client);
+				nfs_restart_rpc(task,
+						 calldata->server->nfs_client);
 	}
-	nfs4_sequence_free_slot(calldata->server->nfs_client,
-				&calldata->res.seq_res);
 }
 
 static void nfs4_locku_prepare(struct rpc_task *task, void *data)
@@ -3927,8 +3947,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 
 	dprintk("%s: begin!\n", __func__);
 
-	nfs4_sequence_done_free_slot(data->server, &data->res.seq_res,
-				     task->tk_status);
+	nfs4_sequence_done(data->server, &data->res.seq_res,
+			task->tk_status);
 
 	data->rpc_status = task->tk_status;
 	if (RPC_ASSASSINATED(task))
@@ -4049,10 +4069,16 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
 		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
 			return 0;
 		err = _nfs4_do_setlk(state, F_SETLK, request, 0);
-		if (err != -NFS4ERR_DELAY)
-			break;
-		nfs4_handle_exception(server, err, &exception);
+		switch (err) {
+		default:
+			goto out;
+		case -NFS4ERR_GRACE:
+		case -NFS4ERR_DELAY:
+			nfs4_handle_exception(server, err, &exception);
+			err = 0;
+		}
 	} while (exception.retry);
+out:
 	return err;
 }
 
@@ -4172,6 +4198,11 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 			case -NFS4ERR_EXPIRED:
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_STALE_STATEID:
+			case -NFS4ERR_BADSESSION:
+			case -NFS4ERR_BADSLOT:
+			case -NFS4ERR_BAD_HIGH_SLOT:
+			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+			case -NFS4ERR_DEADSESSION:
 				nfs4_schedule_state_recovery(server->nfs_client);
 				goto out;
 			case -ERESTARTSYS:
@@ -4296,7 +4327,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
  * NFS4ERR_BADSESSION in the sequence operation, and will therefore
  * be in some phase of session reset.
  */
-static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	nfs4_verifier verifier;
 	struct nfs41_exchange_id_args args = {
@@ -4318,6 +4349,9 @@ static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	dprintk("--> %s\n", __func__);
 	BUG_ON(clp == NULL);
 
+	/* Remove server-only flags */
+	args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R;
+
 	p = (u32 *)verifier.data;
 	*p++ = htonl((u32)clp->cl_boot_time.tv_sec);
 	*p = htonl((u32)clp->cl_boot_time.tv_nsec);
@@ -4389,10 +4423,9 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
 		dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
 		rpc_delay(task, NFS4_POLL_RETRY_MIN);
 		task->tk_status = 0;
-		nfs4_restart_rpc(task, data->clp);
+		nfs_restart_rpc(task, data->clp);
 		return;
 	}
-	nfs41_sequence_free_slot(data->clp, &data->res->lr_seq_res);
 	dprintk("<-- %s\n", __func__);
 }
 
@@ -4465,7 +4498,6 @@ static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots,
 	spin_lock(&tbl->slot_tbl_lock);
 	for (i = 0; i < max_slots; ++i)
 		tbl->slots[i].seq_nr = ivalue;
-	tbl->highest_used_slotid = -1;
 	spin_unlock(&tbl->slot_tbl_lock);
 	dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
 		tbl, tbl->slots, tbl->max_slots);
@@ -4515,7 +4547,6 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session)
 static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
 		int max_slots, int ivalue)
 {
-	int i;
 	struct nfs4_slot *slot;
 	int ret = -ENOMEM;
 
@@ -4526,18 +4557,9 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
 	slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL);
 	if (!slot)
 		goto out;
-	for (i = 0; i < max_slots; ++i)
-		slot[i].seq_nr = ivalue;
 	ret = 0;
 
 	spin_lock(&tbl->slot_tbl_lock);
-	if (tbl->slots != NULL) {
-		spin_unlock(&tbl->slot_tbl_lock);
-		dprintk("%s: slot table already initialized. tbl=%p slots=%p\n",
-			__func__, tbl, tbl->slots);
-		WARN_ON(1);
-		goto out_free;
-	}
 	tbl->max_slots = max_slots;
 	tbl->slots = slot;
 	tbl->highest_used_slotid = -1;  /* no slot is currently used */
@@ -4547,10 +4569,6 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
 out:
 	dprintk("<-- %s: return %d\n", __func__, ret);
 	return ret;
-
-out_free:
-	kfree(slot);
-	goto out;
 }
 
 /*
@@ -4558,17 +4576,24 @@ out_free:
  */
 static int nfs4_init_slot_tables(struct nfs4_session *session)
 {
-	int status;
+	struct nfs4_slot_table *tbl;
+	int status = 0;
 
-	status = nfs4_init_slot_table(&session->fc_slot_table,
-			session->fc_attrs.max_reqs, 1);
-	if (status)
-		return status;
+	tbl = &session->fc_slot_table;
+	if (tbl->slots == NULL) {
+		status = nfs4_init_slot_table(tbl,
+				session->fc_attrs.max_reqs, 1);
+		if (status)
+			return status;
+	}
 
-	status = nfs4_init_slot_table(&session->bc_slot_table,
-			session->bc_attrs.max_reqs, 0);
-	if (status)
-		nfs4_destroy_slot_tables(session);
+	tbl = &session->bc_slot_table;
+	if (tbl->slots == NULL) {
+		status = nfs4_init_slot_table(tbl,
+				session->bc_attrs.max_reqs, 0);
+		if (status)
+			nfs4_destroy_slot_tables(session);
+	}
 
 	return status;
 }
@@ -4582,7 +4607,6 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	if (!session)
 		return NULL;
 
-	set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
 	/*
 	 * The create session reply races with the server back
 	 * channel probe. Mark the client NFS_CS_SESSION_INITING
@@ -4590,12 +4614,15 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	 * nfs_client struct
 	 */
 	clp->cl_cons_state = NFS_CS_SESSION_INITING;
+	init_completion(&session->complete);
 
 	tbl = &session->fc_slot_table;
+	tbl->highest_used_slotid = -1;
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
 
 	tbl = &session->bc_slot_table;
+	tbl->highest_used_slotid = -1;
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
 
@@ -4747,11 +4774,10 @@ static int _nfs4_proc_create_session(struct nfs_client *clp)
  * It is the responsibility of the caller to verify the session is
  * expired before calling this routine.
  */
-int nfs4_proc_create_session(struct nfs_client *clp, int reset)
+int nfs4_proc_create_session(struct nfs_client *clp)
 {
 	int status;
 	unsigned *ptr;
-	struct nfs_fsinfo fsinfo;
 	struct nfs4_session *session = clp->cl_session;
 
 	dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
@@ -4760,35 +4786,19 @@ int nfs4_proc_create_session(struct nfs_client *clp, int reset)
 	if (status)
 		goto out;
 
-	/* Init or reset the fore channel */
-	if (reset)
-		status = nfs4_reset_slot_tables(session);
-	else
-		status = nfs4_init_slot_tables(session);
-	dprintk("fore channel slot table initialization returned %d\n", status);
+	/* Init and reset the fore channel */
+	status = nfs4_init_slot_tables(session);
+	dprintk("slot table initialization returned %d\n", status);
+	if (status)
+		goto out;
+	status = nfs4_reset_slot_tables(session);
+	dprintk("slot table reset returned %d\n", status);
 	if (status)
 		goto out;
 
 	ptr = (unsigned *)&session->sess_id.data[0];
 	dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__,
 		clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
-
-	if (reset)
-		/* Lease time is aleady set */
-		goto out;
-
-	/* Get the lease time */
-	status = nfs4_proc_get_lease_time(clp, &fsinfo);
-	if (status == 0) {
-		/* Update lease time and schedule renewal */
-		spin_lock(&clp->cl_lock);
-		clp->cl_lease_time = fsinfo.lease_time * HZ;
-		clp->cl_last_renewal = jiffies;
-		clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-		spin_unlock(&clp->cl_lock);
-
-		nfs4_schedule_state_renewal(clp);
-	}
 out:
 	dprintk("<-- %s\n", __func__);
 	return status;
@@ -4827,13 +4837,16 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
 int nfs4_init_session(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_session *session;
 	int ret;
 
 	if (!nfs4_has_session(clp))
 		return 0;
 
-	clp->cl_session->fc_attrs.max_rqst_sz = server->wsize;
-	clp->cl_session->fc_attrs.max_resp_sz = server->rsize;
+	session = clp->cl_session;
+	session->fc_attrs.max_rqst_sz = server->wsize + nfs41_maxwrite_overhead;
+	session->fc_attrs.max_resp_sz = server->rsize + nfs41_maxread_overhead;
+
 	ret = nfs4_recover_expired_lease(server);
 	if (!ret)
 		ret = nfs4_check_client_ready(clp);
@@ -4872,11 +4885,10 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
 
 		if (_nfs4_async_handle_error(task, NULL, clp, NULL)
 								== -EAGAIN) {
-			nfs4_restart_rpc(task, clp);
+			nfs_restart_rpc(task, clp);
 			return;
 		}
 	}
-	nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp);
 	dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
 
 	kfree(task->tk_msg.rpc_argp);
@@ -4931,6 +4943,109 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
 			      &nfs41_sequence_ops, (void *)clp);
 }
 
+struct nfs4_reclaim_complete_data {
+	struct nfs_client *clp;
+	struct nfs41_reclaim_complete_args arg;
+	struct nfs41_reclaim_complete_res res;
+};
+
+static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs4_reclaim_complete_data *calldata = data;
+
+	if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args,
+				&calldata->res.seq_res, 0, task))
+		return;
+
+	rpc_call_start(task);
+}
+
+static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
+{
+	struct nfs4_reclaim_complete_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
+	struct nfs4_sequence_res *res = &calldata->res.seq_res;
+
+	dprintk("--> %s\n", __func__);
+	nfs41_sequence_done(clp, res, task->tk_status);
+	switch (task->tk_status) {
+	case 0:
+	case -NFS4ERR_COMPLETE_ALREADY:
+		break;
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_DEADSESSION:
+		/*
+		 * Handle the session error, but do not retry the operation, as
+		 * we have no way of telling whether the clientid had to be
+		 * reset before we got our reply.  If reset, a new wave of
+		 * reclaim operations will follow, containing their own reclaim
+		 * complete.  We don't want our retry to get on the way of
+		 * recovery by incorrectly indicating to the server that we're
+		 * done reclaiming state since the process had to be restarted.
+		 */
+		_nfs4_async_handle_error(task, NULL, clp, NULL);
+		break;
+	default:
+		if (_nfs4_async_handle_error(
+				task, NULL, clp, NULL) == -EAGAIN) {
+			rpc_restart_call_prepare(task);
+			return;
+		}
+	}
+
+	dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_free_reclaim_complete_data(void *data)
+{
+	struct nfs4_reclaim_complete_data *calldata = data;
+
+	kfree(calldata);
+}
+
+static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
+	.rpc_call_prepare = nfs4_reclaim_complete_prepare,
+	.rpc_call_done = nfs4_reclaim_complete_done,
+	.rpc_release = nfs4_free_reclaim_complete_data,
+};
+
+/*
+ * Issue a global reclaim complete.
+ */
+static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
+{
+	struct nfs4_reclaim_complete_data *calldata;
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_reclaim_complete_call_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
+	int status = -ENOMEM;
+
+	dprintk("--> %s\n", __func__);
+	calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
+	if (calldata == NULL)
+		goto out;
+	calldata->clp = clp;
+	calldata->arg.one_fs = 0;
+	calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+
+	msg.rpc_argp = &calldata->arg;
+	msg.rpc_resp = &calldata->res;
+	task_setup_data.callback_data = calldata;
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		status = PTR_ERR(task);
+	rpc_put_task(task);
+out:
+	dprintk("<-- %s status=%d\n", __func__, status);
+	return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -4948,8 +5063,9 @@ struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
 	.state_flag_bit	= NFS_STATE_RECLAIM_REBOOT,
 	.recover_open	= nfs4_open_reclaim,
 	.recover_lock	= nfs4_lock_reclaim,
-	.establish_clid = nfs4_proc_exchange_id,
+	.establish_clid = nfs41_init_clientid,
 	.get_clid_cred	= nfs4_get_exchange_id_cred,
+	.reclaim_complete = nfs41_proc_reclaim_complete,
 };
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -4968,7 +5084,7 @@ struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
 	.state_flag_bit	= NFS_STATE_RECLAIM_NOGRACE,
 	.recover_open	= nfs4_open_expired,
 	.recover_lock	= nfs4_lock_expired,
-	.establish_clid = nfs4_proc_exchange_id,
+	.establish_clid = nfs41_init_clientid,
 	.get_clid_cred	= nfs4_get_exchange_id_cred,
 };
 #endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2ef4fecf398..e76427e6346 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -116,6 +116,68 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
 
 #if defined(CONFIG_NFS_V4_1)
 
+static int nfs41_setup_state_renewal(struct nfs_client *clp)
+{
+	int status;
+	struct nfs_fsinfo fsinfo;
+
+	status = nfs4_proc_get_lease_time(clp, &fsinfo);
+	if (status == 0) {
+		/* Update lease time and schedule renewal */
+		spin_lock(&clp->cl_lock);
+		clp->cl_lease_time = fsinfo.lease_time * HZ;
+		clp->cl_last_renewal = jiffies;
+		spin_unlock(&clp->cl_lock);
+
+		nfs4_schedule_state_renewal(clp);
+	}
+
+	return status;
+}
+
+static void nfs41_end_drain_session(struct nfs_client *clp,
+		struct nfs4_session *ses)
+{
+	if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state))
+		rpc_wake_up(&ses->fc_slot_table.slot_tbl_waitq);
+}
+
+static int nfs41_begin_drain_session(struct nfs_client *clp,
+		struct nfs4_session *ses)
+{
+	struct nfs4_slot_table *tbl = &ses->fc_slot_table;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state);
+	if (tbl->highest_used_slotid != -1) {
+		INIT_COMPLETION(ses->complete);
+		spin_unlock(&tbl->slot_tbl_lock);
+		return wait_for_completion_interruptible(&ses->complete);
+	}
+	spin_unlock(&tbl->slot_tbl_lock);
+	return 0;
+}
+
+int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	int status;
+
+	status = nfs41_begin_drain_session(clp, clp->cl_session);
+	if (status != 0)
+		goto out;
+	status = nfs4_proc_exchange_id(clp, cred);
+	if (status != 0)
+		goto out;
+	status = nfs4_proc_create_session(clp);
+	if (status != 0)
+		goto out;
+	nfs41_end_drain_session(clp, clp->cl_session);
+	nfs41_setup_state_renewal(clp);
+	nfs_mark_client_ready(clp, NFS_CS_READY);
+out:
+	return status;
+}
+
 struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
@@ -877,6 +939,10 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 			case -NFS4ERR_EXPIRED:
 			case -NFS4ERR_NO_GRACE:
 			case -NFS4ERR_STALE_CLIENTID:
+			case -NFS4ERR_BADSESSION:
+			case -NFS4ERR_BADSLOT:
+			case -NFS4ERR_BAD_HIGH_SLOT:
+			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 				goto out;
 			default:
 				printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
@@ -959,6 +1025,10 @@ restart:
 			case -NFS4ERR_NO_GRACE:
 				nfs4_state_mark_reclaim_nograce(sp->so_client, state);
 			case -NFS4ERR_STALE_CLIENTID:
+			case -NFS4ERR_BADSESSION:
+			case -NFS4ERR_BADSLOT:
+			case -NFS4ERR_BAD_HIGH_SLOT:
+			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 				goto out_err;
 		}
 		nfs4_put_open_state(state);
@@ -1011,6 +1081,14 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
 	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
 }
 
+static void nfs4_reclaim_complete(struct nfs_client *clp,
+				 const struct nfs4_state_recovery_ops *ops)
+{
+	/* Notify the server we're done reclaiming our state */
+	if (ops->reclaim_complete)
+		(void)ops->reclaim_complete(clp);
+}
+
 static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
@@ -1020,6 +1098,9 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 	if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
 		return;
 
+	nfs4_reclaim_complete(clp,
+		nfs4_reboot_recovery_ops[clp->cl_minorversion]);
+
 	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
 		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
 		spin_lock(&sp->so_lock);
@@ -1046,25 +1127,25 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
 	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
 }
 
-static void nfs4_state_end_reclaim_nograce(struct nfs_client *clp)
-{
-	clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
-}
-
-static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
+static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 {
 	switch (error) {
 		case -NFS4ERR_CB_PATH_DOWN:
 			nfs_handle_cb_pathdown(clp);
-			break;
+			return 0;
+		case -NFS4ERR_NO_GRACE:
+			nfs4_state_end_reclaim_reboot(clp);
+			return 0;
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_LEASE_MOVED:
 			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+			nfs4_state_end_reclaim_reboot(clp);
 			nfs4_state_start_reclaim_reboot(clp);
 			break;
 		case -NFS4ERR_EXPIRED:
 			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 			nfs4_state_start_reclaim_nograce(clp);
+			break;
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
 		case -NFS4ERR_BAD_HIGH_SLOT:
@@ -1072,8 +1153,11 @@ static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 		case -NFS4ERR_SEQ_FALSE_RETRY:
 		case -NFS4ERR_SEQ_MISORDERED:
-			set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+			set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+			/* Zero session reset errors */
+			return 0;
 	}
+	return error;
 }
 
 static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
@@ -1093,8 +1177,7 @@ restart:
 		if (status < 0) {
 			set_bit(ops->owner_flag_bit, &sp->so_flags);
 			nfs4_put_state_owner(sp);
-			nfs4_recovery_handle_error(clp, status);
-			return status;
+			return nfs4_recovery_handle_error(clp, status);
 		}
 		nfs4_put_state_owner(sp);
 		goto restart;
@@ -1124,8 +1207,7 @@ static int nfs4_check_lease(struct nfs_client *clp)
 	status = ops->renew_lease(clp, cred);
 	put_rpccred(cred);
 out:
-	nfs4_recovery_handle_error(clp, status);
-	return status;
+	return nfs4_recovery_handle_error(clp, status);
 }
 
 static int nfs4_reclaim_lease(struct nfs_client *clp)
@@ -1151,55 +1233,65 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
 }
 
 #ifdef CONFIG_NFS_V4_1
-static void nfs4_session_recovery_handle_error(struct nfs_client *clp, int err)
+void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 {
-	switch (err) {
-	case -NFS4ERR_STALE_CLIENTID:
+	if (!flags)
+		return;
+	else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) {
 		set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-		set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
-	}
+		nfs4_state_start_reclaim_reboot(clp);
+		nfs4_schedule_state_recovery(clp);
+	} else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
+			    SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
+			    SEQ4_STATUS_ADMIN_STATE_REVOKED |
+			    SEQ4_STATUS_RECALLABLE_STATE_REVOKED |
+			    SEQ4_STATUS_LEASE_MOVED)) {
+		set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+		nfs4_state_start_reclaim_nograce(clp);
+		nfs4_schedule_state_recovery(clp);
+	} else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
+			    SEQ4_STATUS_BACKCHANNEL_FAULT |
+			    SEQ4_STATUS_CB_PATH_DOWN_SESSION))
+		nfs_expire_all_delegations(clp);
 }
 
 static int nfs4_reset_session(struct nfs_client *clp)
 {
+	struct nfs4_session *ses = clp->cl_session;
 	int status;
 
+	status = nfs41_begin_drain_session(clp, ses);
+	if (status != 0)
+		return status;
+
 	status = nfs4_proc_destroy_session(clp->cl_session);
 	if (status && status != -NFS4ERR_BADSESSION &&
 	    status != -NFS4ERR_DEADSESSION) {
-		nfs4_session_recovery_handle_error(clp, status);
+		status = nfs4_recovery_handle_error(clp, status);
 		goto out;
 	}
 
 	memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
-	status = nfs4_proc_create_session(clp, 1);
+	status = nfs4_proc_create_session(clp);
 	if (status)
-		nfs4_session_recovery_handle_error(clp, status);
-		/* fall through*/
-out:
-	/* Wake up the next rpc task even on error */
-	rpc_wake_up_next(&clp->cl_session->fc_slot_table.slot_tbl_waitq);
-	return status;
-}
-
-static int nfs4_initialize_session(struct nfs_client *clp)
-{
-	int status;
+		status = nfs4_recovery_handle_error(clp, status);
 
-	status = nfs4_proc_create_session(clp, 0);
-	if (!status) {
-		nfs_mark_client_ready(clp, NFS_CS_READY);
-	} else if (status == -NFS4ERR_STALE_CLIENTID) {
-		set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-		set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
-	} else {
-		nfs_mark_client_ready(clp, status);
+out:
+	/*
+	 * Let the state manager reestablish state
+	 * without waking other tasks yet.
+	 */
+	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
+		/* Wake up the next rpc task */
+		nfs41_end_drain_session(clp, ses);
+		if (status == 0)
+			nfs41_setup_state_renewal(clp);
 	}
 	return status;
 }
+
 #else /* CONFIG_NFS_V4_1 */
 static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
-static int nfs4_initialize_session(struct nfs_client *clp) { return 0; }
 #endif /* CONFIG_NFS_V4_1 */
 
 /* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
@@ -1234,7 +1326,8 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			status = nfs4_reclaim_lease(clp);
 			if (status) {
 				nfs4_set_lease_expired(clp, status);
-				if (status == -EAGAIN)
+				if (test_bit(NFS4CLNT_LEASE_EXPIRED,
+							&clp->cl_state))
 					continue;
 				if (clp->cl_cons_state ==
 							NFS_CS_SESSION_INITING)
@@ -1242,55 +1335,51 @@ static void nfs4_state_manager(struct nfs_client *clp)
 				goto out_error;
 			}
 			clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+			set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
 		}
 
 		if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
 			status = nfs4_check_lease(clp);
-			if (status != 0)
+			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
 				continue;
+			if (status < 0 && status != -NFS4ERR_CB_PATH_DOWN)
+				goto out_error;
 		}
+
 		/* Initialize or reset the session */
-		if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)
+		if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)
 		   && nfs4_has_session(clp)) {
-			if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
-				status = nfs4_initialize_session(clp);
-			else
-				status = nfs4_reset_session(clp);
-			if (status) {
-				if (status == -NFS4ERR_STALE_CLIENTID)
-					continue;
+			status = nfs4_reset_session(clp);
+			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+				continue;
+			if (status < 0)
 				goto out_error;
-			}
 		}
+
 		/* First recover reboot state... */
-		if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
+		if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
 			status = nfs4_do_reclaim(clp,
 				nfs4_reboot_recovery_ops[clp->cl_minorversion]);
-			if (status == -NFS4ERR_STALE_CLIENTID)
-				continue;
-			if (test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
+			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
+			    test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
 				continue;
 			nfs4_state_end_reclaim_reboot(clp);
-			continue;
+			if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
+				continue;
+			if (status < 0)
+				goto out_error;
 		}
 
 		/* Now recover expired state... */
 		if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
 			status = nfs4_do_reclaim(clp,
 				nfs4_nograce_recovery_ops[clp->cl_minorversion]);
-			if (status < 0) {
-				set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
-				if (status == -NFS4ERR_STALE_CLIENTID)
-					continue;
-				if (status == -NFS4ERR_EXPIRED)
-					continue;
-				if (test_bit(NFS4CLNT_SESSION_SETUP,
-								&clp->cl_state))
-					continue;
+			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
+			    test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
+			    test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+				continue;
+			if (status < 0)
 				goto out_error;
-			} else
-				nfs4_state_end_reclaim_nograce(clp);
-			continue;
 		}
 
 		if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
@@ -1309,8 +1398,6 @@ static void nfs4_state_manager(struct nfs_client *clp)
 out_error:
 	printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s"
 			" with error %d\n", clp->cl_hostname, -status);
-	if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
-		nfs4_state_end_reclaim_reboot(clp);
 	nfs4_clear_state_manager_bit(clp);
 }
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 20b4e30e6c8..e437fd6a819 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -46,11 +46,13 @@
 #include <linux/proc_fs.h>
 #include <linux/kdev_t.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/msg_prot.h>
 #include <linux/nfs.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_idmap.h>
 #include "nfs4_fs.h"
+#include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
 
@@ -134,7 +136,7 @@ static int nfs4_stat_to_errno(int);
 #define decode_lookup_maxsz	(op_decode_hdr_maxsz)
 #define encode_share_access_maxsz \
 				(2)
-#define encode_createmode_maxsz	(1 + encode_attrs_maxsz)
+#define encode_createmode_maxsz	(1 + encode_attrs_maxsz + encode_verifier_maxsz)
 #define encode_opentype_maxsz	(1 + encode_createmode_maxsz)
 #define encode_claim_null_maxsz	(1 + nfs4_name_maxsz)
 #define encode_open_maxsz	(op_encode_hdr_maxsz + \
@@ -299,6 +301,8 @@ static int nfs4_stat_to_errno(int);
 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4)
 #define decode_sequence_maxsz	(op_decode_hdr_maxsz + \
 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
+#define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
+#define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz	0
 #define decode_sequence_maxsz	0
@@ -676,6 +680,25 @@ static int nfs4_stat_to_errno(int);
 					 decode_sequence_maxsz + \
 					 decode_putrootfh_maxsz + \
 					 decode_fsinfo_maxsz)
+#define NFS4_enc_reclaim_complete_sz	(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_reclaim_complete_maxsz)
+#define NFS4_dec_reclaim_complete_sz	(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_reclaim_complete_maxsz)
+
+const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+				      compound_encode_hdr_maxsz +
+				      encode_sequence_maxsz +
+				      encode_putfh_maxsz +
+				      encode_getattr_maxsz) *
+				     XDR_UNIT);
+
+const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+				     compound_decode_hdr_maxsz +
+				     decode_sequence_maxsz +
+				     decode_putfh_maxsz) *
+				    XDR_UNIT);
 #endif /* CONFIG_NFS_V4_1 */
 
 static const umode_t nfs_type2fmt[] = {
@@ -1140,6 +1163,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
 {
 	__be32 *p;
+	struct nfs_client *clp;
 
 	p = reserve_space(xdr, 4);
 	switch(arg->open_flags & O_EXCL) {
@@ -1148,8 +1172,23 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 		encode_attrs(xdr, arg->u.attrs, arg->server);
 		break;
 	default:
-		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
-		encode_nfs4_verifier(xdr, &arg->u.verifier);
+		clp = arg->server->nfs_client;
+		if (clp->cl_minorversion > 0) {
+			if (nfs4_has_persistent_session(clp)) {
+				*p = cpu_to_be32(NFS4_CREATE_GUARDED);
+				encode_attrs(xdr, arg->u.attrs, arg->server);
+			} else {
+				struct iattr dummy;
+
+				*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
+				encode_nfs4_verifier(xdr, &arg->u.verifier);
+				dummy.ia_valid = 0;
+				encode_attrs(xdr, &dummy, arg->server);
+			}
+		} else {
+			*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
+			encode_nfs4_verifier(xdr, &arg->u.verifier);
+		}
 	}
 }
 
@@ -1592,6 +1631,19 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 	hdr->nops++;
 	hdr->replen += decode_destroy_session_maxsz;
 }
+
+static void encode_reclaim_complete(struct xdr_stream *xdr,
+				    struct nfs41_reclaim_complete_args *args,
+				    struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 8);
+	*p++ = cpu_to_be32(OP_RECLAIM_COMPLETE);
+	*p++ = cpu_to_be32(args->one_fs);
+	hdr->nops++;
+	hdr->replen += decode_reclaim_complete_maxsz;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 static void encode_sequence(struct xdr_stream *xdr,
@@ -2096,7 +2148,7 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
 	encode_compound_hdr(&xdr, req, &hdr);
 	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
-	replen = hdr.replen + nfs4_fattr_bitmap_maxsz + 1;
+	replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
 	encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
@@ -2420,6 +2472,26 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
 	encode_nops(&hdr);
 	return 0;
 }
+
+/*
+ * a RECLAIM_COMPLETE request
+ */
+static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
+				     struct nfs41_reclaim_complete_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args)
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
+	encode_reclaim_complete(&xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -4528,6 +4600,11 @@ static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
 {
 	return decode_op_hdr(xdr, OP_DESTROY_SESSION);
 }
+
+static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy)
+{
+	return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE);
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 static int decode_sequence(struct xdr_stream *xdr,
@@ -4583,8 +4660,8 @@ static int decode_sequence(struct xdr_stream *xdr,
 	dummy = be32_to_cpup(p++);
 	/* target highest slot id - currently not processed */
 	dummy = be32_to_cpup(p++);
-	/* result flags - currently not processed */
-	dummy = be32_to_cpup(p);
+	/* result flags */
+	res->sr_status_flags = be32_to_cpup(p);
 	status = 0;
 out_err:
 	res->sr_status = status;
@@ -5309,7 +5386,7 @@ out:
 }
 
 /*
- * FSINFO request
+ * Decode FSINFO response
  */
 static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
 			       struct nfs4_fsinfo_res *res)
@@ -5330,7 +5407,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
 }
 
 /*
- * PATHCONF request
+ * Decode PATHCONF response
  */
 static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
 				 struct nfs4_pathconf_res *res)
@@ -5351,7 +5428,7 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
 }
 
 /*
- * STATFS request
+ * Decode STATFS response
  */
 static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
 			       struct nfs4_statfs_res *res)
@@ -5372,7 +5449,7 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
 }
 
 /*
- * GETATTR_BITMAP request
+ * Decode GETATTR_BITMAP response
  */
 static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res)
 {
@@ -5411,7 +5488,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
 }
 
 /*
- * a SETCLIENTID request
+ * Decode SETCLIENTID response
  */
 static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
 		struct nfs_client *clp)
@@ -5428,7 +5505,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
 }
 
 /*
- * a SETCLIENTID_CONFIRM request
+ * Decode SETCLIENTID_CONFIRM response
  */
 static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
 {
@@ -5448,7 +5525,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
 }
 
 /*
- * DELEGRETURN request
+ * Decode DELEGRETURN response
  */
 static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res)
 {
@@ -5474,7 +5551,7 @@ out:
 }
 
 /*
- * FS_LOCATIONS request
+ * Decode FS_LOCATIONS response
  */
 static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
 				     struct nfs4_fs_locations_res *res)
@@ -5504,7 +5581,7 @@ out:
 
 #if defined(CONFIG_NFS_V4_1)
 /*
- * EXCHANGE_ID request
+ * Decode EXCHANGE_ID response
  */
 static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
 				    void *res)
@@ -5521,7 +5598,7 @@ static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
 }
 
 /*
- * a CREATE_SESSION request
+ * Decode CREATE_SESSION response
  */
 static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
 				       struct nfs41_create_session_res *res)
@@ -5538,7 +5615,7 @@ static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
 }
 
 /*
- * a DESTROY_SESSION request
+ * Decode DESTROY_SESSION response
  */
 static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
 					void *dummy)
@@ -5555,7 +5632,7 @@ static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
 }
 
 /*
- * a SEQUENCE request
+ * Decode SEQUENCE response
  */
 static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
 				 struct nfs4_sequence_res *res)
@@ -5572,7 +5649,7 @@ static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
 }
 
 /*
- * a GET_LEASE_TIME request
+ * Decode GET_LEASE_TIME response
  */
 static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
 				       struct nfs4_get_lease_time_res *res)
@@ -5591,6 +5668,25 @@ static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
 		status = decode_fsinfo(&xdr, res->lr_fsinfo);
 	return status;
 }
+
+/*
+ * Decode RECLAIM_COMPLETE response
+ */
+static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
+					 struct nfs41_reclaim_complete_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_reclaim_complete(&xdr, (void *)NULL);
+	return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
@@ -5767,6 +5863,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(DESTROY_SESSION,	enc_destroy_session,	dec_destroy_session),
   PROC(SEQUENCE,	enc_sequence,	dec_sequence),
   PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
+  PROC(RECLAIM_COMPLETE, enc_reclaim_complete,  dec_reclaim_complete),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 12c9e66d3f1..db9b360ae19 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -356,25 +356,19 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
 	struct nfs_readres *resp = &data->res;
 
 	if (resp->eof || resp->count == argp->count)
-		goto out;
+		return;
 
 	/* This is a short read! */
 	nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
 	/* Has the server at least made some progress? */
 	if (resp->count == 0)
-		goto out;
+		return;
 
 	/* Yes, so retry the read at the end of the data */
 	argp->offset += resp->count;
 	argp->pgbase += resp->count;
 	argp->count -= resp->count;
-	nfs4_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
-	return;
-out:
-	nfs4_sequence_free_slot(NFS_SERVER(data->inode)->nfs_client,
-				&data->res.seq_res);
-	return;
-
+	nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
 }
 
 /*
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 90be551b80c..ce907efc550 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -175,14 +175,16 @@ static const match_table_t nfs_mount_option_tokens = {
 };
 
 enum {
-	Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma,
+	Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma,
 
 	Opt_xprt_err
 };
 
 static const match_table_t nfs_xprt_protocol_tokens = {
 	{ Opt_xprt_udp, "udp" },
+	{ Opt_xprt_udp6, "udp6" },
 	{ Opt_xprt_tcp, "tcp" },
+	{ Opt_xprt_tcp6, "tcp6" },
 	{ Opt_xprt_rdma, "rdma" },
 
 	{ Opt_xprt_err, NULL }
@@ -492,6 +494,45 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
 	return sec_flavours[i].str;
 }
 
+static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss,
+				  int showdefaults)
+{
+	struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address;
+
+	seq_printf(m, ",mountproto=");
+	switch (sap->sa_family) {
+	case AF_INET:
+		switch (nfss->mountd_protocol) {
+		case IPPROTO_UDP:
+			seq_printf(m, RPCBIND_NETID_UDP);
+			break;
+		case IPPROTO_TCP:
+			seq_printf(m, RPCBIND_NETID_TCP);
+			break;
+		default:
+			if (showdefaults)
+				seq_printf(m, "auto");
+		}
+		break;
+	case AF_INET6:
+		switch (nfss->mountd_protocol) {
+		case IPPROTO_UDP:
+			seq_printf(m, RPCBIND_NETID_UDP6);
+			break;
+		case IPPROTO_TCP:
+			seq_printf(m, RPCBIND_NETID_TCP6);
+			break;
+		default:
+			if (showdefaults)
+				seq_printf(m, "auto");
+		}
+		break;
+	default:
+		if (showdefaults)
+			seq_printf(m, "auto");
+	}
+}
+
 static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
 				    int showdefaults)
 {
@@ -505,7 +546,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
 	}
 	case AF_INET6: {
 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-		seq_printf(m, ",mountaddr=%pI6", &sin6->sin6_addr);
+		seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr);
 		break;
 	}
 	default:
@@ -518,17 +559,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
 	if (nfss->mountd_port || showdefaults)
 		seq_printf(m, ",mountport=%u", nfss->mountd_port);
 
-	switch (nfss->mountd_protocol) {
-	case IPPROTO_UDP:
-		seq_printf(m, ",mountproto=udp");
-		break;
-	case IPPROTO_TCP:
-		seq_printf(m, ",mountproto=tcp");
-		break;
-	default:
-		if (showdefaults)
-			seq_printf(m, ",mountproto=auto");
-	}
+	nfs_show_mountd_netid(m, nfss, showdefaults);
 }
 
 /*
@@ -578,7 +609,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 			seq_puts(m, nfs_infop->nostr);
 	}
 	seq_printf(m, ",proto=%s",
-		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO));
+		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
 	if (version == 4) {
 		if (nfss->port != NFS_PORT)
 			seq_printf(m, ",port=%u", nfss->port);
@@ -714,8 +745,6 @@ static void nfs_umount_begin(struct super_block *sb)
 	struct nfs_server *server;
 	struct rpc_clnt *rpc;
 
-	lock_kernel();
-
 	server = NFS_SB(sb);
 	/* -EIO all pending I/O */
 	rpc = server->client_acl;
@@ -724,8 +753,6 @@ static void nfs_umount_begin(struct super_block *sb)
 	rpc = server->client;
 	if (!IS_ERR(rpc))
 		rpc_killall_tasks(rpc);
-
-	unlock_kernel();
 }
 
 static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version)
@@ -734,8 +761,6 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (data) {
-		data->rsize		= NFS_MAX_FILE_IO_SIZE;
-		data->wsize		= NFS_MAX_FILE_IO_SIZE;
 		data->acregmin		= NFS_DEF_ACREGMIN;
 		data->acregmax		= NFS_DEF_ACREGMAX;
 		data->acdirmin		= NFS_DEF_ACDIRMIN;
@@ -887,6 +912,8 @@ static int nfs_parse_mount_options(char *raw,
 {
 	char *p, *string, *secdata;
 	int rc, sloppy = 0, invalid_option = 0;
+	unsigned short protofamily = AF_UNSPEC;
+	unsigned short mountfamily = AF_UNSPEC;
 
 	if (!raw) {
 		dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -1232,12 +1259,17 @@ static int nfs_parse_mount_options(char *raw,
 			token = match_token(string,
 					    nfs_xprt_protocol_tokens, args);
 
+			protofamily = AF_INET;
 			switch (token) {
+			case Opt_xprt_udp6:
+				protofamily = AF_INET6;
 			case Opt_xprt_udp:
 				mnt->flags &= ~NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
 				kfree(string);
 				break;
+			case Opt_xprt_tcp6:
+				protofamily = AF_INET6;
 			case Opt_xprt_tcp:
 				mnt->flags |= NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
@@ -1265,10 +1297,15 @@ static int nfs_parse_mount_options(char *raw,
 					    nfs_xprt_protocol_tokens, args);
 			kfree(string);
 
+			mountfamily = AF_INET;
 			switch (token) {
+			case Opt_xprt_udp6:
+				mountfamily = AF_INET6;
 			case Opt_xprt_udp:
 				mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
 				break;
+			case Opt_xprt_tcp6:
+				mountfamily = AF_INET6;
 			case Opt_xprt_tcp:
 				mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
 				break;
@@ -1367,8 +1404,33 @@ static int nfs_parse_mount_options(char *raw,
 	if (!sloppy && invalid_option)
 		return 0;
 
+	/*
+	 * verify that any proto=/mountproto= options match the address
+	 * familiies in the addr=/mountaddr= options.
+	 */
+	if (protofamily != AF_UNSPEC &&
+	    protofamily != mnt->nfs_server.address.ss_family)
+		goto out_proto_mismatch;
+
+	if (mountfamily != AF_UNSPEC) {
+		if (mnt->mount_server.addrlen) {
+			if (mountfamily != mnt->mount_server.address.ss_family)
+				goto out_mountproto_mismatch;
+		} else {
+			if (mountfamily != mnt->nfs_server.address.ss_family)
+				goto out_mountproto_mismatch;
+		}
+	}
+
 	return 1;
 
+out_mountproto_mismatch:
+	printk(KERN_INFO "NFS: mount server address does not match mountproto= "
+			 "option\n");
+	return 0;
+out_proto_mismatch:
+	printk(KERN_INFO "NFS: server address does not match proto= option\n");
+	return 0;
 out_invalid_address:
 	printk(KERN_INFO "NFS: bad IP address specified: %s\n", p);
 	return 0;
@@ -1881,7 +1943,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	if (data == NULL)
 		return -ENOMEM;
 
-	lock_kernel();
 	/* fill out struct with values from existing mount */
 	data->flags = nfss->flags;
 	data->rsize = nfss->rsize;
@@ -1907,7 +1968,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	error = nfs_compare_remount_data(nfss, data);
 out:
 	kfree(data);
-	unlock_kernel();
 	return error;
 }
 
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index b62481dabae..70e1fbbaaea 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -22,63 +22,55 @@ static struct ctl_table_header *nfs_callback_sysctl_table;
 static ctl_table nfs_cb_sysctls[] = {
 #ifdef CONFIG_NFS_V4
 	{
-		.ctl_name = CTL_UNNUMBERED,
 		.procname = "nfs_callback_tcpport",
 		.data = &nfs_callback_set_tcpport,
 		.maxlen = sizeof(int),
 		.mode = 0644,
-		.proc_handler = &proc_dointvec_minmax,
+		.proc_handler = proc_dointvec_minmax,
 		.extra1 = (int *)&nfs_set_port_min,
 		.extra2 = (int *)&nfs_set_port_max,
 	},
 	{
-		.ctl_name = CTL_UNNUMBERED,
 		.procname = "idmap_cache_timeout",
 		.data = &nfs_idmap_cache_timeout,
 		.maxlen = sizeof(int),
 		.mode = 0644,
-		.proc_handler = &proc_dointvec_jiffies,
-		.strategy = &sysctl_jiffies,
+		.proc_handler = proc_dointvec_jiffies,
 	},
 #endif
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nfs_mountpoint_timeout",
 		.data		= &nfs_mountpoint_expiry_timeout,
 		.maxlen		= sizeof(nfs_mountpoint_expiry_timeout),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nfs_congestion_kb",
 		.data		= &nfs_congestion_kb,
 		.maxlen		= sizeof(nfs_congestion_kb),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static ctl_table nfs_cb_sysctl_dir[] = {
 	{
-		.ctl_name = CTL_UNNUMBERED,
 		.procname = "nfs",
 		.mode = 0555,
 		.child = nfs_cb_sysctls,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static ctl_table nfs_cb_sysctl_root[] = {
 	{
-		.ctl_name = CTL_FS,
 		.procname = "fs",
 		.mode = 0555,
 		.child = nfs_cb_sysctl_dir,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 int nfs_register_sysctl(void)
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 1064c91ae81..6da3d3ff6ed 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -83,7 +83,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 	struct inode *dir = data->dir;
 
 	if (!NFS_PROTO(dir)->unlink_done(task, dir))
-		nfs4_restart_rpc(task, NFS_SERVER(dir)->nfs_client);
+		nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client);
 }
 
 /**
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53eb26c16b5..d171696017f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -178,7 +178,7 @@ static int wb_priority(struct writeback_control *wbc)
 {
 	if (wbc->for_reclaim)
 		return FLUSH_HIGHPRI | FLUSH_STABLE;
-	if (wbc->for_kupdate)
+	if (wbc->for_kupdate || wbc->for_background)
 		return FLUSH_LOWPRI;
 	return 0;
 }
@@ -774,7 +774,7 @@ int nfs_updatepage(struct file *file, struct page *page,
 	 */
 	if (nfs_write_pageuptodate(page, inode) &&
 			inode->i_flock == NULL &&
-			!(file->f_flags & O_SYNC)) {
+			!(file->f_flags & O_DSYNC)) {
 		count = max(count + offset, nfs_page_length(page));
 		offset = 0;
 	}
@@ -1216,7 +1216,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 				 */
 				argp->stable = NFS_FILE_SYNC;
 			}
-			nfs4_restart_rpc(task, server->nfs_client);
+			nfs_restart_rpc(task, server->nfs_client);
 			return -EAGAIN;
 		}
 		if (time_before(complain, jiffies)) {
@@ -1228,7 +1228,6 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 		/* Can't do anything about it except throw an error. */
 		task->tk_status = -EIO;
 	}
-	nfs4_sequence_free_slot(server->nfs_client, &data->res.seq_res);
 	return 0;
 }
 
@@ -1612,15 +1611,16 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 	if (ret)
 		goto out_unlock;
 	page_cache_get(newpage);
+	spin_lock(&mapping->host->i_lock);
 	req->wb_page = newpage;
 	SetPagePrivate(newpage);
-	set_page_private(newpage, page_private(page));
+	set_page_private(newpage, (unsigned long)req);
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
+	spin_unlock(&mapping->host->i_lock);
 	page_cache_release(page);
 out_unlock:
 	nfs_clear_page_tag_locked(req);
-	nfs_release_request(req);
 out:
 	return ret;
 }
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 251da07b2a1..1225af7b216 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -2,6 +2,7 @@ config NILFS2_FS
 	tristate "NILFS2 file system support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
 	select CRC32
+	select FS_JOURNAL_INFO
 	help
 	  NILFS2 is a log-structured file system (LFS) supporting continuous
 	  snapshotting.  In addition to versioning capability of the entire
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index d69e6ae5925..3f959f1879d 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -142,29 +142,75 @@ static void nilfs_palloc_desc_block_init(struct inode *inode,
 	}
 }
 
+static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
+				  int create,
+				  void (*init_block)(struct inode *,
+						     struct buffer_head *,
+						     void *),
+				  struct buffer_head **bhp,
+				  struct nilfs_bh_assoc *prev,
+				  spinlock_t *lock)
+{
+	int ret;
+
+	spin_lock(lock);
+	if (prev->bh && blkoff == prev->blkoff) {
+		get_bh(prev->bh);
+		*bhp = prev->bh;
+		spin_unlock(lock);
+		return 0;
+	}
+	spin_unlock(lock);
+
+	ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp);
+	if (!ret) {
+		spin_lock(lock);
+		/*
+		 * The following code must be safe for change of the
+		 * cache contents during the get block call.
+		 */
+		brelse(prev->bh);
+		get_bh(*bhp);
+		prev->bh = *bhp;
+		prev->blkoff = blkoff;
+		spin_unlock(lock);
+	}
+	return ret;
+}
+
 static int nilfs_palloc_get_desc_block(struct inode *inode,
 				       unsigned long group,
 				       int create, struct buffer_head **bhp)
 {
-	return nilfs_mdt_get_block(inode,
-				   nilfs_palloc_desc_blkoff(inode, group),
-				   create, nilfs_palloc_desc_block_init, bhp);
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_desc_blkoff(inode, group),
+				      create, nilfs_palloc_desc_block_init,
+				      bhp, &cache->prev_desc, &cache->lock);
 }
 
 static int nilfs_palloc_get_bitmap_block(struct inode *inode,
 					 unsigned long group,
 					 int create, struct buffer_head **bhp)
 {
-	return nilfs_mdt_get_block(inode,
-				   nilfs_palloc_bitmap_blkoff(inode, group),
-				   create, NULL, bhp);
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_bitmap_blkoff(inode, group),
+				      create, NULL, bhp,
+				      &cache->prev_bitmap, &cache->lock);
 }
 
 int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
 				 int create, struct buffer_head **bhp)
 {
-	return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr),
-				   create, NULL, bhp);
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_entry_blkoff(inode, nr),
+				      create, NULL, bhp,
+				      &cache->prev_entry, &cache->lock);
 }
 
 static struct nilfs_palloc_group_desc *
@@ -176,13 +222,6 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
 		group % nilfs_palloc_groups_per_desc_block(inode);
 }
 
-static unsigned char *
-nilfs_palloc_block_get_bitmap(const struct inode *inode,
-			      const struct buffer_head *bh, void *kaddr)
-{
-	return (unsigned char *)(kaddr + bh_offset(bh));
-}
-
 void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
 				   const struct buffer_head *bh, void *kaddr)
 {
@@ -289,8 +328,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 				if (ret < 0)
 					goto out_desc;
 				bitmap_kaddr = kmap(bitmap_bh->b_page);
-				bitmap = nilfs_palloc_block_get_bitmap(
-					inode, bitmap_bh, bitmap_kaddr);
+				bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 				pos = nilfs_palloc_find_available_slot(
 					inode, group, group_offset, bitmap,
 					entries_per_group);
@@ -351,8 +389,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 	desc = nilfs_palloc_block_get_group_desc(inode, group,
 						 req->pr_desc_bh, desc_kaddr);
 	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
-	bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
-					       bitmap_kaddr);
+	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 
 	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap))
@@ -385,8 +422,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	desc = nilfs_palloc_block_get_group_desc(inode, group,
 						 req->pr_desc_bh, desc_kaddr);
 	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
-	bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
-					       bitmap_kaddr);
+	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap))
 		printk(KERN_WARNING "%s: entry numer %llu already freed\n",
@@ -472,8 +508,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 		desc = nilfs_palloc_block_get_group_desc(
 			inode, group, desc_bh, desc_kaddr);
 		bitmap_kaddr = kmap(bitmap_bh->b_page);
-		bitmap = nilfs_palloc_block_get_bitmap(
-			inode, bitmap_bh, bitmap_kaddr);
+		bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 		for (j = i, n = 0;
 		     (j < nitems) && nilfs_palloc_group_is_in(inode, group,
 							      entry_nrs[j]);
@@ -502,3 +537,30 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 	}
 	return 0;
 }
+
+void nilfs_palloc_setup_cache(struct inode *inode,
+			      struct nilfs_palloc_cache *cache)
+{
+	NILFS_MDT(inode)->mi_palloc_cache = cache;
+	spin_lock_init(&cache->lock);
+}
+
+void nilfs_palloc_clear_cache(struct inode *inode)
+{
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	spin_lock(&cache->lock);
+	brelse(cache->prev_desc.bh);
+	brelse(cache->prev_bitmap.bh);
+	brelse(cache->prev_entry.bh);
+	cache->prev_desc.bh = NULL;
+	cache->prev_bitmap.bh = NULL;
+	cache->prev_entry.bh = NULL;
+	spin_unlock(&cache->lock);
+}
+
+void nilfs_palloc_destroy_cache(struct inode *inode)
+{
+	nilfs_palloc_clear_cache(inode);
+	NILFS_MDT(inode)->mi_palloc_cache = NULL;
+}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 4ace5475c2c..f4543ac4f56 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -69,4 +69,25 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
 #define nilfs_clear_bit_atomic		ext2_clear_bit_atomic
 #define nilfs_find_next_zero_bit	ext2_find_next_zero_bit
 
+/*
+ * persistent object allocator cache
+ */
+
+struct nilfs_bh_assoc {
+	unsigned long blkoff;
+	struct buffer_head *bh;
+};
+
+struct nilfs_palloc_cache {
+	spinlock_t lock;
+	struct nilfs_bh_assoc prev_desc;
+	struct nilfs_bh_assoc prev_bitmap;
+	struct nilfs_bh_assoc prev_entry;
+};
+
+void nilfs_palloc_setup_cache(struct inode *inode,
+			      struct nilfs_palloc_cache *cache);
+void nilfs_palloc_clear_cache(struct inode *inode);
+void nilfs_palloc_destroy_cache(struct inode *inode);
+
 #endif	/* _NILFS_ALLOC_H */
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 08834df6ec6..f4a14ea2ed9 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -402,19 +402,11 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
 {
 	inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
-	if (NILFS_MDT(bmap->b_inode))
-		nilfs_mdt_mark_dirty(bmap->b_inode);
-	else
-		mark_inode_dirty(bmap->b_inode);
 }
 
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
 {
 	inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
-	if (NILFS_MDT(bmap->b_inode))
-		nilfs_mdt_mark_dirty(bmap->b_inode);
-	else
-		mark_inode_dirty(bmap->b_inode);
 }
 
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 84c25382f8e..471e269536a 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -68,9 +68,34 @@ void nilfs_btnode_cache_clear(struct address_space *btnc)
 	truncate_inode_pages(btnc, 0);
 }
 
+struct buffer_head *
+nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
+{
+	struct inode *inode = NILFS_BTNC_I(btnc);
+	struct buffer_head *bh;
+
+	bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+	if (unlikely(!bh))
+		return NULL;
+
+	if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
+		     buffer_dirty(bh))) {
+		brelse(bh);
+		BUG();
+	}
+	memset(bh->b_data, 0, 1 << inode->i_blkbits);
+	bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+	bh->b_blocknr = blocknr;
+	set_buffer_mapped(bh);
+	set_buffer_uptodate(bh);
+
+	unlock_page(bh->b_page);
+	page_cache_release(bh->b_page);
+	return bh;
+}
+
 int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
-			      sector_t pblocknr, struct buffer_head **pbh,
-			      int newblk)
+			      sector_t pblocknr, struct buffer_head **pbh)
 {
 	struct buffer_head *bh;
 	struct inode *inode = NILFS_BTNC_I(btnc);
@@ -81,19 +106,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 		return -ENOMEM;
 
 	err = -EEXIST; /* internal code */
-	if (newblk) {
-		if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
-			     buffer_dirty(bh))) {
-			brelse(bh);
-			BUG();
-		}
-		memset(bh->b_data, 0, 1 << inode->i_blkbits);
-		bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
-		bh->b_blocknr = blocknr;
-		set_buffer_mapped(bh);
-		set_buffer_uptodate(bh);
-		goto found;
-	}
 
 	if (buffer_uptodate(bh) || buffer_dirty(bh))
 		goto found;
@@ -135,27 +147,6 @@ out_locked:
 	return err;
 }
 
-int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
-		     sector_t pblocknr, struct buffer_head **pbh, int newblk)
-{
-	struct buffer_head *bh;
-	int err;
-
-	err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
-	if (err == -EEXIST) /* internal code (cache hit) */
-		return 0;
-	if (unlikely(err))
-		return err;
-
-	bh = *pbh;
-	wait_on_buffer(bh);
-	if (!buffer_uptodate(bh)) {
-		brelse(bh);
-		return -EIO;
-	}
-	return 0;
-}
-
 /**
  * nilfs_btnode_delete - delete B-tree node buffer
  * @bh: buffer to be deleted
@@ -244,12 +235,13 @@ retry:
 		unlock_page(obh->b_page);
 	}
 
-	err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
-	if (likely(!err)) {
-		BUG_ON(nbh == obh);
-		ctxt->newbh = nbh;
-	}
-	return err;
+	nbh = nilfs_btnode_create_block(btnc, newkey);
+	if (!nbh)
+		return -ENOMEM;
+
+	BUG_ON(nbh == obh);
+	ctxt->newbh = nbh;
+	return 0;
 
  failed_unlock:
 	unlock_page(obh->b_page);
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 3e2275172ed..07da83f0771 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -40,10 +40,10 @@ struct nilfs_btnode_chkey_ctxt {
 void nilfs_btnode_cache_init_once(struct address_space *);
 void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
+struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
+					      __u64 blocknr);
 int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
-			      struct buffer_head **, int);
-int nilfs_btnode_get(struct address_space *, __u64, sector_t,
-		     struct buffer_head **, int);
+			      struct buffer_head **);
 void nilfs_btnode_delete(struct buffer_head *);
 int nilfs_btnode_prepare_change_key(struct address_space *,
 				    struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index e25b507a474..7cdd98b8d51 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -114,7 +114,18 @@ static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr,
 {
 	struct address_space *btnc =
 		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
-	return nilfs_btnode_get(btnc, ptr, 0, bhp, 0);
+	int err;
+
+	err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp);
+	if (err)
+		return err == -EEXIST ? 0 : err;
+
+	wait_on_buffer(*bhp);
+	if (!buffer_uptodate(*bhp)) {
+		brelse(*bhp);
+		return -EIO;
+	}
+	return 0;
 }
 
 static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
@@ -122,12 +133,15 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
 {
 	struct address_space *btnc =
 		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
-	int ret;
+	struct buffer_head *bh;
 
-	ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1);
-	if (!ret)
-		set_buffer_nilfs_volatile(*bhp);
-	return ret;
+	bh = nilfs_btnode_create_block(btnc, ptr);
+	if (!bh)
+		return -ENOMEM;
+
+	set_buffer_nilfs_volatile(bh);
+	*bhp = bh;
+	return 0;
 }
 
 static inline int
@@ -444,6 +458,18 @@ nilfs_btree_get_node(const struct nilfs_btree *btree,
 		nilfs_btree_get_nonroot_node(path, level);
 }
 
+static inline int
+nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
+{
+	if (unlikely(nilfs_btree_node_get_level(node) != level)) {
+		dump_stack();
+		printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n",
+		       nilfs_btree_node_get_level(node), level);
+		return 1;
+	}
+	return 0;
+}
+
 static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 				 struct nilfs_btree_path *path,
 				 __u64 key, __u64 *ptrp, int minlevel)
@@ -467,7 +493,8 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(node));
+		if (nilfs_btree_bad_node(node, level))
+			return -EINVAL;
 		if (!found)
 			found = nilfs_btree_node_lookup(node, key, &index);
 		else
@@ -512,7 +539,8 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(node));
+		if (nilfs_btree_bad_node(node, level))
+			return -EINVAL;
 		index = nilfs_btree_node_get_nchildren(node) - 1;
 		ptr = nilfs_btree_node_get_ptr(btree, node, index);
 		path[level].bp_index = index;
@@ -638,13 +666,11 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 {
 	if (level < nilfs_btree_height(btree) - 1) {
 		do {
-			lock_buffer(path[level].bp_bh);
 			nilfs_btree_node_set_key(
 				nilfs_btree_get_nonroot_node(path, level),
 				path[level].bp_index, key);
 			if (!buffer_dirty(path[level].bp_bh))
 				nilfs_btnode_mark_dirty(path[level].bp_bh);
-			unlock_buffer(path[level].bp_bh);
 		} while ((path[level].bp_index == 0) &&
 			 (++level < nilfs_btree_height(btree) - 1));
 	}
@@ -663,13 +689,11 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node;
 
 	if (level < nilfs_btree_height(btree) - 1) {
-		lock_buffer(path[level].bp_bh);
 		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
 			nilfs_btnode_mark_dirty(path[level].bp_bh);
-		unlock_buffer(path[level].bp_bh);
 
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
@@ -689,9 +713,6 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node, *left;
 	int nchildren, lnchildren, n, move;
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	left = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -712,9 +733,6 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(node, 0));
 
@@ -740,9 +758,6 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node, *right;
 	int nchildren, rnchildren, n, move;
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -763,9 +778,6 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(right, 0));
@@ -794,9 +806,6 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	__u64 newptr;
 	int nchildren, n, move;
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -815,9 +824,6 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	newkey = nilfs_btree_node_get_key(right, 0);
 	newptr = path[level].bp_newreq.bpr_ptr;
 
@@ -852,8 +858,6 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
 	struct nilfs_btree_node *root, *child;
 	int n;
 
-	lock_buffer(path[level].bp_sib_bh);
-
 	root = nilfs_btree_get_root(btree);
 	child = nilfs_btree_get_sib_node(path, level);
 
@@ -865,8 +869,6 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_sib_bh);
-
 	path[level].bp_bh = path[level].bp_sib_bh;
 	path[level].bp_sib_bh = NULL;
 
@@ -1023,11 +1025,9 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 		stats->bs_nblocks++;
 
-		lock_buffer(bh);
 		nilfs_btree_node_init(btree,
 				      (struct nilfs_btree_node *)bh->b_data,
 				      0, level, 0, NULL, NULL);
-		unlock_buffer(bh);
 		path[level].bp_sib_bh = bh;
 		path[level].bp_op = nilfs_btree_split;
 	}
@@ -1052,10 +1052,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	if (ret < 0)
 		goto err_out_curr_node;
 
-	lock_buffer(bh);
 	nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
 			      0, level, 0, NULL, NULL);
-	unlock_buffer(bh);
 	path[level].bp_sib_bh = bh;
 	path[level].bp_op = nilfs_btree_grow;
 
@@ -1154,13 +1152,11 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node;
 
 	if (level < nilfs_btree_height(btree) - 1) {
-		lock_buffer(path[level].bp_bh);
 		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_delete(btree, node, keyp, ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
 			nilfs_btnode_mark_dirty(path[level].bp_bh);
-		unlock_buffer(path[level].bp_bh);
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(node, 0));
@@ -1180,9 +1176,6 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	left = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -1197,9 +1190,6 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(node, 0));
 
@@ -1217,9 +1207,6 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -1234,9 +1221,6 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(right, 0));
@@ -1255,9 +1239,6 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	left = nilfs_btree_get_sib_node(path, level);
 
@@ -1268,9 +1249,6 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = path[level].bp_sib_bh;
 	path[level].bp_sib_bh = NULL;
@@ -1286,9 +1264,6 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 
@@ -1299,9 +1274,6 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	nilfs_btnode_delete(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
 	path[level + 1].bp_index++;
@@ -1316,7 +1288,6 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
-	lock_buffer(path[level].bp_bh);
 	root = nilfs_btree_get_root(btree);
 	child = nilfs_btree_get_nonroot_node(path, level);
 
@@ -1324,7 +1295,6 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
 	nilfs_btree_node_set_level(root, level);
 	n = nilfs_btree_node_get_nchildren(child);
 	nilfs_btree_node_move_left(btree, root, child, n);
-	unlock_buffer(path[level].bp_bh);
 
 	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = NULL;
@@ -1699,7 +1669,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 		nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
 
 		/* create child node at level 1 */
-		lock_buffer(bh);
 		node = (struct nilfs_btree_node *)bh->b_data;
 		nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
 		nilfs_btree_node_insert(btree, node,
@@ -1709,7 +1678,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 		if (!nilfs_bmap_dirty(bmap))
 			nilfs_bmap_set_dirty(bmap);
 
-		unlock_buffer(bh);
 		brelse(bh);
 
 		/* create root node at level 2 */
@@ -2050,7 +2018,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < NILFS_BTREE_LEVEL_MAX;
 	     level++)
-		list_splice(&lists[level], listp->prev);
+		list_splice_tail(&lists[level], listp);
 }
 
 static int nilfs_btree_assign_p(struct nilfs_btree *btree,
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 0e72bbbc6b6..4b82d84ade7 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -34,28 +34,6 @@ struct nilfs_btree;
 struct nilfs_btree_path;
 
 /**
- * struct nilfs_btree_node - B-tree node
- * @bn_flags: flags
- * @bn_level: level
- * @bn_nchildren: number of children
- * @bn_pad: padding
- */
-struct nilfs_btree_node {
-	__u8 bn_flags;
-	__u8 bn_level;
-	__le16 bn_nchildren;
-	__le32 bn_pad;
-};
-
-/* flags */
-#define NILFS_BTREE_NODE_ROOT	0x01
-
-/* level */
-#define NILFS_BTREE_LEVEL_DATA		0
-#define NILFS_BTREE_LEVEL_NODE_MIN	(NILFS_BTREE_LEVEL_DATA + 1)
-#define NILFS_BTREE_LEVEL_MAX		14
-
-/**
  * struct nilfs_btree - B-tree structure
  * @bt_bmap: bmap base structure
  */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 3f5d5d06f53..d5ad54e204a 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -926,3 +926,29 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
 	up_read(&NILFS_MDT(cpfile)->mi_sem);
 	return ret;
 }
+
+/**
+ * nilfs_cpfile_read - read cpfile inode
+ * @cpfile: cpfile inode
+ * @raw_inode: on-disk cpfile inode
+ */
+int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
+{
+	return nilfs_read_inode_common(cpfile, raw_inode);
+}
+
+/**
+ * nilfs_cpfile_new - create cpfile
+ * @nilfs: nilfs object
+ * @cpsize: size of a checkpoint entry
+ */
+struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize)
+{
+	struct inode *cpfile;
+
+	cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0);
+	if (cpfile)
+		nilfs_mdt_set_entry_size(cpfile, cpsize,
+					 sizeof(struct nilfs_cpfile_header));
+	return cpfile;
+}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index debea896e70..bc0809e0ab4 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,4 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
 				size_t);
 
+int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode);
+struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize);
+
 #endif	/* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 1ff8e15bd36..187dd07ba86 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -33,6 +33,16 @@
 #define NILFS_CNO_MIN	((__u64)1)
 #define NILFS_CNO_MAX	(~(__u64)0)
 
+struct nilfs_dat_info {
+	struct nilfs_mdt_info mi;
+	struct nilfs_palloc_cache palloc_cache;
+};
+
+static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
+{
+	return (struct nilfs_dat_info *)NILFS_MDT(dat);
+}
+
 static int nilfs_dat_prepare_entry(struct inode *dat,
 				   struct nilfs_palloc_req *req, int create)
 {
@@ -425,3 +435,40 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
 
 	return nvi;
 }
+
+/**
+ * nilfs_dat_read - read dat inode
+ * @dat: dat inode
+ * @raw_inode: on-disk dat inode
+ */
+int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
+{
+	return nilfs_read_inode_common(dat, raw_inode);
+}
+
+/**
+ * nilfs_dat_new - create dat file
+ * @nilfs: nilfs object
+ * @entry_size: size of a dat entry
+ */
+struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size)
+{
+	static struct lock_class_key dat_lock_key;
+	struct inode *dat;
+	struct nilfs_dat_info *di;
+	int err;
+
+	dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di));
+	if (dat) {
+		err = nilfs_palloc_init_blockgroup(dat, entry_size);
+		if (unlikely(err)) {
+			nilfs_mdt_destroy(dat);
+			return NULL;
+		}
+
+		di = NILFS_DAT_I(dat);
+		lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
+		nilfs_palloc_setup_cache(dat, &di->palloc_cache);
+	}
+	return dat;
+}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index 406070d3ff4..d31c3aab0ef 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,4 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
 ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
 
+int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode);
+struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size);
+
 #endif	/* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index e097099bfc8..76d803e060a 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -99,9 +99,9 @@ static int nilfs_prepare_chunk(struct page *page,
 				 NULL, nilfs_get_block);
 }
 
-static int nilfs_commit_chunk(struct page *page,
-			      struct address_space *mapping,
-			      unsigned from, unsigned to)
+static void nilfs_commit_chunk(struct page *page,
+			       struct address_space *mapping,
+			       unsigned from, unsigned to)
 {
 	struct inode *dir = mapping->host;
 	struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
@@ -112,15 +112,13 @@ static int nilfs_commit_chunk(struct page *page,
 
 	nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
 	copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
-	if (pos + copied > dir->i_size) {
+	if (pos + copied > dir->i_size)
 		i_size_write(dir, pos + copied);
-		mark_inode_dirty(dir);
-	}
 	if (IS_DIRSYNC(dir))
 		nilfs_set_transaction_flag(NILFS_TI_SYNC);
 	err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
+	WARN_ON(err); /* do not happen */
 	unlock_page(page);
-	return err;
 }
 
 static void nilfs_check_page(struct page *page)
@@ -455,11 +453,10 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 	BUG_ON(err);
 	de->inode = cpu_to_le64(inode->i_ino);
 	nilfs_set_de_type(de, inode);
-	err = nilfs_commit_chunk(page, mapping, from, to);
+	nilfs_commit_chunk(page, mapping, from, to);
 	nilfs_put_page(page);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 /*	NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
-	mark_inode_dirty(dir);
 }
 
 /*
@@ -548,10 +545,10 @@ got_it:
 	memcpy(de->name, name, namelen);
 	de->inode = cpu_to_le64(inode->i_ino);
 	nilfs_set_de_type(de, inode);
-	err = nilfs_commit_chunk(page, page->mapping, from, to);
+	nilfs_commit_chunk(page, page->mapping, from, to);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 /*	NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
-	mark_inode_dirty(dir);
+	nilfs_mark_inode_dirty(dir);
 	/* OFFSET_CACHE */
 out_put:
 	nilfs_put_page(page);
@@ -595,10 +592,9 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
 	if (pde)
 		pde->rec_len = cpu_to_le16(to - from);
 	dir->inode = 0;
-	err = nilfs_commit_chunk(page, mapping, from, to);
+	nilfs_commit_chunk(page, mapping, from, to);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 /*	NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
-	mark_inode_dirty(inode);
 out:
 	nilfs_put_page(page);
 	return err;
@@ -640,7 +636,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
 	memcpy(de->name, "..\0", 4);
 	nilfs_set_de_type(de, inode);
 	kunmap_atomic(kaddr, KM_USER0);
-	err = nilfs_commit_chunk(page, mapping, 0, chunk_size);
+	nilfs_commit_chunk(page, mapping, 0, chunk_size);
 fail:
 	page_cache_release(page);
 	return err;
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
index 93383c5cee9..dd5f7e0a95f 100644
--- a/fs/nilfs2/gcdat.c
+++ b/fs/nilfs2/gcdat.c
@@ -61,6 +61,8 @@ void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
 
 	nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
 
+	nilfs_palloc_clear_cache(dat);
+	nilfs_palloc_clear_cache(gcdat);
 	nilfs_clear_dirty_pages(mapping);
 	nilfs_copy_back_pages(mapping, gmapping);
 	/* note: mdt dirty flags should be cleared by segctor. */
@@ -79,6 +81,7 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
 	gcdat->i_state = I_CLEAR;
 	gii->i_flags = 0;
 
+	nilfs_palloc_clear_cache(gcdat);
 	truncate_inode_pages(gcdat->i_mapping, 0);
 	truncate_inode_pages(&gii->i_btnode_cache, 0);
 }
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e6de0a27ab5..e16a6664dfa 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -149,7 +149,7 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
 				   __u64 vbn, struct buffer_head **out_bh)
 {
 	int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
-					    vbn ? : pbn, pbn, out_bh, 0);
+					    vbn ? : pbn, pbn, out_bh);
 	if (ret == -EEXIST) /* internal code (cache hit) */
 		ret = 0;
 	return ret;
@@ -212,9 +212,10 @@ void nilfs_destroy_gccache(struct the_nilfs *nilfs)
 static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
 				   __u64 cno)
 {
-	struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
+	struct inode *inode;
 	struct nilfs_inode_info *ii;
 
+	inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
 	if (!inode)
 		return NULL;
 
@@ -265,7 +266,6 @@ struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
  */
 void nilfs_clear_gcinode(struct inode *inode)
 {
-	nilfs_mdt_clear(inode);
 	nilfs_mdt_destroy(inode);
 }
 
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index de86401f209..922d9dd42c8 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -29,6 +29,17 @@
 #include "alloc.h"
 #include "ifile.h"
 
+
+struct nilfs_ifile_info {
+	struct nilfs_mdt_info mi;
+	struct nilfs_palloc_cache palloc_cache;
+};
+
+static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile)
+{
+	return (struct nilfs_ifile_info *)NILFS_MDT(ifile);
+}
+
 /**
  * nilfs_ifile_create_inode - create a new disk inode
  * @ifile: ifile inode
@@ -148,3 +159,27 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
 	}
 	return err;
 }
+
+/**
+ * nilfs_ifile_new - create inode file
+ * @sbi: nilfs_sb_info struct
+ * @inode_size: size of an inode
+ */
+struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size)
+{
+	struct inode *ifile;
+	int err;
+
+	ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO,
+			      sizeof(struct nilfs_ifile_info));
+	if (ifile) {
+		err = nilfs_palloc_init_blockgroup(ifile, inode_size);
+		if (unlikely(err)) {
+			nilfs_mdt_destroy(ifile);
+			return NULL;
+		}
+		nilfs_palloc_setup_cache(ifile,
+					 &NILFS_IFILE_I(ifile)->palloc_cache);
+	}
+	return ifile;
+}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index ecc3ba76db4..cbca32e498f 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,4 +49,6 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
 
+struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size);
+
 #endif	/* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2a0a5a3ac13..7868cc122ac 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -97,6 +97,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
 			nilfs_transaction_abort(inode->i_sb);
 			goto out;
 		}
+		nilfs_mark_inode_dirty(inode);
 		nilfs_transaction_commit(inode->i_sb); /* never fails */
 		/* Error handling should be detailed */
 		set_buffer_new(bh_result);
@@ -322,7 +323,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 				    nilfs_init_acl(), proper cancellation of
 				    above jobs should be considered */
 
-	mark_inode_dirty(inode);
 	return inode;
 
  failed_acl:
@@ -525,7 +525,6 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
 
 	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
 
-	/* The buffer is guarded with lock_buffer() by the caller */
 	if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
 		memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
 	set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
@@ -599,6 +598,7 @@ void nilfs_truncate(struct inode *inode)
 	if (IS_SYNC(inode))
 		nilfs_set_transaction_flag(NILFS_TI_SYNC);
 
+	nilfs_mark_inode_dirty(inode);
 	nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
 	nilfs_transaction_commit(sb);
 	/* May construct a logical segment and may fail in sync mode.
@@ -623,6 +623,7 @@ void nilfs_delete_inode(struct inode *inode)
 		truncate_inode_pages(&inode->i_data, 0);
 
 	nilfs_truncate_bmap(ii, 0);
+	nilfs_mark_inode_dirty(inode);
 	nilfs_free_inode(inode);
 	/* nilfs_free_inode() marks inode buffer dirty */
 	if (IS_SYNC(inode))
@@ -745,9 +746,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
 			      "failed to reget inode block.\n");
 		return err;
 	}
-	lock_buffer(ibh);
 	nilfs_update_inode(inode, ibh);
-	unlock_buffer(ibh);
 	nilfs_mdt_mark_buffer_dirty(ibh);
 	nilfs_mdt_mark_dirty(sbi->s_ifile);
 	brelse(ibh);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index f6326112d64..06713ffcc7f 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -186,7 +186,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 }
 
 static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
-				struct buffer_head **out_bh)
+				int readahead, struct buffer_head **out_bh)
 {
 	struct buffer_head *first_bh, *bh;
 	unsigned long blkoff;
@@ -200,16 +200,18 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
 	if (unlikely(err))
 		goto failed;
 
-	blkoff = block + 1;
-	for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
-		err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
-		if (likely(!err || err == -EEXIST))
-			brelse(bh);
-		else if (err != -EBUSY)
-			break; /* abort readahead if bmap lookup failed */
-
-		if (!buffer_locked(first_bh))
-			goto out_no_wait;
+	if (readahead) {
+		blkoff = block + 1;
+		for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
+			err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
+			if (likely(!err || err == -EEXIST))
+				brelse(bh);
+			else if (err != -EBUSY)
+				break;
+				/* abort readahead if bmap lookup failed */
+			if (!buffer_locked(first_bh))
+				goto out_no_wait;
+		}
 	}
 
 	wait_on_buffer(first_bh);
@@ -263,7 +265,7 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
 
 	/* Should be rewritten with merging nilfs_mdt_read_block() */
  retry:
-	ret = nilfs_mdt_read_block(inode, blkoff, out_bh);
+	ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh);
 	if (!create || ret != -ENOENT)
 		return ret;
 
@@ -371,7 +373,7 @@ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
 	struct buffer_head *bh;
 	int err;
 
-	err = nilfs_mdt_read_block(inode, block, &bh);
+	err = nilfs_mdt_read_block(inode, block, 0, &bh);
 	if (unlikely(err))
 		return err;
 	nilfs_mark_buffer_dirty(bh);
@@ -445,9 +447,17 @@ static const struct file_operations def_mdt_fops;
  * longer than those of the super block structs; they may continue for
  * several consecutive mounts/umounts.  This would need discussions.
  */
+/**
+ * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
+ * @nilfs: nilfs object
+ * @sb: super block instance the metadata file belongs to
+ * @ino: inode number
+ * @gfp_mask: gfp mask for data pages
+ * @objsz: size of the private object attached to inode->i_private
+ */
 struct inode *
 nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
-		     ino_t ino, gfp_t gfp_mask)
+		     ino_t ino, gfp_t gfp_mask, size_t objsz)
 {
 	struct inode *inode = nilfs_alloc_inode_common(nilfs);
 
@@ -455,8 +465,9 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 		return NULL;
 	else {
 		struct address_space * const mapping = &inode->i_data;
-		struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS);
+		struct nilfs_mdt_info *mi;
 
+		mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
 		if (!mi) {
 			nilfs_destroy_inode(inode);
 			return NULL;
@@ -513,11 +524,11 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 }
 
 struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
-			    ino_t ino)
+			    ino_t ino, size_t objsz)
 {
-	struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino,
-						   NILFS_MDT_GFP);
+	struct inode *inode;
 
+	inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz);
 	if (!inode)
 		return NULL;
 
@@ -544,14 +555,15 @@ void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
 		&NILFS_I(orig)->i_btnode_cache;
 }
 
-void nilfs_mdt_clear(struct inode *inode)
+static void nilfs_mdt_clear(struct inode *inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 
 	invalidate_mapping_pages(inode->i_mapping, 0, -1);
 	truncate_inode_pages(inode->i_mapping, 0);
 
-	nilfs_bmap_clear(ii->i_bmap);
+	if (test_bit(NILFS_I_BMAP, &ii->i_state))
+		nilfs_bmap_clear(ii->i_bmap);
 	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
 }
 
@@ -559,6 +571,10 @@ void nilfs_mdt_destroy(struct inode *inode)
 {
 	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
 
+	if (mdi->mi_palloc_cache)
+		nilfs_palloc_destroy_cache(inode);
+	nilfs_mdt_clear(inode);
+
 	kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
 	kfree(mdi);
 	nilfs_destroy_inode(inode);
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 431599733c9..6c4bbb0470f 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -36,6 +36,7 @@
  * @mi_entry_size: size of an entry
  * @mi_first_entry_offset: offset to the first entry
  * @mi_entries_per_block: number of entries in a block
+ * @mi_palloc_cache: persistent object allocator cache
  * @mi_blocks_per_group: number of blocks in a group
  * @mi_blocks_per_desc_block: number of blocks per descriptor block
  */
@@ -46,6 +47,7 @@ struct nilfs_mdt_info {
 	unsigned		mi_entry_size;
 	unsigned		mi_first_entry_offset;
 	unsigned long		mi_entries_per_block;
+	struct nilfs_palloc_cache *mi_palloc_cache;
 	unsigned long		mi_blocks_per_group;
 	unsigned long		mi_blocks_per_desc_block;
 };
@@ -74,11 +76,11 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);
 
-struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t);
+struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
+			    size_t);
 struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
-				   ino_t, gfp_t);
+				   ino_t, gfp_t, size_t);
 void nilfs_mdt_destroy(struct inode *);
-void nilfs_mdt_clear(struct inode *);
 void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
 void nilfs_mdt_set_shadow(struct inode *, struct inode *);
 
@@ -104,21 +106,4 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode)
 #define nilfs_mdt_bgl_lock(inode, bg) \
 	(&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
 
-
-static inline int
-nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh,
-			    unsigned n)
-{
-	return nilfs_read_inode_common(
-		inode, (struct nilfs_inode *)(bh->b_data + n));
-}
-
-static inline void
-nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh,
-			     unsigned n)
-{
-	nilfs_write_inode_common(
-		inode, (struct nilfs_inode *)(bh->b_data + n), 1);
-}
-
 #endif /* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ed02e886fa7..07ba838ef08 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -120,7 +120,7 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		inode->i_op = &nilfs_file_inode_operations;
 		inode->i_fop = &nilfs_file_operations;
 		inode->i_mapping->a_ops = &nilfs_aops;
-		mark_inode_dirty(inode);
+		nilfs_mark_inode_dirty(inode);
 		err = nilfs_add_nondir(dentry, inode);
 	}
 	if (!err)
@@ -148,7 +148,7 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
-		mark_inode_dirty(inode);
+		nilfs_mark_inode_dirty(inode);
 		err = nilfs_add_nondir(dentry, inode);
 	}
 	if (!err)
@@ -188,7 +188,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
 		goto out_fail;
 
 	/* mark_inode_dirty(inode); */
-	/* nilfs_new_inode() and page_symlink() do this */
+	/* page_symlink() do this */
 
 	err = nilfs_add_nondir(dentry, inode);
 out:
@@ -200,7 +200,8 @@ out:
 	return err;
 
 out_fail:
-	inode_dec_link_count(inode);
+	drop_nlink(inode);
+	nilfs_mark_inode_dirty(inode);
 	iput(inode);
 	goto out;
 }
@@ -245,7 +246,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (err)
 		return err;
 
-	inode_inc_link_count(dir);
+	inc_nlink(dir);
 
 	inode = nilfs_new_inode(dir, S_IFDIR | mode);
 	err = PTR_ERR(inode);
@@ -256,7 +257,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	inode->i_fop = &nilfs_dir_operations;
 	inode->i_mapping->a_ops = &nilfs_aops;
 
-	inode_inc_link_count(inode);
+	inc_nlink(inode);
 
 	err = nilfs_make_empty(inode, dir);
 	if (err)
@@ -266,6 +267,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (err)
 		goto out_fail;
 
+	nilfs_mark_inode_dirty(inode);
 	d_instantiate(dentry, inode);
 out:
 	if (!err)
@@ -276,26 +278,23 @@ out:
 	return err;
 
 out_fail:
-	inode_dec_link_count(inode);
-	inode_dec_link_count(inode);
+	drop_nlink(inode);
+	drop_nlink(inode);
+	nilfs_mark_inode_dirty(inode);
 	iput(inode);
 out_dir:
-	inode_dec_link_count(dir);
+	drop_nlink(dir);
+	nilfs_mark_inode_dirty(dir);
 	goto out;
 }
 
-static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
+static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode;
 	struct nilfs_dir_entry *de;
 	struct page *page;
-	struct nilfs_transaction_info ti;
 	int err;
 
-	err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
-	if (err)
-		return err;
-
 	err = -ENOENT;
 	de = nilfs_find_entry(dir, dentry, &page);
 	if (!de)
@@ -317,12 +316,28 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
 		goto out;
 
 	inode->i_ctime = dir->i_ctime;
-	inode_dec_link_count(inode);
+	drop_nlink(inode);
 	err = 0;
 out:
-	if (!err)
+	return err;
+}
+
+static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct nilfs_transaction_info ti;
+	int err;
+
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
+	if (err)
+		return err;
+
+	err = nilfs_do_unlink(dir, dentry);
+
+	if (!err) {
+		nilfs_mark_inode_dirty(dir);
+		nilfs_mark_inode_dirty(dentry->d_inode);
 		err = nilfs_transaction_commit(dir->i_sb);
-	else
+	} else
 		nilfs_transaction_abort(dir->i_sb);
 
 	return err;
@@ -340,11 +355,13 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	err = -ENOTEMPTY;
 	if (nilfs_empty_dir(inode)) {
-		err = nilfs_unlink(dir, dentry);
+		err = nilfs_do_unlink(dir, dentry);
 		if (!err) {
 			inode->i_size = 0;
-			inode_dec_link_count(inode);
-			inode_dec_link_count(dir);
+			drop_nlink(inode);
+			nilfs_mark_inode_dirty(inode);
+			drop_nlink(dir);
+			nilfs_mark_inode_dirty(dir);
 		}
 	}
 	if (!err)
@@ -395,42 +412,48 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_de = nilfs_find_entry(new_dir, new_dentry, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
+		inc_nlink(old_inode);
 		nilfs_set_link(new_dir, new_de, new_page, old_inode);
+		nilfs_mark_inode_dirty(new_dir);
 		new_inode->i_ctime = CURRENT_TIME;
 		if (dir_de)
 			drop_nlink(new_inode);
-		inode_dec_link_count(new_inode);
+		drop_nlink(new_inode);
+		nilfs_mark_inode_dirty(new_inode);
 	} else {
 		if (dir_de) {
 			err = -EMLINK;
 			if (new_dir->i_nlink >= NILFS_LINK_MAX)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
+		inc_nlink(old_inode);
 		err = nilfs_add_link(new_dentry, old_inode);
 		if (err) {
-			inode_dec_link_count(old_inode);
+			drop_nlink(old_inode);
+			nilfs_mark_inode_dirty(old_inode);
 			goto out_dir;
 		}
-		if (dir_de)
-			inode_inc_link_count(new_dir);
+		if (dir_de) {
+			inc_nlink(new_dir);
+			nilfs_mark_inode_dirty(new_dir);
+		}
 	}
 
 	/*
 	 * Like most other Unix systems, set the ctime for inodes on a
 	 * rename.
-	 * inode_dec_link_count() will mark the inode dirty.
 	 */
 	old_inode->i_ctime = CURRENT_TIME;
 
 	nilfs_delete_entry(old_de, old_page);
-	inode_dec_link_count(old_inode);
+	drop_nlink(old_inode);
 
 	if (dir_de) {
 		nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
-		inode_dec_link_count(old_dir);
+		drop_nlink(old_dir);
 	}
+	nilfs_mark_inode_dirty(old_dir);
+	nilfs_mark_inode_dirty(old_inode);
 
 	err = nilfs_transaction_commit(old_dir->i_sb);
 	return err;
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 6dc83591d11..c9c96c7825d 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -770,14 +770,8 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 		nilfs_finish_roll_forward(nilfs, sbi, ri);
 	}
 
-	nilfs_detach_checkpoint(sbi);
-	return 0;
-
  failed:
 	nilfs_detach_checkpoint(sbi);
-	nilfs_mdt_clear(nilfs->ns_cpfile);
-	nilfs_mdt_clear(nilfs->ns_sufile);
-	nilfs_mdt_clear(nilfs->ns_dat);
 	return err;
 }
 
@@ -804,6 +798,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 	struct nilfs_segsum_info ssi;
 	sector_t pseg_start, pseg_end, sr_pseg_start = 0;
 	sector_t seg_start, seg_end; /* range of full segment (block number) */
+	sector_t b, end;
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	__u64 cno;
@@ -819,6 +814,11 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 	/* Calculate range of segment */
 	nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
 
+	/* Read ahead segment */
+	b = seg_start;
+	while (b <= seg_end)
+		sb_breadahead(sbi->s_super, b++);
+
 	for (;;) {
 		/* Load segment summary */
 		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
@@ -841,14 +841,20 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 		ri->ri_nextnum = nextnum;
 		empty_seg = 0;
 
+		if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) {
+			/* This will never happen because a superblock
+			   (last_segment) always points to a pseg
+			   having a super root. */
+			ret = NILFS_SEG_FAIL_CONSISTENCY;
+			goto failed;
+		}
+
+		if (pseg_start == seg_start) {
+			nilfs_get_segment_range(nilfs, nextnum, &b, &end);
+			while (b <= end)
+				sb_breadahead(sbi->s_super, b++);
+		}
 		if (!NILFS_SEG_HAS_SR(&ssi)) {
-			if (!scan_newer) {
-				/* This will never happen because a superblock
-				   (last_segment) always points to a pseg
-				   having a super root. */
-				ret = NILFS_SEG_FAIL_CONSISTENCY;
-				goto failed;
-			}
 			if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
 				ri->ri_lsegs_start = pseg_start;
 				ri->ri_lsegs_start_seq = seg_seq;
@@ -919,7 +925,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 
  super_root_found:
 	/* Updating pointers relating to the latest checkpoint */
-	list_splice(&segments, ri->ri_used_segments.prev);
+	list_splice_tail(&segments, &ri->ri_used_segments);
 	nilfs->ns_last_pseg = sr_pseg_start;
 	nilfs->ns_last_seq = nilfs->ns_seg_seq;
 	nilfs->ns_last_cno = ri->ri_cno;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index e6d9e37fa24..645c78656aa 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -24,10 +24,22 @@
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/crc32.h>
+#include <linux/backing-dev.h>
 #include "page.h"
 #include "segbuf.h"
 
 
+struct nilfs_write_info {
+	struct the_nilfs       *nilfs;
+	struct bio	       *bio;
+	int 			start, end; /* The region to be submitted */
+	int			rest_blocks;
+	int			max_pages;
+	int			nr_vecs;
+	sector_t		blocknr;
+};
+
+
 static struct kmem_cache *nilfs_segbuf_cachep;
 
 static void nilfs_segbuf_init_once(void *obj)
@@ -63,6 +75,11 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 	INIT_LIST_HEAD(&segbuf->sb_list);
 	INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
 	INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+
+	init_completion(&segbuf->sb_bio_event);
+	atomic_set(&segbuf->sb_err, 0);
+	segbuf->sb_nbio = 0;
+
 	return segbuf;
 }
 
@@ -83,6 +100,22 @@ void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum,
 		segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
 }
 
+/**
+ * nilfs_segbuf_map_cont - map a new log behind a given log
+ * @segbuf: new segment buffer
+ * @prev: segment buffer containing a log to be continued
+ */
+void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
+			   struct nilfs_segment_buffer *prev)
+{
+	segbuf->sb_segnum = prev->sb_segnum;
+	segbuf->sb_fseg_start = prev->sb_fseg_start;
+	segbuf->sb_fseg_end = prev->sb_fseg_end;
+	segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks;
+	segbuf->sb_rest_blocks =
+		segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
+}
+
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
 				  __u64 nextnum, struct the_nilfs *nilfs)
 {
@@ -132,8 +165,6 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
 	segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
 	segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
 	segbuf->sb_sum.ctime = ctime;
-
-	segbuf->sb_io_error = 0;
 	return 0;
 }
 
@@ -219,7 +250,7 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_datasum = cpu_to_le32(crc);
 }
 
-void nilfs_release_buffers(struct list_head *list)
+static void nilfs_release_buffers(struct list_head *list)
 {
 	struct buffer_head *bh, *n;
 
@@ -241,13 +272,56 @@ void nilfs_release_buffers(struct list_head *list)
 	}
 }
 
+static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
+{
+	nilfs_release_buffers(&segbuf->sb_segsum_buffers);
+	nilfs_release_buffers(&segbuf->sb_payload_buffers);
+}
+
+/*
+ * Iterators for segment buffers
+ */
+void nilfs_clear_logs(struct list_head *logs)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	list_for_each_entry(segbuf, logs, sb_list)
+		nilfs_segbuf_clear(segbuf);
+}
+
+void nilfs_truncate_logs(struct list_head *logs,
+			 struct nilfs_segment_buffer *last)
+{
+	struct nilfs_segment_buffer *n, *segbuf;
+
+	segbuf = list_prepare_entry(last, logs, sb_list);
+	list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) {
+		list_del_init(&segbuf->sb_list);
+		nilfs_segbuf_clear(segbuf);
+		nilfs_segbuf_free(segbuf);
+	}
+}
+
+int nilfs_wait_on_logs(struct list_head *logs)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int err;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		err = nilfs_segbuf_wait(segbuf);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
 /*
  * BIO operations
  */
 static void nilfs_end_bio_write(struct bio *bio, int err)
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct nilfs_write_info *wi = bio->bi_private;
+	struct nilfs_segment_buffer *segbuf = bio->bi_private;
 
 	if (err == -EOPNOTSUPP) {
 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
@@ -256,21 +330,22 @@ static void nilfs_end_bio_write(struct bio *bio, int err)
 	}
 
 	if (!uptodate)
-		atomic_inc(&wi->err);
+		atomic_inc(&segbuf->sb_err);
 
 	bio_put(bio);
-	complete(&wi->bio_event);
+	complete(&segbuf->sb_bio_event);
 }
 
-static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
+static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
+				   struct nilfs_write_info *wi, int mode)
 {
 	struct bio *bio = wi->bio;
 	int err;
 
-	if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) {
-		wait_for_completion(&wi->bio_event);
-		wi->nbio--;
-		if (unlikely(atomic_read(&wi->err))) {
+	if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) {
+		wait_for_completion(&segbuf->sb_bio_event);
+		segbuf->sb_nbio--;
+		if (unlikely(atomic_read(&segbuf->sb_err))) {
 			bio_put(bio);
 			err = -EIO;
 			goto failed;
@@ -278,7 +353,7 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
 	}
 
 	bio->bi_end_io = nilfs_end_bio_write;
-	bio->bi_private = wi;
+	bio->bi_private = segbuf;
 	bio_get(bio);
 	submit_bio(mode, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
@@ -286,7 +361,7 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
 		err = -EOPNOTSUPP;
 		goto failed;
 	}
-	wi->nbio++;
+	segbuf->sb_nbio++;
 	bio_put(bio);
 
 	wi->bio = NULL;
@@ -301,17 +376,15 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
 }
 
 /**
- * nilfs_alloc_seg_bio - allocate a bio for writing segment.
- * @sb: super block
- * @start: beginning disk block number of this BIO.
+ * nilfs_alloc_seg_bio - allocate a new bio for writing log
+ * @nilfs: nilfs object
+ * @start: start block number of the bio
  * @nr_vecs: request size of page vector.
  *
- * alloc_seg_bio() allocates a new BIO structure and initialize it.
- *
  * Return Value: On success, pointer to the struct bio is returned.
  * On error, NULL is returned.
  */
-static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
+static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
 				       int nr_vecs)
 {
 	struct bio *bio;
@@ -322,36 +395,33 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
 			bio = bio_alloc(GFP_NOIO, nr_vecs);
 	}
 	if (likely(bio)) {
-		bio->bi_bdev = sb->s_bdev;
-		bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9);
+		bio->bi_bdev = nilfs->ns_bdev;
+		bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9);
 	}
 	return bio;
 }
 
-void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
-				struct nilfs_write_info *wi)
+static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
+				       struct nilfs_write_info *wi)
 {
 	wi->bio = NULL;
 	wi->rest_blocks = segbuf->sb_sum.nblocks;
-	wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev);
+	wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev);
 	wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
 	wi->start = wi->end = 0;
-	wi->nbio = 0;
 	wi->blocknr = segbuf->sb_pseg_start;
-
-	atomic_set(&wi->err, 0);
-	init_completion(&wi->bio_event);
 }
 
-static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
-			   int mode)
+static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
+				  struct nilfs_write_info *wi,
+				  struct buffer_head *bh, int mode)
 {
 	int len, err;
 
 	BUG_ON(wi->nr_vecs <= 0);
  repeat:
 	if (!wi->bio) {
-		wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end,
+		wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end,
 					      wi->nr_vecs);
 		if (unlikely(!wi->bio))
 			return -ENOMEM;
@@ -363,76 +433,83 @@ static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
 		return 0;
 	}
 	/* bio is FULL */
-	err = nilfs_submit_seg_bio(wi, mode);
+	err = nilfs_segbuf_submit_bio(segbuf, wi, mode);
 	/* never submit current bh */
 	if (likely(!err))
 		goto repeat;
 	return err;
 }
 
+/**
+ * nilfs_segbuf_write - submit write requests of a log
+ * @segbuf: buffer storing a log to be written
+ * @nilfs: nilfs object
+ *
+ * Return Value: On Success, 0 is returned. On Error, one of the following
+ * negative error code is returned.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
 int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
-		       struct nilfs_write_info *wi)
+		       struct the_nilfs *nilfs)
 {
+	struct nilfs_write_info wi;
 	struct buffer_head *bh;
-	int res, rw = WRITE;
+	int res = 0, rw = WRITE;
+
+	wi.nilfs = nilfs;
+	nilfs_segbuf_prepare_write(segbuf, &wi);
 
 	list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
-		res = nilfs_submit_bh(wi, bh, rw);
+		res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
 		if (unlikely(res))
 			goto failed_bio;
 	}
 
 	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
-		res = nilfs_submit_bh(wi, bh, rw);
+		res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
 		if (unlikely(res))
 			goto failed_bio;
 	}
 
-	if (wi->bio) {
+	if (wi.bio) {
 		/*
 		 * Last BIO is always sent through the following
 		 * submission.
 		 */
 		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
-		res = nilfs_submit_seg_bio(wi, rw);
-		if (unlikely(res))
-			goto failed_bio;
+		res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
 	}
 
-	res = 0;
- out:
-	return res;
-
  failed_bio:
-	atomic_inc(&wi->err);
-	goto out;
+	return res;
 }
 
 /**
  * nilfs_segbuf_wait - wait for completion of requested BIOs
- * @wi: nilfs_write_info
+ * @segbuf: segment buffer
  *
  * Return Value: On Success, 0 is returned. On Error, one of the following
  * negative error code is returned.
  *
  * %-EIO - I/O error
  */
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf,
-		      struct nilfs_write_info *wi)
+int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
 {
 	int err = 0;
 
-	if (!wi->nbio)
+	if (!segbuf->sb_nbio)
 		return 0;
 
 	do {
-		wait_for_completion(&wi->bio_event);
-	} while (--wi->nbio > 0);
+		wait_for_completion(&segbuf->sb_bio_event);
+	} while (--segbuf->sb_nbio > 0);
 
-	if (unlikely(atomic_read(&wi->err) > 0)) {
+	if (unlikely(atomic_read(&segbuf->sb_err) > 0)) {
 		printk(KERN_ERR "NILFS: IO error writing segment\n");
 		err = -EIO;
-		segbuf->sb_io_error = 1;
 	}
 	return err;
 }
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 0c3076f4e59..6af1630fb40 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -27,7 +27,6 @@
 #include <linux/buffer_head.h>
 #include <linux/bio.h>
 #include <linux/completion.h>
-#include <linux/backing-dev.h>
 
 /**
  * struct nilfs_segsum_info - On-memory segment summary
@@ -77,7 +76,9 @@ struct nilfs_segsum_info {
  * @sb_rest_blocks: Number of residual blocks in the current segment
  * @sb_segsum_buffers: List of buffers for segment summaries
  * @sb_payload_buffers: List of buffers for segment payload
- * @sb_io_error: I/O error status
+ * @sb_nbio: Number of flying bio requests
+ * @sb_err: I/O error status
+ * @sb_bio_event: Completion event of log writing
  */
 struct nilfs_segment_buffer {
 	struct super_block     *sb_super;
@@ -96,7 +97,9 @@ struct nilfs_segment_buffer {
 	struct list_head	sb_payload_buffers; /* including super root */
 
 	/* io status */
-	int			sb_io_error;
+	int			sb_nbio;
+	atomic_t		sb_err;
+	struct completion	sb_bio_event;
 };
 
 #define NILFS_LIST_SEGBUF(head)  \
@@ -125,6 +128,8 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
 void nilfs_segbuf_free(struct nilfs_segment_buffer *);
 void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
 		      struct the_nilfs *);
+void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
+			   struct nilfs_segment_buffer *prev);
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
 				  struct the_nilfs *);
 int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
@@ -161,41 +166,18 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
 	segbuf->sb_sum.nfileblk++;
 }
 
-void nilfs_release_buffers(struct list_head *);
+int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+		       struct the_nilfs *nilfs);
+int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
 
-static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
+void nilfs_clear_logs(struct list_head *logs);
+void nilfs_truncate_logs(struct list_head *logs,
+			 struct nilfs_segment_buffer *last);
+int nilfs_wait_on_logs(struct list_head *logs);
+
+static inline void nilfs_destroy_logs(struct list_head *logs)
 {
-	nilfs_release_buffers(&segbuf->sb_segsum_buffers);
-	nilfs_release_buffers(&segbuf->sb_payload_buffers);
+	nilfs_truncate_logs(logs, NULL);
 }
 
-struct nilfs_write_info {
-	struct bio	       *bio;
-	int 			start, end; /* The region to be submitted */
-	int			rest_blocks;
-	int			max_pages;
-	int			nr_vecs;
-	sector_t		blocknr;
-
-	int			nbio;
-	atomic_t		err;
-	struct completion	bio_event;
-				/* completion event of segment write */
-
-	/*
-	 * The following fields must be set explicitly
-	 */
-	struct super_block     *sb;
-	struct backing_dev_info *bdi; /* backing dev info */
-	struct buffer_head     *bh_sr;
-};
-
-
-void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *,
-				struct nilfs_write_info *);
-int nilfs_segbuf_write(struct nilfs_segment_buffer *,
-		       struct nilfs_write_info *);
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *,
-		      struct nilfs_write_info *);
-
 #endif /* _NILFS_SEGBUF_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6eff66a070d..17584c52448 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -974,12 +974,12 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 			      nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
 	raw_sr->sr_flags = 0;
 
-	nilfs_mdt_write_inode_direct(
-		nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz));
-	nilfs_mdt_write_inode_direct(
-		nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz));
-	nilfs_mdt_write_inode_direct(
-		nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz));
+	nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr +
+				 NILFS_SR_DAT_OFFSET(isz), 1);
+	nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
+				 NILFS_SR_CPFILE_OFFSET(isz), 1);
+	nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
+				 NILFS_SR_SUFILE_OFFSET(isz), 1);
 }
 
 static void nilfs_redirty_inodes(struct list_head *head)
@@ -1273,73 +1273,75 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 	return err;
 }
 
-static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum)
-{
-	struct buffer_head *bh_su;
-	struct nilfs_segment_usage *raw_su;
-	int err;
-
-	err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
-	if (unlikely(err))
-		return err;
-	nilfs_mdt_mark_buffer_dirty(bh_su);
-	nilfs_mdt_mark_dirty(sufile);
-	nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
-	return 0;
-}
-
+/**
+ * nilfs_segctor_begin_construction - setup segment buffer to make a new log
+ * @sci: nilfs_sc_info
+ * @nilfs: nilfs object
+ */
 static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
 					    struct the_nilfs *nilfs)
 {
-	struct nilfs_segment_buffer *segbuf, *n;
+	struct nilfs_segment_buffer *segbuf, *prev;
 	__u64 nextnum;
-	int err;
+	int err, alloc = 0;
 
-	if (list_empty(&sci->sc_segbufs)) {
-		segbuf = nilfs_segbuf_new(sci->sc_super);
-		if (unlikely(!segbuf))
-			return -ENOMEM;
-		list_add(&segbuf->sb_list, &sci->sc_segbufs);
-	} else
-		segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+	segbuf = nilfs_segbuf_new(sci->sc_super);
+	if (unlikely(!segbuf))
+		return -ENOMEM;
+
+	if (list_empty(&sci->sc_write_logs)) {
+		nilfs_segbuf_map(segbuf, nilfs->ns_segnum,
+				 nilfs->ns_pseg_offset, nilfs);
+		if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
+			nilfs_shift_to_next_segment(nilfs);
+			nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
+		}
 
-	nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset,
-			 nilfs);
+		segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
+		nextnum = nilfs->ns_nextnum;
 
-	if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
-		nilfs_shift_to_next_segment(nilfs);
-		nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
+		if (nilfs->ns_segnum == nilfs->ns_nextnum)
+			/* Start from the head of a new full segment */
+			alloc++;
+	} else {
+		/* Continue logs */
+		prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
+		nilfs_segbuf_map_cont(segbuf, prev);
+		segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq;
+		nextnum = prev->sb_nextnum;
+
+		if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
+			nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
+			segbuf->sb_sum.seg_seq++;
+			alloc++;
+		}
 	}
-	sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
 
-	err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum);
-	if (unlikely(err))
-		return err;
+	err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum);
+	if (err)
+		goto failed;
 
-	if (nilfs->ns_segnum == nilfs->ns_nextnum) {
-		/* Start from the head of a new full segment */
+	if (alloc) {
 		err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
-		if (unlikely(err))
-			return err;
-	} else
-		nextnum = nilfs->ns_nextnum;
-
-	segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
+		if (err)
+			goto failed;
+	}
 	nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
 
-	/* truncating segment buffers */
-	list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
-					  sb_list) {
-		list_del_init(&segbuf->sb_list);
-		nilfs_segbuf_free(segbuf);
-	}
+	BUG_ON(!list_empty(&sci->sc_segbufs));
+	list_add_tail(&segbuf->sb_list, &sci->sc_segbufs);
+	sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
 	return 0;
+
+ failed:
+	nilfs_segbuf_free(segbuf);
+	return err;
 }
 
 static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
 					 struct the_nilfs *nilfs, int nadd)
 {
-	struct nilfs_segment_buffer *segbuf, *prev, *n;
+	struct nilfs_segment_buffer *segbuf, *prev;
 	struct inode *sufile = nilfs->ns_sufile;
 	__u64 nextnextnum;
 	LIST_HEAD(list);
@@ -1352,7 +1354,7 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
 	 * not be dirty.  The following call ensures that the buffer is dirty
 	 * and will pin the buffer on memory until the sufile is written.
 	 */
-	err = nilfs_touch_segusage(sufile, prev->sb_nextnum);
+	err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum);
 	if (unlikely(err))
 		return err;
 
@@ -1378,33 +1380,33 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
 		list_add_tail(&segbuf->sb_list, &list);
 		prev = segbuf;
 	}
-	list_splice(&list, sci->sc_segbufs.prev);
+	list_splice_tail(&list, &sci->sc_segbufs);
 	return 0;
 
  failed_segbuf:
 	nilfs_segbuf_free(segbuf);
  failed:
-	list_for_each_entry_safe(segbuf, n, &list, sb_list) {
+	list_for_each_entry(segbuf, &list, sb_list) {
 		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
 		WARN_ON(ret); /* never fails */
-		list_del_init(&segbuf->sb_list);
-		nilfs_segbuf_free(segbuf);
 	}
+	nilfs_destroy_logs(&list);
 	return err;
 }
 
-static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
-						   struct the_nilfs *nilfs)
+static void nilfs_free_incomplete_logs(struct list_head *logs,
+				       struct the_nilfs *nilfs)
 {
-	struct nilfs_segment_buffer *segbuf;
-	int ret, done = 0;
+	struct nilfs_segment_buffer *segbuf, *prev;
+	struct inode *sufile = nilfs->ns_sufile;
+	int ret;
 
-	segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+	segbuf = NILFS_FIRST_SEGBUF(logs);
 	if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
-		ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
+		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
 		WARN_ON(ret); /* never fails */
 	}
-	if (segbuf->sb_io_error) {
+	if (atomic_read(&segbuf->sb_err)) {
 		/* Case 1: The first segment failed */
 		if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
 			/* Case 1a:  Partial segment appended into an existing
@@ -1413,106 +1415,54 @@ static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
 						segbuf->sb_fseg_end);
 		else /* Case 1b:  New full segment */
 			set_nilfs_discontinued(nilfs);
-		done++;
 	}
 
-	list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
-		ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
-		WARN_ON(ret); /* never fails */
-		if (!done && segbuf->sb_io_error) {
-			if (segbuf->sb_segnum != nilfs->ns_nextnum)
-				/* Case 2: extended segment (!= next) failed */
-				nilfs_sufile_set_error(nilfs->ns_sufile,
-						       segbuf->sb_segnum);
-			done++;
-		}
-	}
-}
-
-static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
-{
-	struct nilfs_segment_buffer *segbuf;
-
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list)
-		nilfs_segbuf_clear(segbuf);
-	sci->sc_super_root = NULL;
-}
-
-static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
-{
-	struct nilfs_segment_buffer *segbuf;
-
-	while (!list_empty(&sci->sc_segbufs)) {
-		segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
-		list_del_init(&segbuf->sb_list);
-		nilfs_segbuf_free(segbuf);
-	}
-	/* sci->sc_curseg = NULL; */
-}
-
-static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
-					   struct the_nilfs *nilfs, int err)
-{
-	if (unlikely(err)) {
-		nilfs_segctor_free_incomplete_segments(sci, nilfs);
-		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
-			int ret;
-
-			ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
-							sci->sc_freesegs,
-							sci->sc_nfreesegs,
-							NULL);
-			WARN_ON(ret); /* do not happen */
+	prev = segbuf;
+	list_for_each_entry_continue(segbuf, logs, sb_list) {
+		if (prev->sb_nextnum != segbuf->sb_nextnum) {
+			ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
+			WARN_ON(ret); /* never fails */
 		}
+		if (atomic_read(&segbuf->sb_err) &&
+		    segbuf->sb_segnum != nilfs->ns_nextnum)
+			/* Case 2: extended segment (!= next) failed */
+			nilfs_sufile_set_error(sufile, segbuf->sb_segnum);
+		prev = segbuf;
 	}
-	nilfs_segctor_clear_segment_buffers(sci);
 }
 
 static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
 					  struct inode *sufile)
 {
 	struct nilfs_segment_buffer *segbuf;
-	struct buffer_head *bh_su;
-	struct nilfs_segment_usage *raw_su;
 	unsigned long live_blocks;
 	int ret;
 
 	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
-						     &raw_su, &bh_su);
-		WARN_ON(ret); /* always succeed because bh_su is dirty */
 		live_blocks = segbuf->sb_sum.nblocks +
 			(segbuf->sb_pseg_start - segbuf->sb_fseg_start);
-		raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime);
-		raw_su->su_nblocks = cpu_to_le32(live_blocks);
-		nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
-					       bh_su);
+		ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
+						     live_blocks,
+						     sci->sc_seg_ctime);
+		WARN_ON(ret); /* always succeed because the segusage is dirty */
 	}
 }
 
-static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci,
-					  struct inode *sufile)
+static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile)
 {
 	struct nilfs_segment_buffer *segbuf;
-	struct buffer_head *bh_su;
-	struct nilfs_segment_usage *raw_su;
 	int ret;
 
-	segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
-	ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
-					     &raw_su, &bh_su);
-	WARN_ON(ret); /* always succeed because bh_su is dirty */
-	raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start -
-					 segbuf->sb_fseg_start);
-	nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
+	segbuf = NILFS_FIRST_SEGBUF(logs);
+	ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
+					     segbuf->sb_pseg_start -
+					     segbuf->sb_fseg_start, 0);
+	WARN_ON(ret); /* always succeed because the segusage is dirty */
 
-	list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
-		ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
-						     &raw_su, &bh_su);
+	list_for_each_entry_continue(segbuf, logs, sb_list) {
+		ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
+						     0, 0);
 		WARN_ON(ret); /* always succeed */
-		raw_su->su_nblocks = 0;
-		nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
-					       bh_su);
 	}
 }
 
@@ -1520,17 +1470,15 @@ static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
 					    struct nilfs_segment_buffer *last,
 					    struct inode *sufile)
 {
-	struct nilfs_segment_buffer *segbuf = last, *n;
+	struct nilfs_segment_buffer *segbuf = last;
 	int ret;
 
-	list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
-					  sb_list) {
-		list_del_init(&segbuf->sb_list);
+	list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
 		sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
 		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
 		WARN_ON(ret);
-		nilfs_segbuf_free(segbuf);
 	}
+	nilfs_truncate_logs(&sci->sc_segbufs, last);
 }
 
 
@@ -1569,7 +1517,7 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 							NULL);
 			WARN_ON(err); /* do not happen */
 		}
-		nilfs_segctor_clear_segment_buffers(sci);
+		nilfs_clear_logs(&sci->sc_segbufs);
 
 		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
 		if (unlikely(err))
@@ -1814,26 +1762,18 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
 }
 
 static int nilfs_segctor_write(struct nilfs_sc_info *sci,
-			       struct backing_dev_info *bdi)
+			       struct the_nilfs *nilfs)
 {
 	struct nilfs_segment_buffer *segbuf;
-	struct nilfs_write_info wi;
-	int err, res;
-
-	wi.sb = sci->sc_super;
-	wi.bh_sr = sci->sc_super_root;
-	wi.bdi = bdi;
+	int ret = 0;
 
 	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		nilfs_segbuf_prepare_write(segbuf, &wi);
-		err = nilfs_segbuf_write(segbuf, &wi);
-
-		res = nilfs_segbuf_wait(segbuf, &wi);
-		err = err ? : res;
-		if (err)
-			return err;
+		ret = nilfs_segbuf_write(segbuf, nilfs);
+		if (ret)
+			break;
 	}
-	return 0;
+	list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
+	return ret;
 }
 
 static void __nilfs_end_page_io(struct page *page, int err)
@@ -1911,15 +1851,17 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
 	}
 }
 
-static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
-				      struct page *failed_page, int err)
+static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
+			     struct buffer_head *bh_sr, int err)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
+	struct buffer_head *bh;
 
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		struct buffer_head *bh;
+	if (list_empty(logs))
+		return;
 
+	list_for_each_entry(segbuf, logs, sb_list) {
 		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
 				    b_assoc_buffers) {
 			if (bh->b_page != bd_page) {
@@ -1931,7 +1873,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			if (bh == sci->sc_super_root) {
+			if (bh == bh_sr) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
 					bd_page = bh->b_page;
@@ -1941,7 +1883,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
 			if (bh->b_page != fs_page) {
 				nilfs_end_page_io(fs_page, err);
 				if (fs_page && fs_page == failed_page)
-					goto done;
+					return;
 				fs_page = bh->b_page;
 			}
 		}
@@ -1950,8 +1892,34 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
 		end_page_writeback(bd_page);
 
 	nilfs_end_page_io(fs_page, err);
- done:
+}
+
+static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
+					     struct the_nilfs *nilfs, int err)
+{
+	LIST_HEAD(logs);
+	int ret;
+
+	list_splice_tail_init(&sci->sc_write_logs, &logs);
+	ret = nilfs_wait_on_logs(&logs);
+	if (ret)
+		nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
+
+	list_splice_tail_init(&sci->sc_segbufs, &logs);
+	nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
+	nilfs_free_incomplete_logs(&logs, nilfs);
 	nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
+
+	if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+		ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+						sci->sc_freesegs,
+						sci->sc_nfreesegs,
+						NULL);
+		WARN_ON(ret); /* do not happen */
+	}
+
+	nilfs_destroy_logs(&logs);
+	sci->sc_super_root = NULL;
 }
 
 static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1973,7 +1941,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int update_sr = (sci->sc_super_root != NULL);
 
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+	list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
 		struct buffer_head *bh;
 
 		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
@@ -2046,7 +2014,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 
 	sci->sc_nblk_inc += sci->sc_nblk_this_inc;
 
-	segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
+	segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
 	nilfs_set_next_segment(nilfs, segbuf);
 
 	if (update_sr) {
@@ -2057,10 +2025,23 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 		clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
 		clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
 		set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
+		nilfs_segctor_clear_metadata_dirty(sci);
 	} else
 		clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
 }
 
+static int nilfs_segctor_wait(struct nilfs_sc_info *sci)
+{
+	int ret;
+
+	ret = nilfs_wait_on_logs(&sci->sc_write_logs);
+	if (!ret) {
+		nilfs_segctor_complete_write(sci);
+		nilfs_destroy_logs(&sci->sc_write_logs);
+	}
+	return ret;
+}
+
 static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 					struct nilfs_sb_info *sbi)
 {
@@ -2173,7 +2154,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		/* Avoid empty segment */
 		if (sci->sc_stage.scnt == NILFS_ST_DONE &&
 		    NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
-			nilfs_segctor_end_construction(sci, nilfs, 1);
+			nilfs_segctor_abort_construction(sci, nilfs, 1);
 			goto out;
 		}
 
@@ -2187,7 +2168,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		if (has_sr) {
 			err = nilfs_segctor_fill_in_checkpoint(sci);
 			if (unlikely(err))
-				goto failed_to_make_up;
+				goto failed_to_write;
 
 			nilfs_segctor_fill_in_super_root(sci, nilfs);
 		}
@@ -2195,42 +2176,46 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 
 		/* Write partial segments */
 		err = nilfs_segctor_prepare_write(sci, &failed_page);
-		if (unlikely(err))
+		if (err) {
+			nilfs_abort_logs(&sci->sc_segbufs, failed_page,
+					 sci->sc_super_root, err);
 			goto failed_to_write;
-
+		}
 		nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
 
-		err = nilfs_segctor_write(sci, nilfs->ns_bdi);
+		err = nilfs_segctor_write(sci, nilfs);
 		if (unlikely(err))
 			goto failed_to_write;
 
-		nilfs_segctor_complete_write(sci);
-
-		/* Commit segments */
-		if (has_sr)
-			nilfs_segctor_clear_metadata_dirty(sci);
-
-		nilfs_segctor_end_construction(sci, nilfs, 0);
-
+		if (sci->sc_stage.scnt == NILFS_ST_DONE ||
+		    nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
+			/*
+			 * At this point, we avoid double buffering
+			 * for blocksize < pagesize because page dirty
+			 * flag is turned off during write and dirty
+			 * buffers are not properly collected for
+			 * pages crossing over segments.
+			 */
+			err = nilfs_segctor_wait(sci);
+			if (err)
+				goto failed_to_write;
+		}
 	} while (sci->sc_stage.scnt != NILFS_ST_DONE);
 
+	sci->sc_super_root = NULL;
+
  out:
-	nilfs_segctor_destroy_segment_buffers(sci);
 	nilfs_segctor_check_out_files(sci, sbi);
 	return err;
 
  failed_to_write:
-	nilfs_segctor_abort_write(sci, failed_page, err);
-	nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile);
-
- failed_to_make_up:
 	if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
 		nilfs_redirty_inodes(&sci->sc_dirty_files);
 
  failed:
 	if (nilfs_doing_gc())
 		nilfs_redirty_inodes(&sci->sc_gc_inodes);
-	nilfs_segctor_end_construction(sci, nilfs, err);
+	nilfs_segctor_abort_construction(sci, nilfs, err);
 	goto out;
 }
 
@@ -2559,7 +2544,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 
 	sci->sc_freesegs = kbufs[4];
 	sci->sc_nfreesegs = argv[4].v_nmembs;
-	list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
+	list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes);
 
 	for (;;) {
 		nilfs_segctor_accept(sci, &req);
@@ -2788,6 +2773,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
 	spin_lock_init(&sci->sc_state_lock);
 	INIT_LIST_HEAD(&sci->sc_dirty_files);
 	INIT_LIST_HEAD(&sci->sc_segbufs);
+	INIT_LIST_HEAD(&sci->sc_write_logs);
 	INIT_LIST_HEAD(&sci->sc_gc_inodes);
 	INIT_LIST_HEAD(&sci->sc_copied_buffers);
 
@@ -2855,6 +2841,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 	}
 
 	WARN_ON(!list_empty(&sci->sc_segbufs));
+	WARN_ON(!list_empty(&sci->sc_write_logs));
 
 	down_write(&sbi->s_nilfs->ns_segctor_sem);
 
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 0d2a475a741..3d3ab2f9864 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -97,6 +97,7 @@ struct nilfs_segsum_pointer {
  * @sc_dsync_start: start byte offset of data pages
  * @sc_dsync_end: end byte offset of data pages (inclusive)
  * @sc_segbufs: List of segment buffers
+ * @sc_write_logs: List of segment buffers to hold logs under writing
  * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
  * @sc_curseg: Current segment buffer
  * @sc_super_root: Pointer to the super root buffer
@@ -143,6 +144,7 @@ struct nilfs_sc_info {
 
 	/* Segment buffers */
 	struct list_head	sc_segbufs;
+	struct list_head	sc_write_logs;
 	unsigned long		sc_segbuf_nblocks;
 	struct nilfs_segment_buffer *sc_curseg;
 	struct buffer_head     *sc_super_root;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 37994d4a59c..b6c36d0cc33 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -31,6 +31,16 @@
 #include "sufile.h"
 
 
+struct nilfs_sufile_info {
+	struct nilfs_mdt_info mi;
+	unsigned long ncleansegs;
+};
+
+static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile)
+{
+	return (struct nilfs_sufile_info *)NILFS_MDT(sufile);
+}
+
 static inline unsigned long
 nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
 {
@@ -62,14 +72,6 @@ nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
 		     max - curr + 1);
 }
 
-static inline struct nilfs_sufile_header *
-nilfs_sufile_block_get_header(const struct inode *sufile,
-			      struct buffer_head *bh,
-			      void *kaddr)
-{
-	return kaddr + bh_offset(bh);
-}
-
 static struct nilfs_segment_usage *
 nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
 				     struct buffer_head *bh, void *kaddr)
@@ -110,6 +112,15 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
 }
 
 /**
+ * nilfs_sufile_get_ncleansegs - return the number of clean segments
+ * @sufile: inode of segment usage file
+ */
+unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile)
+{
+	return NILFS_SUI(sufile)->ncleansegs;
+}
+
+/**
  * nilfs_sufile_updatev - modify multiple segment usages at a time
  * @sufile: inode of segment usage file
  * @segnumv: array of segment numbers
@@ -270,7 +281,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 	if (ret < 0)
 		goto out_sem;
 	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-	header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
+	header = kaddr + bh_offset(header_bh);
 	ncleansegs = le64_to_cpu(header->sh_ncleansegs);
 	last_alloc = le64_to_cpu(header->sh_last_alloc);
 	kunmap_atomic(kaddr, KM_USER0);
@@ -302,13 +313,13 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 			kunmap_atomic(kaddr, KM_USER0);
 
 			kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-			header = nilfs_sufile_block_get_header(
-				sufile, header_bh, kaddr);
+			header = kaddr + bh_offset(header_bh);
 			le64_add_cpu(&header->sh_ncleansegs, -1);
 			le64_add_cpu(&header->sh_ndirtysegs, 1);
 			header->sh_last_alloc = cpu_to_le64(segnum);
 			kunmap_atomic(kaddr, KM_USER0);
 
+			NILFS_SUI(sufile)->ncleansegs--;
 			nilfs_mdt_mark_buffer_dirty(header_bh);
 			nilfs_mdt_mark_buffer_dirty(su_bh);
 			nilfs_mdt_mark_dirty(sufile);
@@ -351,6 +362,8 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
 	kunmap_atomic(kaddr, KM_USER0);
 
 	nilfs_sufile_mod_counter(header_bh, -1, 1);
+	NILFS_SUI(sufile)->ncleansegs--;
+
 	nilfs_mdt_mark_buffer_dirty(su_bh);
 	nilfs_mdt_mark_dirty(sufile);
 }
@@ -380,6 +393,8 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
 	kunmap_atomic(kaddr, KM_USER0);
 
 	nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
+	NILFS_SUI(sufile)->ncleansegs -= clean;
+
 	nilfs_mdt_mark_buffer_dirty(su_bh);
 	nilfs_mdt_mark_dirty(sufile);
 }
@@ -409,79 +424,65 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
 	nilfs_mdt_mark_buffer_dirty(su_bh);
 
 	nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
+	NILFS_SUI(sufile)->ncleansegs++;
+
 	nilfs_mdt_mark_dirty(sufile);
 }
 
 /**
- * nilfs_sufile_get_segment_usage - get a segment usage
+ * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty
  * @sufile: inode of segment usage file
  * @segnum: segment number
- * @sup: pointer to segment usage
- * @bhp: pointer to buffer head
- *
- * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
- * specified by @segnum.
- *
- * Return Value: On success, 0 is returned, and the segment usage and the
- * buffer head of the buffer on which the segment usage is located are stored
- * in the place pointed by @sup and @bhp, respectively. On error, one of the
- * following negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-EINVAL - Invalid segment usage number.
  */
-int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum,
-				   struct nilfs_segment_usage **sup,
-				   struct buffer_head **bhp)
+int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
 {
 	struct buffer_head *bh;
-	struct nilfs_segment_usage *su;
-	void *kaddr;
 	int ret;
 
-	/* segnum is 0 origin */
-	if (segnum >= nilfs_sufile_get_nsegments(sufile))
-		return -EINVAL;
-	down_write(&NILFS_MDT(sufile)->mi_sem);
-	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
-	if (ret < 0)
-		goto out_sem;
-	kaddr = kmap(bh->b_page);
-	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
-	if (nilfs_segment_usage_error(su)) {
-		kunmap(bh->b_page);
+	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
+	if (!ret) {
+		nilfs_mdt_mark_buffer_dirty(bh);
+		nilfs_mdt_mark_dirty(sufile);
 		brelse(bh);
-		ret = -EINVAL;
-		goto out_sem;
 	}
-
-	if (sup != NULL)
-		*sup = su;
-	*bhp = bh;
-
- out_sem:
-	up_write(&NILFS_MDT(sufile)->mi_sem);
 	return ret;
 }
 
 /**
- * nilfs_sufile_put_segment_usage - put a segment usage
+ * nilfs_sufile_set_segment_usage - set usage of a segment
  * @sufile: inode of segment usage file
  * @segnum: segment number
- * @bh: buffer head
- *
- * Description: nilfs_sufile_put_segment_usage() releases the segment usage
- * specified by @segnum. @bh must be the buffer head which have been returned
- * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
+ * @nblocks: number of live blocks in the segment
+ * @modtime: modification time (option)
  */
-void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum,
-				    struct buffer_head *bh)
+int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
+				   unsigned long nblocks, time_t modtime)
 {
-	kunmap(bh->b_page);
+	struct buffer_head *bh;
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+	int ret;
+
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
+	if (ret < 0)
+		goto out_sem;
+
+	kaddr = kmap_atomic(bh->b_page, KM_USER0);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+	WARN_ON(nilfs_segment_usage_error(su));
+	if (modtime)
+		su->su_lastmod = cpu_to_le64(modtime);
+	su->su_nblocks = cpu_to_le32(nblocks);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nilfs_mdt_mark_buffer_dirty(bh);
+	nilfs_mdt_mark_dirty(sufile);
 	brelse(bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
 }
 
 /**
@@ -515,7 +516,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
 		goto out_sem;
 
 	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-	header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
+	header = kaddr + bh_offset(header_bh);
 	sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
 	sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
 	sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
@@ -532,33 +533,6 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
 	return ret;
 }
 
-/**
- * nilfs_sufile_get_ncleansegs - get the number of clean segments
- * @sufile: inode of segment usage file
- * @nsegsp: pointer to the number of clean segments
- *
- * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean
- * segments.
- *
- * Return Value: On success, 0 is returned and the number of clean segments is
- * stored in the place pointed by @nsegsp. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- */
-int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
-{
-	struct nilfs_sustat sustat;
-	int ret;
-
-	ret = nilfs_sufile_get_stat(sufile, &sustat);
-	if (ret == 0)
-		*nsegsp = sustat.ss_ncleansegs;
-	return ret;
-}
-
 void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
 			       struct buffer_head *header_bh,
 			       struct buffer_head *su_bh)
@@ -577,8 +551,10 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
 	nilfs_segment_usage_set_error(su);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	if (suclean)
+	if (suclean) {
 		nilfs_sufile_mod_counter(header_bh, -1, 0);
+		NILFS_SUI(sufile)->ncleansegs--;
+	}
 	nilfs_mdt_mark_buffer_dirty(su_bh);
 	nilfs_mdt_mark_dirty(sufile);
 }
@@ -657,3 +633,48 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 	up_read(&NILFS_MDT(sufile)->mi_sem);
 	return ret;
 }
+
+/**
+ * nilfs_sufile_read - read sufile inode
+ * @sufile: sufile inode
+ * @raw_inode: on-disk sufile inode
+ */
+int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode)
+{
+	struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+	struct buffer_head *header_bh;
+	struct nilfs_sufile_header *header;
+	void *kaddr;
+	int ret;
+
+	ret = nilfs_read_inode_common(sufile, raw_inode);
+	if (ret < 0)
+		return ret;
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (!ret) {
+		kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+		header = kaddr + bh_offset(header_bh);
+		sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
+		kunmap_atomic(kaddr, KM_USER0);
+		brelse(header_bh);
+	}
+	return ret;
+}
+
+/**
+ * nilfs_sufile_new - create sufile
+ * @nilfs: nilfs object
+ * @susize: size of a segment usage entry
+ */
+struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize)
+{
+	struct inode *sufile;
+
+	sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO,
+			       sizeof(struct nilfs_sufile_info));
+	if (sufile)
+		nilfs_mdt_set_entry_size(sufile, susize,
+					 sizeof(struct nilfs_sufile_header));
+	return sufile;
+}
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 0e99e5c0bd0..15163b8aff7 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -34,14 +34,13 @@ static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
 	return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
 }
 
+unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
+
 int nilfs_sufile_alloc(struct inode *, __u64 *);
-int nilfs_sufile_get_segment_usage(struct inode *, __u64,
-				   struct nilfs_segment_usage **,
-				   struct buffer_head **);
-void nilfs_sufile_put_segment_usage(struct inode *, __u64,
-				    struct buffer_head *);
+int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
+int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
+				   unsigned long nblocks, time_t modtime);
 int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
-int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
 ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
 				size_t);
 
@@ -62,6 +61,9 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
 void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
 			       struct buffer_head *);
 
+int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode);
+struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize);
+
 /**
  * nilfs_sufile_scrap - make a segment garbage
  * @sufile: inode of segment usage file
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 644e66727dd..5403b3ef3a4 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -363,14 +363,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	list_add(&sbi->s_list, &nilfs->ns_supers);
 	up_write(&nilfs->ns_super_sem);
 
-	sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO);
+	sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
 	if (!sbi->s_ifile)
 		return -ENOMEM;
 
-	err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
-	if (unlikely(err))
-		goto failed;
-
 	down_read(&nilfs->ns_segctor_sem);
 	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
 					  &bh_cp);
@@ -411,7 +407,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
-	nilfs_mdt_clear(sbi->s_ifile);
 	nilfs_mdt_destroy(sbi->s_ifile);
 	sbi->s_ifile = NULL;
 	down_write(&nilfs->ns_super_sem);
@@ -419,22 +414,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
 	up_write(&nilfs->ns_super_sem);
 }
 
-static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
-{
-	struct the_nilfs *nilfs = sbi->s_nilfs;
-	int err = 0;
-
-	down_write(&nilfs->ns_sem);
-	if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
-		nilfs->ns_mount_state |= NILFS_VALID_FS;
-		err = nilfs_commit_super(sbi, 1);
-		if (likely(!err))
-			printk(KERN_INFO "NILFS: recovery complete.\n");
-	}
-	up_write(&nilfs->ns_sem);
-	return err;
-}
-
 static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
@@ -490,7 +469,7 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 
 	if (!nilfs_test_opt(sbi, BARRIER))
-		seq_printf(seq, ",barrier=off");
+		seq_printf(seq, ",nobarrier");
 	if (nilfs_test_opt(sbi, SNAPSHOT))
 		seq_printf(seq, ",cp=%llu",
 			   (unsigned long long int)sbi->s_snapshot_cno);
@@ -500,6 +479,8 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",errors=panic");
 	if (nilfs_test_opt(sbi, STRICT_ORDER))
 		seq_printf(seq, ",order=strict");
+	if (nilfs_test_opt(sbi, NORECOVERY))
+		seq_printf(seq, ",norecovery");
 
 	return 0;
 }
@@ -568,7 +549,7 @@ static const struct export_operations nilfs_export_ops = {
 
 enum {
 	Opt_err_cont, Opt_err_panic, Opt_err_ro,
-	Opt_barrier, Opt_snapshot, Opt_order,
+	Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
 	Opt_err,
 };
 
@@ -576,25 +557,13 @@ static match_table_t tokens = {
 	{Opt_err_cont, "errors=continue"},
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_ro, "errors=remount-ro"},
-	{Opt_barrier, "barrier=%s"},
+	{Opt_nobarrier, "nobarrier"},
 	{Opt_snapshot, "cp=%u"},
 	{Opt_order, "order=%s"},
+	{Opt_norecovery, "norecovery"},
 	{Opt_err, NULL}
 };
 
-static int match_bool(substring_t *s, int *result)
-{
-	int len = s->to - s->from;
-
-	if (strncmp(s->from, "on", len) == 0)
-		*result = 1;
-	else if (strncmp(s->from, "off", len) == 0)
-		*result = 0;
-	else
-		return 1;
-	return 0;
-}
-
 static int parse_options(char *options, struct super_block *sb)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -612,13 +581,8 @@ static int parse_options(char *options, struct super_block *sb)
 
 		token = match_token(p, tokens, args);
 		switch (token) {
-		case Opt_barrier:
-			if (match_bool(&args[0], &option))
-				return 0;
-			if (option)
-				nilfs_set_opt(sbi, BARRIER);
-			else
-				nilfs_clear_opt(sbi, BARRIER);
+		case Opt_nobarrier:
+			nilfs_clear_opt(sbi, BARRIER);
 			break;
 		case Opt_order:
 			if (strcmp(args[0].from, "relaxed") == 0)
@@ -647,6 +611,9 @@ static int parse_options(char *options, struct super_block *sb)
 			sbi->s_snapshot_cno = option;
 			nilfs_set_opt(sbi, SNAPSHOT);
 			break;
+		case Opt_norecovery:
+			nilfs_set_opt(sbi, NORECOVERY);
+			break;
 		default:
 			printk(KERN_ERR
 			       "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -672,9 +639,7 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
 	int mnt_count = le16_to_cpu(sbp->s_mnt_count);
 
 	/* nilfs->sem must be locked by the caller. */
-	if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
-		printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
-	} else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
+	if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
 		printk(KERN_WARNING
 		       "NILFS warning: mounting fs with errors\n");
 #if 0
@@ -782,11 +747,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	sb->s_root = NULL;
 	sb->s_time_gran = 1;
 
-	if (!nilfs_loaded(nilfs)) {
-		err = load_nilfs(nilfs, sbi);
-		if (err)
-			goto failed_sbi;
-	}
+	err = load_nilfs(nilfs, sbi);
+	if (err)
+		goto failed_sbi;
+
 	cno = nilfs_last_cno(nilfs);
 
 	if (sb->s_flags & MS_RDONLY) {
@@ -854,12 +818,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 		up_write(&nilfs->ns_sem);
 	}
 
-	err = nilfs_mark_recovery_complete(sbi);
-	if (unlikely(err)) {
-		printk(KERN_ERR "NILFS: recovery failed.\n");
-		goto failed_root;
-	}
-
 	down_write(&nilfs->ns_super_sem);
 	if (!nilfs_test_opt(sbi, SNAPSHOT))
 		nilfs->ns_current = sbi;
@@ -867,10 +825,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 
 	return 0;
 
- failed_root:
-	dput(sb->s_root);
-	sb->s_root = NULL;
-
  failed_segctor:
 	nilfs_detach_segment_constructor(sbi);
 
@@ -915,6 +869,14 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		goto restore_opts;
 	}
 
+	if (!nilfs_valid_fs(nilfs)) {
+		printk(KERN_WARNING "NILFS (device %s): couldn't "
+		       "remount because the filesystem is in an "
+		       "incomplete recovery state.\n", sb->s_id);
+		err = -EINVAL;
+		goto restore_opts;
+	}
+
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		goto out;
 	if (*flags & MS_RDONLY) {
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ad391a8c3e7..6241e1722ef 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -146,13 +146,9 @@ void put_nilfs(struct the_nilfs *nilfs)
 
 	might_sleep();
 	if (nilfs_loaded(nilfs)) {
-		nilfs_mdt_clear(nilfs->ns_sufile);
 		nilfs_mdt_destroy(nilfs->ns_sufile);
-		nilfs_mdt_clear(nilfs->ns_cpfile);
 		nilfs_mdt_destroy(nilfs->ns_cpfile);
-		nilfs_mdt_clear(nilfs->ns_dat);
 		nilfs_mdt_destroy(nilfs->ns_dat);
-		/* XXX: how and when to clear nilfs->ns_gc_dat? */
 		nilfs_mdt_destroy(nilfs->ns_gc_dat);
 	}
 	if (nilfs_init(nilfs)) {
@@ -166,7 +162,6 @@ void put_nilfs(struct the_nilfs *nilfs)
 static int nilfs_load_super_root(struct the_nilfs *nilfs,
 				 struct nilfs_sb_info *sbi, sector_t sr_block)
 {
-	static struct lock_class_key dat_lock_key;
 	struct buffer_head *bh_sr;
 	struct nilfs_super_root *raw_sr;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
@@ -187,51 +182,36 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
 	inode_size = nilfs->ns_inode_size;
 
 	err = -ENOMEM;
-	nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
+	nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size);
 	if (unlikely(!nilfs->ns_dat))
 		goto failed;
 
-	nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
+	nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size);
 	if (unlikely(!nilfs->ns_gc_dat))
 		goto failed_dat;
 
-	nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO);
+	nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size);
 	if (unlikely(!nilfs->ns_cpfile))
 		goto failed_gc_dat;
 
-	nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO);
+	nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
 	if (unlikely(!nilfs->ns_sufile))
 		goto failed_cpfile;
 
-	err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
-	if (unlikely(err))
-		goto failed_sufile;
-
-	err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
-	if (unlikely(err))
-		goto failed_sufile;
-
-	lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key);
-	lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key);
-
 	nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
-	nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
-				 sizeof(struct nilfs_cpfile_header));
-	nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
-				 sizeof(struct nilfs_sufile_header));
 
-	err = nilfs_mdt_read_inode_direct(
-		nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size));
+	err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
+			     NILFS_SR_DAT_OFFSET(inode_size));
 	if (unlikely(err))
 		goto failed_sufile;
 
-	err = nilfs_mdt_read_inode_direct(
-		nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size));
+	err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
+				NILFS_SR_CPFILE_OFFSET(inode_size));
 	if (unlikely(err))
 		goto failed_sufile;
 
-	err = nilfs_mdt_read_inode_direct(
-		nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size));
+	err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
+				NILFS_SR_SUFILE_OFFSET(inode_size));
 	if (unlikely(err))
 		goto failed_sufile;
 
@@ -281,29 +261,30 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	struct nilfs_recovery_info ri;
 	unsigned int s_flags = sbi->s_super->s_flags;
 	int really_read_only = bdev_read_only(nilfs->ns_bdev);
-	unsigned valid_fs;
-	int err = 0;
-
-	nilfs_init_recovery_info(&ri);
+	int valid_fs = nilfs_valid_fs(nilfs);
+	int err;
 
-	down_write(&nilfs->ns_sem);
-	valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
-	up_write(&nilfs->ns_sem);
+	if (nilfs_loaded(nilfs)) {
+		if (valid_fs ||
+		    ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
+			return 0;
+		printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
+		       "recovery state.\n");
+		return -EINVAL;
+	}
 
-	if (!valid_fs && (s_flags & MS_RDONLY)) {
-		printk(KERN_INFO "NILFS: INFO: recovery "
-		       "required for readonly filesystem.\n");
-		if (really_read_only) {
-			printk(KERN_ERR "NILFS: write access "
-			       "unavailable, cannot proceed.\n");
-			err = -EROFS;
-			goto failed;
+	if (!valid_fs) {
+		printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
+		if (s_flags & MS_RDONLY) {
+			printk(KERN_INFO "NILFS: INFO: recovery "
+			       "required for readonly filesystem.\n");
+			printk(KERN_INFO "NILFS: write access will "
+			       "be enabled during recovery.\n");
 		}
-		printk(KERN_INFO "NILFS: write access will "
-		       "be enabled during recovery.\n");
-		sbi->s_super->s_flags &= ~MS_RDONLY;
 	}
 
+	nilfs_init_recovery_info(&ri);
+
 	err = nilfs_search_super_root(nilfs, sbi, &ri);
 	if (unlikely(err)) {
 		printk(KERN_ERR "NILFS: error searching super root.\n");
@@ -316,19 +297,56 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 		goto failed;
 	}
 
-	if (!valid_fs) {
-		err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
-		if (unlikely(err)) {
-			nilfs_mdt_destroy(nilfs->ns_cpfile);
-			nilfs_mdt_destroy(nilfs->ns_sufile);
-			nilfs_mdt_destroy(nilfs->ns_dat);
-			goto failed;
+	if (valid_fs)
+		goto skip_recovery;
+
+	if (s_flags & MS_RDONLY) {
+		if (nilfs_test_opt(sbi, NORECOVERY)) {
+			printk(KERN_INFO "NILFS: norecovery option specified. "
+			       "skipping roll-forward recovery\n");
+			goto skip_recovery;
 		}
-		if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED)
-			sbi->s_super->s_dirt = 1;
+		if (really_read_only) {
+			printk(KERN_ERR "NILFS: write access "
+			       "unavailable, cannot proceed.\n");
+			err = -EROFS;
+			goto failed_unload;
+		}
+		sbi->s_super->s_flags &= ~MS_RDONLY;
+	} else if (nilfs_test_opt(sbi, NORECOVERY)) {
+		printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
+		       "option was specified for a read/write mount\n");
+		err = -EINVAL;
+		goto failed_unload;
 	}
 
+	err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
+	if (err)
+		goto failed_unload;
+
+	down_write(&nilfs->ns_sem);
+	nilfs->ns_mount_state |= NILFS_VALID_FS;
+	nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
+	err = nilfs_commit_super(sbi, 1);
+	up_write(&nilfs->ns_sem);
+
+	if (err) {
+		printk(KERN_ERR "NILFS: failed to update super block. "
+		       "recovery unfinished.\n");
+		goto failed_unload;
+	}
+	printk(KERN_INFO "NILFS: recovery complete.\n");
+
+ skip_recovery:
 	set_nilfs_loaded(nilfs);
+	nilfs_clear_recovery_info(&ri);
+	sbi->s_super->s_flags = s_flags;
+	return 0;
+
+ failed_unload:
+	nilfs_mdt_destroy(nilfs->ns_cpfile);
+	nilfs_mdt_destroy(nilfs->ns_sufile);
+	nilfs_mdt_destroy(nilfs->ns_dat);
 
  failed:
 	nilfs_clear_recovery_info(&ri);
@@ -632,30 +650,23 @@ int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
 {
 	struct inode *dat = nilfs_dat_inode(nilfs);
 	unsigned long ncleansegs;
-	int err;
 
 	down_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
-	err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs);
+	ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
 	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
-	if (likely(!err))
-		*nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
-	return err;
+	*nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
+	return 0;
 }
 
 int nilfs_near_disk_full(struct the_nilfs *nilfs)
 {
-	struct inode *sufile = nilfs->ns_sufile;
 	unsigned long ncleansegs, nincsegs;
-	int ret;
 
-	ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs);
-	if (likely(!ret)) {
-		nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
-			nilfs->ns_blocks_per_segment + 1;
-		if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs)
-			ret++;
-	}
-	return ret;
+	ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
+	nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
+		nilfs->ns_blocks_per_segment + 1;
+
+	return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
 }
 
 /**
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 20abd55881e..589786e3346 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -258,6 +258,16 @@ static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
 		kfree(sbi);
 }
 
+static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
+{
+	unsigned valid_fs;
+
+	down_read(&nilfs->ns_sem);
+	valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
+	up_read(&nilfs->ns_sem);
+	return valid_fs;
+}
+
 static inline void
 nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
 			sector_t *seg_start, sector_t *seg_end)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index dcd2040d330..5ef5f365a5c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -69,36 +69,30 @@ static int zero;
 
 ctl_table inotify_table[] = {
 	{
-		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
 		.procname	= "max_user_instances",
 		.data		= &inotify_max_user_instances,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 	},
 	{
-		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
 		.procname	= "max_user_watches",
 		.data		= &inotify_max_user_watches,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 	},
 	{
-		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
 		.procname	= "max_queued_events",
 		.data		= &inotify_max_queued_events,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 #endif /* CONFIG_SYSCTL */
 
@@ -747,10 +741,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 
 	/* create/update an inode mark */
 	ret = inotify_update_watch(group, inode, mask);
-	if (unlikely(ret))
-		goto path_put_and_out;
-
-path_put_and_out:
 	path_put(&path);
 fput_and_out:
 	fput_light(filp, fput_needed);
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 9669541d011..08f7530e934 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -927,7 +927,7 @@ lock_retry_remap:
 		return 0;
 
 	ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
-			"EOVERFLOW" : (!err ? "EIO" : "unkown error"));
+			"EOVERFLOW" : (!err ? "EIO" : "unknown error"));
 	return err < 0 ? err : -EIO;
 
 read_err:
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 663c0e341f8..43179ddd336 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -399,7 +399,7 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
  * @cached_page: allocated but as yet unused page
  * @lru_pvec:	lru-buffering pagevec of caller
  *
- * Obtain @nr_pages locked page cache pages from the mapping @maping and
+ * Obtain @nr_pages locked page cache pages from the mapping @mapping and
  * starting at index @index.
  *
  * If a page is newly created, increment its refcount and add it to the
@@ -1281,7 +1281,7 @@ rl_not_mapped_enoent:
 
 /*
  * Copy as much as we can into the pages and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then clear the pages
+ * were successfully copied.  If a fault is encountered then clear the pages
  * out to (ofs + bytes) and return the number of bytes which were copied.
  */
 static inline size_t ntfs_copy_from_user(struct page **pages,
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 89b02985c05..4dadcdf3d45 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -338,7 +338,7 @@ err_out:
  * copy of the complete multi sector transfer deprotected page.  On failure,
  * *@wrp is undefined.
  *
- * Simillarly, if @lsn is not NULL, on succes *@lsn will be set to the current
+ * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current
  * logfile lsn according to this restart page.  On failure, *@lsn is undefined.
  *
  * The following error codes are defined:
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index 9ef85e628fe..79a89184cb5 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -36,12 +36,11 @@
 /* Definition of the ntfs sysctl. */
 static ctl_table ntfs_sysctls[] = {
 	{
-		.ctl_name	= CTL_UNNUMBERED,	/* Binary and text IDs. */
 		.procname	= "ntfs-debug",
 		.data		= &debug_msgs,		/* Data pointer and size. */
 		.maxlen		= sizeof(debug_msgs),
 		.mode		= 0644,			/* Mode, proc handler. */
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= proc_dointvec
 	},
 	{}
 };
@@ -49,7 +48,6 @@ static ctl_table ntfs_sysctls[] = {
 /* Define the parent directory /proc/sys/fs. */
 static ctl_table sysctls_root[] = {
 	{
-		.ctl_name	= CTL_FS,
 		.procname	= "fs",
 		.mode		= 0555,
 		.child		= ntfs_sysctls
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 38a42f5d59f..7c7198a5bc9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2398,7 +2398,7 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
  *
  * The array is assumed to be large enough to hold an entire path (tree depth).
  *
- * Upon succesful return from this function:
+ * Upon successful return from this function:
  *
  * - The 'right_path' array will contain a path to the leaf block
  *   whose range contains e_cpos.
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index deb2b132ae5..3dae4a13f6e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -547,6 +547,9 @@ bail:
  *
  * called like this: dio->get_blocks(dio->inode, fs_startblk,
  * 					fs_count, map_bh, dio->rw == WRITE);
+ *
+ * Note that we never bother to allocate blocks here, and thus ignore the
+ * create argument.
  */
 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 				     struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 
 	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 
-	/*
-	 * Any write past EOF is not allowed because we'd be extending.
-	 */
-	if (create && (iblock + max_blocks) > inode_blocks) {
-		ret = -EIO;
-		goto bail;
-	}
-
 	/* This figures out the size of the next contiguous block, and
 	 * our logical offset */
 	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
-		ocfs2_error(inode->i_sb,
-			    "Inode %llu has a hole at block %llu\n",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-			    (unsigned long long)iblock);
-		ret = -EROFS;
-		goto bail;
-	}
-
 	/* We should already CoW the refcounted extent. */
 	BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
 	/*
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	 */
 	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
 		map_bh(bh_result, inode->i_sb, p_blkno);
-	else {
-		/*
-		 * ocfs2_prepare_inode_for_write() should have caught
-		 * the case where we'd be filling a hole and triggered
-		 * a buffered write instead.
-		 */
-		if (create) {
-			ret = -EIO;
-			mlog_errno(ret);
-			goto bail;
-		}
-
+	else
 		clear_buffer_mapped(bh_result);
-	}
 
 	/* make sure we don't map more than max_blocks blocks here as
 	   that's all the kernel will handle at this point. */
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index a1163b8b417..b7428c5d0d3 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -47,7 +47,7 @@
  * Calculate the bit offset in the hamming code buffer based on the bit's
  * offset in the data buffer.  Since the hamming code reserves all
  * power-of-two bits for parity, the data bit number and the code bit
- * number are offest by all the parity bits beforehand.
+ * number are offset by all the parity bits beforehand.
  *
  * Recall that bit numbers in hamming code are 1-based.  This function
  * takes the 0-based data bit from the caller.
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index da794bc07a6..a3f150e52b0 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -294,10 +294,10 @@ static int sc_seq_show(struct seq_file *seq, void *v)
 		if (sc->sc_sock) {
 			inet = inet_sk(sc->sc_sock->sk);
 			/* the stack's structs aren't sparse endian clean */
-			saddr = (__force __be32)inet->saddr;
-			daddr = (__force __be32)inet->daddr;
-			sport = (__force __be16)inet->sport;
-			dport = (__force __be16)inet->dport;
+			saddr = (__force __be32)inet->inet_saddr;
+			daddr = (__force __be32)inet->inet_daddr;
+			sport = (__force __be16)inet->inet_sport;
+			dport = (__force __be16)inet->inet_dport;
 		}
 
 		/* XXX sigh, inet-> doesn't have sparse annotation so any
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 83bcaf266b3..03ccf9a7b1f 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2586,7 +2586,7 @@ fail:
 	 * is complete everywhere.  if the target dies while this is
 	 * going on, some nodes could potentially see the target as the
 	 * master, so it is important that my recovery finds the migration
-	 * mle and sets the master to UNKNONWN. */
+	 * mle and sets the master to UNKNOWN. */
 
 
 	/* wait for new node to assert master */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0d38d67194c..c5e4a49e3a1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1855,7 +1855,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
 		 * outstanding lock request, so a cancel convert is
 		 * required. We intentionally overwrite 'ret' - if the
 		 * cancel fails and the lock was granted, it's easier
-		 * to just bubble sucess back up to the user.
+		 * to just bubble success back up to the user.
 		 */
 		ret = ocfs2_flock_handle_signal(lockres, level);
 	} else if (!ret && (level > lockres->l_level)) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89fc8ee1f5a..3d30a1c974a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1712,7 +1712,8 @@ int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
 	struct super_block *sb = inode->i_sb;
 
 	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
-	    !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+	    !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
+	    OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		return 0;
 
 	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
@@ -2005,7 +2006,7 @@ out_dio:
 	/* buffered aio wouldn't have proper lock coverage today */
 	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
 
-	if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
+	if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode)) {
 		ret = filemap_fdatawrite_range(file->f_mapping, pos,
 					       pos + count - 1);
 		if (ret < 0)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 54c16b66327..bf34c491ae9 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -659,7 +659,7 @@ static int __ocfs2_journal_access(handle_t *handle,
 
 	default:
 		status = -EINVAL;
-		mlog(ML_ERROR, "Uknown access type!\n");
+		mlog(ML_ERROR, "Unknown access type!\n");
 	}
 	if (!status && ocfs2_meta_ecc(osb) && triggers)
 		jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index eae40460242..d963d863870 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -35,12 +35,7 @@
 #include <linux/kref.h>
 #include <linux/mutex.h>
 #include <linux/lockdep.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-# include "ocfs2_jbd_compat.h"
-#endif
+#include <linux/jbd2.h>
 
 /* For union ocfs2_dlm_lksb */
 #include "stackglue.h"
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index e5df9d170b0..123bc520a2c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -17,10 +17,6 @@
 
 #include "ocfs2.h"
 
-/* Common stuff */
-/* id number of quota format */
-#define QFMT_OCFS2 3
-
 /*
  * In-memory structures
  */
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 1a2c50a759f..21f9e71223c 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -1325,7 +1325,7 @@ out:
 	return status;
 }
 
-static struct quota_format_ops ocfs2_format_ops = {
+static const struct quota_format_ops ocfs2_format_ops = {
 	.check_quota_file	= ocfs2_local_check_quota_file,
 	.read_file_info		= ocfs2_local_read_info,
 	.write_file_info	= ocfs2_global_write_info,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 60287fc56bc..30967e3f5e4 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2431,7 +2431,7 @@ out:
  * we gonna touch and whether we need to create new blocks.
  *
  * Normally the refcount blocks store these refcount should be
- * continguous also, so that we can get the number easily.
+ * contiguous also, so that we can get the number easily.
  * As for meta_ac, we will at most add split 2 refcount record and
  * 2 more refcount block, so just check it in a rough way.
  *
@@ -3743,6 +3743,9 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
 		goto out;
 	}
 
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		goto attach_xattr;
+
 	ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
 
 	size = i_size_read(inode);
@@ -3769,6 +3772,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
 		cpos += num_clusters;
 	}
 
+attach_xattr:
 	if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
 		ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
 						       &ref_tree->rf_ci,
@@ -3858,6 +3862,49 @@ out:
 	return ret;
 }
 
+static int ocfs2_duplicate_inline_data(struct inode *s_inode,
+				       struct buffer_head *s_bh,
+				       struct inode *t_inode,
+				       struct buffer_head *t_bh)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+	struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
+	struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
+
+	BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
+	memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
+	       le16_to_cpu(s_di->id2.i_data.id_count));
+	spin_lock(&OCFS2_I(t_inode)->ip_lock);
+	OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
+	t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
+	spin_unlock(&OCFS2_I(t_inode)->ip_lock);
+
+	ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
 static int ocfs2_duplicate_extent_list(struct inode *s_inode,
 				struct inode *t_inode,
 				struct buffer_head *t_bh,
@@ -3997,6 +4044,14 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
 		goto out;
 	}
 
+	if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
+						  t_inode, t_bh);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
 	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
 				       1, &ref_tree, &ref_root_bh);
 	if (ret) {
@@ -4013,10 +4068,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
 		goto out_unlock_refcount;
 	}
 
-	ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh, preserve);
-	if (ret)
-		mlog_errno(ret);
-
 out_unlock_refcount:
 	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	brelse(ref_root_bh);
@@ -4068,9 +4119,17 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
 		ret = ocfs2_reflink_xattrs(inode, old_bh,
 					   new_inode, new_bh,
 					   preserve);
-		if (ret)
+		if (ret) {
 			mlog_errno(ret);
+			goto inode_unlock;
+		}
 	}
+
+	ret = ocfs2_complete_reflink(inode, old_bh,
+				     new_inode, new_bh, preserve);
+	if (ret)
+		mlog_errno(ret);
+
 inode_unlock:
 	ocfs2_inode_unlock(new_inode, 1);
 	brelse(new_bh);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 3f2f1c45b7b..f3df0baa9a4 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -620,51 +620,46 @@ error:
 
 static ctl_table ocfs2_nm_table[] = {
 	{
-		.ctl_name	= 1,
 		.procname	= "hb_ctl_path",
 		.data		= ocfs2_hb_ctl_path,
 		.maxlen		= OCFS2_MAX_HB_CTL_PATH,
 		.mode		= 0644,
-		.proc_handler	= &proc_dostring,
-		.strategy	= &sysctl_string,
+		.proc_handler	= proc_dostring,
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static ctl_table ocfs2_mod_table[] = {
 	{
-		.ctl_name	= FS_OCFS2_NM,
 		.procname	= "nm",
 		.data		= NULL,
 		.maxlen		= 0,
 		.mode		= 0555,
 		.child		= ocfs2_nm_table
 	},
-	{ .ctl_name = 0}
+	{ }
 };
 
 static ctl_table ocfs2_kern_table[] = {
 	{
-		.ctl_name	= FS_OCFS2,
 		.procname	= "ocfs2",
 		.data		= NULL,
 		.maxlen		= 0,
 		.mode		= 0555,
 		.child		= ocfs2_mod_table
 	},
-	{ .ctl_name = 0}
+	{ }
 };
 
 static ctl_table ocfs2_root_table[] = {
 	{
-		.ctl_name	= CTL_FS,
 		.procname	= "fs",
 		.data		= NULL,
 		.maxlen		= 0,
 		.mode		= 0555,
 		.child		= ocfs2_kern_table
 	},
-	{ .ctl_name = 0 }
+	{ }
 };
 
 static struct ctl_table_header *ocfs2_table_header = NULL;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c0e48aeebb1..14f47d2bfe0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -773,18 +773,20 @@ static int ocfs2_sb_probe(struct super_block *sb,
 		if (tmpstat < 0) {
 			status = tmpstat;
 			mlog_errno(status);
-			goto bail;
+			break;
 		}
 		di = (struct ocfs2_dinode *) (*bh)->b_data;
 		memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
 		spin_lock_init(&stats->b_lock);
-		status = ocfs2_verify_volume(di, *bh, blksize, stats);
-		if (status >= 0)
-			goto bail;
-		brelse(*bh);
-		*bh = NULL;
-		if (status != -EAGAIN)
+		tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats);
+		if (tmpstat < 0) {
+			brelse(*bh);
+			*bh = NULL;
+		}
+		if (tmpstat != -EAGAIN) {
+			status = tmpstat;
 			break;
+		}
 	}
 
 bail:
@@ -1645,6 +1647,10 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = numbits;
 	buf->f_ffree = freebits;
+	buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN)
+				& 0xFFFFFFFFUL;
+	buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN,
+				OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL;
 
 	brelse(bh);
 
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index b6284f235d2..c61369342a2 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,11 +53,6 @@
 #include <linux/highmem.h>
 #include <linux/buffer_head.h>
 #include <linux/rbtree.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-#endif
 
 #define MLOG_MASK_PREFIX ML_UPTODATE
 
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
index e1c0ec0ae98..082234581d0 100644
--- a/fs/omfs/bitmap.c
+++ b/fs/omfs/bitmap.c
@@ -85,7 +85,7 @@ out:
 }
 
 /*
- * Tries to allocate exactly one block.  Returns true if sucessful.
+ * Tries to allocate exactly one block.  Returns true if successful.
  */
 int omfs_allocate_block(struct super_block *sb, u64 block)
 {
diff --git a/fs/open.c b/fs/open.c
index 4f01e06227c..b4b31d277f3 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -587,6 +587,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
 	error = -EPERM;
 	if (!capable(CAP_SYS_CHROOT))
 		goto dput_and_out;
+	error = security_path_chroot(&path);
+	if (error)
+		goto dput_and_out;
 
 	set_fs_root(current->fs, &path);
 	error = 0;
@@ -617,11 +620,15 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
 	if (err)
 		goto out_putf;
 	mutex_lock(&inode->i_mutex);
+	err = security_path_chmod(dentry, file->f_vfsmnt, mode);
+	if (err)
+		goto out_unlock;
 	if (mode == (mode_t) -1)
 		mode = inode->i_mode;
 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 	err = notify_change(dentry, &newattrs);
+out_unlock:
 	mutex_unlock(&inode->i_mutex);
 	mnt_drop_write(file->f_path.mnt);
 out_putf:
@@ -646,11 +653,15 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
 	if (error)
 		goto dput_and_out;
 	mutex_lock(&inode->i_mutex);
+	error = security_path_chmod(path.dentry, path.mnt, mode);
+	if (error)
+		goto out_unlock;
 	if (mode == (mode_t) -1)
 		mode = inode->i_mode;
 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 	error = notify_change(path.dentry, &newattrs);
+out_unlock:
 	mutex_unlock(&inode->i_mutex);
 	mnt_drop_write(path.mnt);
 dput_and_out:
@@ -664,9 +675,9 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
 	return sys_fchmodat(AT_FDCWD, filename, mode);
 }
 
-static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
+static int chown_common(struct path *path, uid_t user, gid_t group)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = path->dentry->d_inode;
 	int error;
 	struct iattr newattrs;
 
@@ -683,7 +694,9 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
 		newattrs.ia_valid |=
 			ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
 	mutex_lock(&inode->i_mutex);
-	error = notify_change(dentry, &newattrs);
+	error = security_path_chown(path, user, group);
+	if (!error)
+		error = notify_change(path->dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
 
 	return error;
@@ -700,7 +713,7 @@ SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
 	error = mnt_want_write(path.mnt);
 	if (error)
 		goto out_release;
-	error = chown_common(path.dentry, user, group);
+	error = chown_common(&path, user, group);
 	mnt_drop_write(path.mnt);
 out_release:
 	path_put(&path);
@@ -725,7 +738,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
 	error = mnt_want_write(path.mnt);
 	if (error)
 		goto out_release;
-	error = chown_common(path.dentry, user, group);
+	error = chown_common(&path, user, group);
 	mnt_drop_write(path.mnt);
 out_release:
 	path_put(&path);
@@ -744,7 +757,7 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group
 	error = mnt_want_write(path.mnt);
 	if (error)
 		goto out_release;
-	error = chown_common(path.dentry, user, group);
+	error = chown_common(&path, user, group);
 	mnt_drop_write(path.mnt);
 out_release:
 	path_put(&path);
@@ -767,7 +780,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 		goto out_fput;
 	dentry = file->f_path.dentry;
 	audit_inode(NULL, dentry);
-	error = chown_common(dentry, user, group);
+	error = chown_common(&file->f_path, user, group);
 	mnt_drop_write(file->f_path.mnt);
 out_fput:
 	fput(file);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e10cba..64bc8998ac9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -226,6 +226,13 @@ ssize_t part_alignment_offset_show(struct device *dev,
 	return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
 }
 
+ssize_t part_discard_alignment_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	return sprintf(buf, "%u\n", p->discard_alignment);
+}
+
 ssize_t part_stat_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
 {
@@ -288,6 +295,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
+		   NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -300,6 +309,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_start.attr,
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
+	&dev_attr_discard_alignment.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -403,6 +413,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	p->start_sect = start;
 	p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
+	p->discard_alignment = queue_sector_discard_alignment(disk->queue,
+							      start);
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 038a6022152..49cfd5f5423 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -1,7 +1,9 @@
 /************************************************************
  * EFI GUID Partition Table handling
- * Per Intel EFI Specification v1.02
- * http://developer.intel.com/technology/efi/efi.htm
+ *
+ * http://www.uefi.org/specs/
+ * http://www.intel.com/technology/efi/
+ *
  * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
  *   Copyright 2000,2001,2002,2004 Dell Inc.
  *
@@ -92,6 +94,7 @@
  *
  ************************************************************/
 #include <linux/crc32.h>
+#include <linux/math64.h>
 #include "check.h"
 #include "efi.h"
 
@@ -141,7 +144,8 @@ last_lba(struct block_device *bdev)
 {
 	if (!bdev || !bdev->bd_inode)
 		return 0;
-	return (bdev->bd_inode->i_size >> 9) - 1ULL;
+	return div_u64(bdev->bd_inode->i_size,
+		       bdev_logical_block_size(bdev)) - 1ULL;
 }
 
 static inline int
@@ -188,6 +192,7 @@ static size_t
 read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
 {
 	size_t totalreadcount = 0;
+	sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
 
 	if (!bdev || !buffer || lba > last_lba(bdev))
                 return 0;
@@ -195,7 +200,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
 	while (count) {
 		int copied = 512;
 		Sector sect;
-		unsigned char *data = read_dev_sector(bdev, lba++, &sect);
+		unsigned char *data = read_dev_sector(bdev, n++, &sect);
 		if (!data)
 			break;
 		if (copied > count)
@@ -257,15 +262,16 @@ static gpt_header *
 alloc_read_gpt_header(struct block_device *bdev, u64 lba)
 {
 	gpt_header *gpt;
+	unsigned ssz = bdev_logical_block_size(bdev);
+
 	if (!bdev)
 		return NULL;
 
-	gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL);
+	gpt = kzalloc(ssz, GFP_KERNEL);
 	if (!gpt)
 		return NULL;
 
-	if (read_lba(bdev, lba, (u8 *) gpt,
-		     sizeof (gpt_header)) < sizeof (gpt_header)) {
+	if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
 		kfree(gpt);
                 gpt=NULL;
 		return NULL;
@@ -601,6 +607,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 	gpt_header *gpt = NULL;
 	gpt_entry *ptes = NULL;
 	u32 i;
+	unsigned ssz = bdev_logical_block_size(bdev) / 512;
 
 	if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
 		kfree(gpt);
@@ -611,13 +618,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 	pr_debug("GUID Partition Table is valid!  Yea!\n");
 
 	for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
+		u64 start = le64_to_cpu(ptes[i].starting_lba);
+		u64 size = le64_to_cpu(ptes[i].ending_lba) -
+			   le64_to_cpu(ptes[i].starting_lba) + 1ULL;
+
 		if (!is_pte_valid(&ptes[i], last_lba(bdev)))
 			continue;
 
-		put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba),
-				 (le64_to_cpu(ptes[i].ending_lba) -
-                                  le64_to_cpu(ptes[i].starting_lba) +
-				  1ULL));
+		put_partition(state, i+1, start * ssz, size * ssz);
 
 		/* If this is a RAID volume, tell md */
 		if (!efi_guidcmp(ptes[i].partition_type_guid,
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 2cc89d0475b..6998b589abf 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -37,7 +37,6 @@
 #define EFI_PMBR_OSTYPE_EFI 0xEF
 #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
 
-#define GPT_BLOCK_SIZE 512
 #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
 #define GPT_HEADER_REVISION_V1 0x00010000
 #define GPT_PRIMARY_PARTITION_TABLE_LBA 1
@@ -79,7 +78,12 @@ typedef struct _gpt_header {
 	__le32 num_partition_entries;
 	__le32 sizeof_partition_entry;
 	__le32 partition_entry_array_crc32;
-	u8 reserved2[GPT_BLOCK_SIZE - 92];
+
+	/* The rest of the logical block is reserved by UEFI and must be zero.
+	 * EFI standard handles this by:
+	 *
+	 * uint8_t		reserved2[ BlockSize - 92 ];
+	 */
 } __attribute__ ((packed)) gpt_header;
 
 typedef struct _gpt_entry_attributes {
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 822c2d50651..4badde179b1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -410,6 +410,16 @@ static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
 }
 #endif		/* CONFIG_MMU */
 
+static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
+{
+	seq_printf(m, "Cpus_allowed:\t");
+	seq_cpumask(m, &task->cpus_allowed);
+	seq_printf(m, "\n");
+	seq_printf(m, "Cpus_allowed_list:\t");
+	seq_cpumask_list(m, &task->cpus_allowed);
+	seq_printf(m, "\n");
+}
+
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
@@ -424,6 +434,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	}
 	task_sig(m, task);
 	task_cap(m, task);
+	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 #if defined(CONFIG_S390)
 	task_show_regs(m, task);
@@ -495,20 +506,17 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 		/* add up live thread stats at the group level */
 		if (whole) {
-			struct task_cputime cputime;
 			struct task_struct *t = task;
 			do {
 				min_flt += t->min_flt;
 				maj_flt += t->maj_flt;
-				gtime = cputime_add(gtime, task_gtime(t));
+				gtime = cputime_add(gtime, t->gtime);
 				t = next_thread(t);
 			} while (t != task);
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			thread_group_cputime(task, &cputime);
-			utime = cputime.utime;
-			stime = cputime.stime;
+			thread_group_times(task, &utime, &stime);
 			gtime = cputime_add(gtime, sig->gtime);
 		}
 
@@ -524,9 +532,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	if (!whole) {
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
-		utime = task_utime(task);
-		stime = task_stime(task);
-		gtime = task_gtime(task);
+		task_times(task, &utime, &stime);
+		gtime = task->gtime;
 	}
 
 	/* scale priority and nice values from timeslices to -20..20 */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index af643b5aefe..18d5cc62d8e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1265,6 +1265,72 @@ static const struct file_operations proc_pid_sched_operations = {
 
 #endif
 
+static ssize_t comm_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *offset)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct task_struct *p;
+	char buffer[TASK_COMM_LEN];
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	if (same_thread_group(current, p))
+		set_task_comm(p, buffer);
+	else
+		count = -EINVAL;
+
+	put_task_struct(p);
+
+	return count;
+}
+
+static int comm_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	task_lock(p);
+	seq_printf(m, "%s\n", p->comm);
+	task_unlock(p);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static int comm_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+
+	ret = single_open(filp, comm_show, NULL);
+	if (!ret) {
+		struct seq_file *m = filp->private_data;
+
+		m->private = inode;
+	}
+	return ret;
+}
+
+static const struct file_operations proc_pid_set_comm_operations = {
+	.open		= comm_open,
+	.read		= seq_read,
+	.write		= comm_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 /*
  * We added or removed a vma mapping the executable. The vmas are only mapped
  * during exec and are not mapped with the mmap system call.
@@ -2200,7 +2266,7 @@ static const struct inode_operations proc_attr_dir_inode_operations = {
 
 #endif
 
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+#ifdef CONFIG_ELF_CORE
 static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
 					 size_t count, loff_t *ppos)
 {
@@ -2504,6 +2570,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
+	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	INF("syscall",    S_IRUSR, proc_pid_syscall),
 #endif
@@ -2556,7 +2623,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
 #endif
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+#ifdef CONFIG_ELF_CORE
 	REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
@@ -2838,6 +2905,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
+	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	INF("syscall",   S_IRUSR, proc_pid_syscall),
 #endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index fa678abc9db..480cb1065ee 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -429,7 +429,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 			unsigned int ino;
 
 			ino = de->low_ino;
-			de_get(de);
+			pde_get(de);
 			spin_unlock(&proc_subdir_lock);
 			error = -EINVAL;
 			inode = proc_get_inode(dir->i_sb, ino, de);
@@ -445,7 +445,7 @@ out_unlock:
 		return NULL;
 	}
 	if (de)
-		de_put(de);
+		pde_put(de);
 	return ERR_PTR(error);
 }
 
@@ -509,17 +509,17 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
 				struct proc_dir_entry *next;
 
 				/* filldir passes info to user space */
-				de_get(de);
+				pde_get(de);
 				spin_unlock(&proc_subdir_lock);
 				if (filldir(dirent, de->name, de->namelen, filp->f_pos,
 					    de->low_ino, de->mode >> 12) < 0) {
-					de_put(de);
+					pde_put(de);
 					goto out;
 				}
 				spin_lock(&proc_subdir_lock);
 				filp->f_pos++;
 				next = de->next;
-				de_put(de);
+				pde_put(de);
 				de = next;
 			} while (de);
 			spin_unlock(&proc_subdir_lock);
@@ -763,7 +763,7 @@ out:
 	return NULL;
 }
 
-void free_proc_entry(struct proc_dir_entry *de)
+static void free_proc_entry(struct proc_dir_entry *de)
 {
 	unsigned int ino = de->low_ino;
 
@@ -777,6 +777,12 @@ void free_proc_entry(struct proc_dir_entry *de)
 	kfree(de);
 }
 
+void pde_put(struct proc_dir_entry *pde)
+{
+	if (atomic_dec_and_test(&pde->count))
+		free_proc_entry(pde);
+}
+
 /*
  * Remove a /proc entry and free it if it's not currently in use.
  */
@@ -845,6 +851,5 @@ continue_removing:
 	WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory "
 			"'%s/%s', leaking at least '%s'\n", __func__,
 			de->parent->name, de->name, de->subdir->name);
-	if (atomic_dec_and_test(&de->count))
-		free_proc_entry(de);
+	pde_put(de);
 }
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d78ade30554..445a02bcaab 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -24,29 +24,6 @@
 
 #include "internal.h"
 
-struct proc_dir_entry *de_get(struct proc_dir_entry *de)
-{
-	atomic_inc(&de->count);
-	return de;
-}
-
-/*
- * Decrements the use count and checks for deferred deletion.
- */
-void de_put(struct proc_dir_entry *de)
-{
-	if (!atomic_read(&de->count)) {
-		printk("de_put: entry %s already free!\n", de->name);
-		return;
-	}
-
-	if (atomic_dec_and_test(&de->count))
-		free_proc_entry(de);
-}
-
-/*
- * Decrement the use count of the proc_dir_entry.
- */
 static void proc_delete_inode(struct inode *inode)
 {
 	struct proc_dir_entry *de;
@@ -59,7 +36,7 @@ static void proc_delete_inode(struct inode *inode)
 	/* Let go of any associated proc directory entry */
 	de = PROC_I(inode)->pde;
 	if (de)
-		de_put(de);
+		pde_put(de);
 	if (PROC_I(inode)->sysctl)
 		sysctl_head_put(PROC_I(inode)->sysctl);
 	clear_inode(inode);
@@ -480,7 +457,7 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
 		}
 		unlock_new_inode(inode);
 	} else
-	       de_put(de);
+	       pde_put(de);
 	return inode;
 }			
 
@@ -495,7 +472,7 @@ int proc_fill_super(struct super_block *s)
 	s->s_op = &proc_sops;
 	s->s_time_gran = 1;
 	
-	de_get(&proc_root);
+	pde_get(&proc_root);
 	root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
 	if (!root_inode)
 		goto out_no_root;
@@ -509,6 +486,6 @@ int proc_fill_super(struct super_block *s)
 out_no_root:
 	printk("proc_read_super: get root inode failed\n");
 	iput(root_inode);
-	de_put(&proc_root);
+	pde_put(&proc_root);
 	return -ENOMEM;
 }
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 753ca37002c..1f24a3eddd1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,8 +61,6 @@ extern const struct file_operations proc_pagemap_operations;
 extern const struct file_operations proc_net_operations;
 extern const struct inode_operations proc_net_inode_operations;
 
-void free_proc_entry(struct proc_dir_entry *de);
-
 void proc_init_inodecache(void);
 
 static inline struct pid *proc_pid(struct inode *inode)
@@ -101,8 +99,12 @@ unsigned long task_vsize(struct mm_struct *);
 int task_statm(struct mm_struct *, int *, int *, int *, int *);
 void task_mem(struct seq_file *, struct mm_struct *);
 
-struct proc_dir_entry *de_get(struct proc_dir_entry *de);
-void de_put(struct proc_dir_entry *de);
+static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
+{
+	atomic_inc(&pde->count);
+	return pde;
+}
+void pde_put(struct proc_dir_entry *pde);
 
 extern struct vfsmount *proc_mnt;
 int proc_fill_super(struct super_block *);
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 7ba79a54948..123257bb356 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <asm/prom.h>
@@ -25,26 +26,27 @@ static struct proc_dir_entry *proc_device_tree;
 /*
  * Supply data on a read from /proc/device-tree/node/property.
  */
-static int property_read_proc(char *page, char **start, off_t off,
-			      int count, int *eof, void *data)
+static int property_proc_show(struct seq_file *m, void *v)
 {
-	struct property *pp = data;
-	int n;
+	struct property *pp = m->private;
 
-	if (off >= pp->length) {
-		*eof = 1;
-		return 0;
-	}
-	n = pp->length - off;
-	if (n > count)
-		n = count;
-	else
-		*eof = 1;
-	memcpy(page, (char *)pp->value + off, n);
-	*start = page;
-	return n;
+	seq_write(m, pp->value, pp->length);
+	return 0;
+}
+
+static int property_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, property_proc_show, PDE(inode)->data);
 }
 
+static const struct file_operations property_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= property_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 /*
  * For a node with a name like "gc@10", we make symlinks called "gc"
  * and "@10" to it.
@@ -63,10 +65,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
 	 * Unfortunately proc_register puts each new entry
 	 * at the beginning of the list.  So we rearrange them.
 	 */
-	ent = create_proc_read_entry(name,
-				     strncmp(name, "security-", 9)
-				     ? S_IRUGO : S_IRUSR, de,
-				     property_read_proc, pp);
+	ent = proc_create_data(name,
+			       strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR,
+			       de, &property_proc_fops, pp);
 	if (ent == NULL)
 		return NULL;
 
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f667e8aeabd..6ff9981f0a1 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -48,7 +48,7 @@ out:
 static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
 {
 	int len;
-	for ( ; p->ctl_name || p->procname; p++) {
+	for ( ; p->procname; p++) {
 
 		if (!p->procname)
 			continue;
@@ -218,7 +218,7 @@ static int scan(struct ctl_table_header *head, ctl_table *table,
 		void *dirent, filldir_t filldir)
 {
 
-	for (; table->ctl_name || table->procname; table++, (*pos)++) {
+	for (; table->procname; table++, (*pos)++) {
 		int res;
 
 		/* Can't do anything without a proc name */
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7cc726c6d70..b9b7aad2003 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -27,7 +27,7 @@ static int show_stat(struct seq_file *p, void *v)
 	int i, j;
 	unsigned long jif;
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-	cputime64_t guest;
+	cputime64_t guest, guest_nice;
 	u64 sum = 0;
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
@@ -36,7 +36,7 @@ static int show_stat(struct seq_file *p, void *v)
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
-	guest = cputime64_zero;
+	guest = guest_nice = cputime64_zero;
 	getboottime(&boottime);
 	jif = boottime.tv_sec;
 
@@ -51,6 +51,8 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+		guest_nice = cputime64_add(guest_nice,
+			kstat_cpu(i).cpustat.guest_nice);
 		for_each_irq_nr(j) {
 			sum += kstat_irqs_cpu(j, i);
 		}
@@ -65,7 +67,8 @@ static int show_stat(struct seq_file *p, void *v)
 	}
 	sum += arch_irq_stat();
 
-	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+		"%llu\n",
 		(unsigned long long)cputime64_to_clock_t(user),
 		(unsigned long long)cputime64_to_clock_t(nice),
 		(unsigned long long)cputime64_to_clock_t(system),
@@ -74,7 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
 		(unsigned long long)cputime64_to_clock_t(irq),
 		(unsigned long long)cputime64_to_clock_t(softirq),
 		(unsigned long long)cputime64_to_clock_t(steal),
-		(unsigned long long)cputime64_to_clock_t(guest));
+		(unsigned long long)cputime64_to_clock_t(guest),
+		(unsigned long long)cputime64_to_clock_t(guest_nice));
 	for_each_online_cpu(i) {
 
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -88,8 +92,10 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = kstat_cpu(i).cpustat.softirq;
 		steal = kstat_cpu(i).cpustat.steal;
 		guest = kstat_cpu(i).cpustat.guest;
+		guest_nice = kstat_cpu(i).cpustat.guest_nice;
 		seq_printf(p,
-			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+			"%llu\n",
 			i,
 			(unsigned long long)cputime64_to_clock_t(user),
 			(unsigned long long)cputime64_to_clock_t(nice),
@@ -99,7 +105,8 @@ static int show_stat(struct seq_file *p, void *v)
 			(unsigned long long)cputime64_to_clock_t(irq),
 			(unsigned long long)cputime64_to_clock_t(softirq),
 			(unsigned long long)cputime64_to_clock_t(steal),
-			(unsigned long long)cputime64_to_clock_t(guest));
+			(unsigned long long)cputime64_to_clock_t(guest),
+			(unsigned long long)cputime64_to_clock_t(guest_nice));
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2a1bef9203c..47c03f4336b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -650,6 +650,50 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	return err;
 }
 
+static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
+{
+	u64 pme = 0;
+	if (pte_present(pte))
+		pme = PM_PFRAME(pte_pfn(pte) + offset)
+			| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
+	return pme;
+}
+
+static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
+				 unsigned long end, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma;
+	struct pagemapread *pm = walk->private;
+	struct hstate *hs = NULL;
+	int err = 0;
+
+	vma = find_vma(walk->mm, addr);
+	if (vma)
+		hs = hstate_vma(vma);
+	for (; addr != end; addr += PAGE_SIZE) {
+		u64 pfn = PM_NOT_PRESENT;
+
+		if (vma && (addr >= vma->vm_end)) {
+			vma = find_vma(walk->mm, addr);
+			if (vma)
+				hs = hstate_vma(vma);
+		}
+
+		if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
+			/* calculate pfn of the "raw" page in the hugepage. */
+			int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
+			pfn = huge_pte_to_pagemap_entry(*pte, offset);
+		}
+		err = add_to_pagemap(addr, pfn, pm);
+		if (err)
+			return err;
+	}
+
+	cond_resched();
+
+	return err;
+}
+
 /*
  * /proc/pid/pagemap - an array mapping virtual pages to pfns
  *
@@ -742,6 +786,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 
 	pagemap_walk.pmd_entry = pagemap_pte_range;
 	pagemap_walk.pte_hole = pagemap_pte_hole;
+	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
 	pagemap_walk.mm = mm;
 	pagemap_walk.private = &pm;
 
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 8f5c05d3dbd..5d9fd64ef81 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -110,9 +110,13 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 		}
 	}
 
-	size += (*text = mm->end_code - mm->start_code);
-	size += (*data = mm->start_stack - mm->start_data);
+	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
+		>> PAGE_SHIFT;
+	*data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK))
+		>> PAGE_SHIFT;
 	up_read(&mm->mmap_sem);
+	size >>= PAGE_SHIFT;
+	size += *text + *data;
 	*resident = size;
 	return size;
 }
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 0afba069d56..22e0d60e53e 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -17,13 +17,6 @@
 #include <linux/bitops.h>
 #include "qnx4.h"
 
-#if 0
-int qnx4_new_block(struct super_block *sb)
-{
-	return 0;
-}
-#endif  /*  0  */
-
 static void count_bits(register const char *bmPart, register int size,
 		       int *const tf)
 {
@@ -35,22 +28,7 @@ static void count_bits(register const char *bmPart, register int size,
 	}
 	do {
 		b = *bmPart++;
-		if ((b & 1) == 0)
-			tot++;
-		if ((b & 2) == 0)
-			tot++;
-		if ((b & 4) == 0)
-			tot++;
-		if ((b & 8) == 0)
-			tot++;
-		if ((b & 16) == 0)
-			tot++;
-		if ((b & 32) == 0)
-			tot++;
-		if ((b & 64) == 0)
-			tot++;
-		if ((b & 128) == 0)
-			tot++;
+		tot += 8 - hweight8(b);
 		size--;
 	} while (size != 0);
 	*tf = tot;
@@ -67,7 +45,7 @@ unsigned long qnx4_count_free_blocks(struct super_block *sb)
 
 	while (total < size) {
 		if ((bh = sb_bread(sb, start + offset)) == NULL) {
-			printk("qnx4: I/O error in counting free blocks\n");
+			printk(KERN_ERR "qnx4: I/O error in counting free blocks\n");
 			break;
 		}
 		count_bits(bh->b_data, size - total, &total_free);
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 86cc39cb139..6f30c3d5bcb 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -26,8 +26,8 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int ix, ino;
 	int size;
 
-	QNX4DEBUG(("qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
-	QNX4DEBUG(("filp->f_pos         = %ld\n", (long) filp->f_pos));
+	QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
+	QNX4DEBUG((KERN_INFO "filp->f_pos         = %ld\n", (long) filp->f_pos));
 
 	lock_kernel();
 
@@ -50,7 +50,7 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
 					size = QNX4_NAME_MAX;
 
 				if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) {
-					QNX4DEBUG(("qnx4_readdir:%.*s\n", size, de->di_fname));
+					QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
 					if ( ( de->di_status & QNX4_FILE_LINK ) == 0 )
 						ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
 					else {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index d2cd1798d8c..ebf3440d28c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -64,25 +64,7 @@ static struct buffer_head *qnx4_getblk(struct inode *inode, int nr,
 		result = sb_getblk(inode->i_sb, nr);
 		return result;
 	}
-	if (!create) {
-		return NULL;
-	}
-#if 0
-	tmp = qnx4_new_block(inode->i_sb);
-	if (!tmp) {
-		return NULL;
-	}
-	result = sb_getblk(inode->i_sb, tmp);
-	if (tst) {
-		qnx4_free_block(inode->i_sb, tmp);
-		brelse(result);
-		goto repeat;
-	}
-	tst = tmp;
-#endif
-	inode->i_ctime = CURRENT_TIME_SEC;
-	mark_inode_dirty(inode);
-	return result;
+	return NULL;
 }
 
 struct buffer_head *qnx4_bread(struct inode *inode, int block, int create)
@@ -107,14 +89,12 @@ static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_h
 {
 	unsigned long phys;
 
-	QNX4DEBUG(("qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock));
+	QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock));
 
 	phys = qnx4_block_map( inode, iblock );
 	if ( phys ) {
 		// logical block is before EOF
 		map_bh(bh, inode->i_sb, phys);
-	} else if ( create ) {
-		// to be done.
 	}
 	return 0;
 }
@@ -142,12 +122,12 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
 				// read next xtnt block.
 				bh = sb_bread(inode->i_sb, i_xblk - 1);
 				if ( !bh ) {
-					QNX4DEBUG(("qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1));
+					QNX4DEBUG((KERN_ERR "qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1));
 					return -EIO;
 				}
 				xblk = (struct qnx4_xblk*)bh->b_data;
 				if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) {
-					QNX4DEBUG(("qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk));
+					QNX4DEBUG((KERN_ERR "qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk));
 					return -EIO;
 				}
 			}
@@ -168,7 +148,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
 			brelse( bh );
 	}
 
-	QNX4DEBUG(("qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block));
+	QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block));
 	return block;
 }
 
@@ -209,7 +189,7 @@ static const char *qnx4_checkroot(struct super_block *sb)
 	if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') {
 		return "no qnx4 filesystem (no root dir).";
 	} else {
-		QNX4DEBUG(("QNX4 filesystem found on dev %s.\n", sb->s_id));
+		QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
 		rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
 		rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
 		for (j = 0; j < rl; j++) {
@@ -220,7 +200,7 @@ static const char *qnx4_checkroot(struct super_block *sb)
 			for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) {
 				rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
 				if (rootdir->di_fname != NULL) {
-					QNX4DEBUG(("Rootdir entry found : [%s]\n", rootdir->di_fname));
+					QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
 					if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) {
 						found = 1;
 						qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
@@ -265,12 +245,12 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	   if we don't belong here... */
 	bh = sb_bread(s, 1);
 	if (!bh) {
-		printk("qnx4: unable to read the superblock\n");
+		printk(KERN_ERR "qnx4: unable to read the superblock\n");
 		goto outnobh;
 	}
 	if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) {
 		if (!silent)
-			printk("qnx4: wrong fsid in superblock.\n");
+			printk(KERN_ERR "qnx4: wrong fsid in superblock.\n");
 		goto out;
 	}
 	s->s_op = &qnx4_sops;
@@ -284,14 +264,14 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	errmsg = qnx4_checkroot(s);
 	if (errmsg != NULL) {
  		if (!silent)
- 			printk("qnx4: %s\n", errmsg);
+			printk(KERN_ERR "qnx4: %s\n", errmsg);
 		goto out;
 	}
 
  	/* does root not have inode number QNX4_ROOT_INO ?? */
 	root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK);
 	if (IS_ERR(root)) {
- 		printk("qnx4: get inode failed\n");
+		printk(KERN_ERR "qnx4: get inode failed\n");
 		ret = PTR_ERR(root);
  		goto out;
  	}
@@ -374,7 +354,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
 	qnx4_inode = qnx4_raw_inode(inode);
 	inode->i_mode = 0;
 
-	QNX4DEBUG(("Reading inode : [%d]\n", ino));
+	QNX4DEBUG((KERN_INFO "reading inode : [%d]\n", ino));
 	if (!ino) {
 		printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is "
 				"out of range\n",
@@ -385,7 +365,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
 	block = ino / QNX4_INODES_PER_BLOCK;
 
 	if (!(bh = sb_bread(sb, block))) {
-		printk("qnx4: major problem: unable to read inode from dev "
+		printk(KERN_ERR "qnx4: major problem: unable to read inode from dev "
 		       "%s\n", sb->s_id);
 		iget_failed(inode);
 		return ERR_PTR(-EIO);
@@ -499,7 +479,7 @@ static int __init init_qnx4_fs(void)
 		return err;
 	}
 
-	printk("QNX4 filesystem 0.2.3 registered.\n");
+	printk(KERN_INFO "QNX4 filesystem 0.2.3 registered.\n");
 	return 0;
 }
 
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index ae1e7edbacd..58703ebba87 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -30,7 +30,7 @@ static int qnx4_match(int len, const char *name,
 	int namelen, thislen;
 
 	if (bh == NULL) {
-		printk("qnx4: matching unassigned buffer !\n");
+		printk(KERN_WARNING "qnx4: matching unassigned buffer !\n");
 		return 0;
 	}
 	de = (struct qnx4_inode_entry *) (bh->b_data + *offset);
@@ -66,7 +66,7 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,
 
 	*res_dir = NULL;
 	if (!dir->i_sb) {
-		printk("qnx4: no superblock on dir.\n");
+		printk(KERN_WARNING "qnx4: no superblock on dir.\n");
 		return NULL;
 	}
 	bh = NULL;
@@ -124,7 +124,7 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
 	foundinode = qnx4_iget(dir->i_sb, ino);
 	if (IS_ERR(foundinode)) {
 		unlock_kernel();
-		QNX4DEBUG(("qnx4: lookup->iget -> error %ld\n",
+		QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
 			   PTR_ERR(foundinode)));
 		return ERR_CAST(foundinode);
 	}
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 8047e01ef46..efc02ebb8c7 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -17,7 +17,7 @@ config QUOTA
 
 config QUOTA_NETLINK_INTERFACE
 	bool "Report quota messages through netlink interface"
-	depends on QUOTA && NET
+	depends on QUOTACTL && NET
 	help
 	  If you say Y here, quota warnings (about exceeding softlimit, reaching
 	  hardlimit, etc.) will be reported through netlink interface. If unsure,
@@ -46,12 +46,14 @@ config QFMT_V1
 	  format say Y here.
 
 config QFMT_V2
-	tristate "Quota format v2 support"
+	tristate "Quota format vfsv0 and vfsv1 support"
 	depends on QUOTA
 	select QUOTA_TREE
 	help
-	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
-	  need this functionality say Y here.
+	  This config option enables kernel support for vfsv0 and vfsv1 quota
+	  formats. Both these formats support 32-bit UIDs/GIDs and vfsv1 format
+	  also supports 64-bit inode and block quota limits. If you need this
+	  functionality say Y here.
 
 config QUOTACTL
 	bool
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 39b49c42a7e..cd6bb9a33c1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -77,10 +77,6 @@
 #include <linux/capability.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h> /* for inode_lock, oddly enough.. */
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-#include <net/netlink.h>
-#include <net/genetlink.h>
-#endif
 
 #include <asm/uaccess.h>
 
@@ -1071,73 +1067,6 @@ static void print_warning(struct dquot *dquot, const int warntype)
 }
 #endif
 
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-
-/* Netlink family structure for quota */
-static struct genl_family quota_genl_family = {
-	.id = GENL_ID_GENERATE,
-	.hdrsize = 0,
-	.name = "VFS_DQUOT",
-	.version = 1,
-	.maxattr = QUOTA_NL_A_MAX,
-};
-
-/* Send warning to userspace about user which exceeded quota */
-static void send_warning(const struct dquot *dquot, const char warntype)
-{
-	static atomic_t seq;
-	struct sk_buff *skb;
-	void *msg_head;
-	int ret;
-	int msg_size = 4 * nla_total_size(sizeof(u32)) +
-		       2 * nla_total_size(sizeof(u64));
-
-	/* We have to allocate using GFP_NOFS as we are called from a
-	 * filesystem performing write and thus further recursion into
-	 * the fs to free some data could cause deadlocks. */
-	skb = genlmsg_new(msg_size, GFP_NOFS);
-	if (!skb) {
-		printk(KERN_ERR
-		  "VFS: Not enough memory to send quota warning.\n");
-		return;
-	}
-	msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
-			&quota_genl_family, 0, QUOTA_NL_C_WARNING);
-	if (!msg_head) {
-		printk(KERN_ERR
-		  "VFS: Cannot store netlink header in quota warning.\n");
-		goto err_out;
-	}
-	ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, dquot->dq_type);
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, dquot->dq_id);
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR,
-		MAJOR(dquot->dq_sb->s_dev));
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR,
-		MINOR(dquot->dq_sb->s_dev));
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
-	if (ret)
-		goto attr_err_out;
-	genlmsg_end(skb, msg_head);
-
-	genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
-	return;
-attr_err_out:
-	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
-err_out:
-	kfree_skb(skb);
-}
-#endif
 /*
  * Write warnings to the console and send warning messages over netlink.
  *
@@ -1145,18 +1074,20 @@ err_out:
  */
 static void flush_warnings(struct dquot *const *dquots, char *warntype)
 {
+	struct dquot *dq;
 	int i;
 
-	for (i = 0; i < MAXQUOTAS; i++)
-		if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN &&
-		    !warning_issued(dquots[i], warntype[i])) {
+	for (i = 0; i < MAXQUOTAS; i++) {
+		dq = dquots[i];
+		if (dq && warntype[i] != QUOTA_NL_NOWARN &&
+		    !warning_issued(dq, warntype[i])) {
 #ifdef CONFIG_PRINT_QUOTA_WARNING
-			print_warning(dquots[i], warntype[i]);
-#endif
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-			send_warning(dquots[i], warntype[i]);
+			print_warning(dq, warntype[i]);
 #endif
+			quota_send_warning(dq->dq_type, dq->dq_id,
+					   dq->dq_sb->s_dev, warntype[i]);
 		}
+	}
 }
 
 static int ignore_hardlimit(struct dquot *dquot)
@@ -2233,7 +2164,9 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
 	struct dentry *dentry;
 	int error;
 
+	mutex_lock(&sb->s_root->d_inode->i_mutex);
 	dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
+	mutex_unlock(&sb->s_root->d_inode->i_mutex);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
@@ -2473,100 +2406,89 @@ const struct quotactl_ops vfs_quotactl_ops = {
 
 static ctl_table fs_dqstats_table[] = {
 	{
-		.ctl_name	= FS_DQ_LOOKUPS,
 		.procname	= "lookups",
 		.data		= &dqstats.lookups,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= FS_DQ_DROPS,
 		.procname	= "drops",
 		.data		= &dqstats.drops,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= FS_DQ_READS,
 		.procname	= "reads",
 		.data		= &dqstats.reads,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= FS_DQ_WRITES,
 		.procname	= "writes",
 		.data		= &dqstats.writes,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= FS_DQ_CACHE_HITS,
 		.procname	= "cache_hits",
 		.data		= &dqstats.cache_hits,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= FS_DQ_ALLOCATED,
 		.procname	= "allocated_dquots",
 		.data		= &dqstats.allocated_dquots,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= FS_DQ_FREE,
 		.procname	= "free_dquots",
 		.data		= &dqstats.free_dquots,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= FS_DQ_SYNCS,
 		.procname	= "syncs",
 		.data		= &dqstats.syncs,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #ifdef CONFIG_PRINT_QUOTA_WARNING
 	{
-		.ctl_name	= FS_DQ_WARNINGS,
 		.procname	= "warnings",
 		.data		= &flag_print_warnings,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
-	{ .ctl_name = 0 },
+	{ },
 };
 
 static ctl_table fs_table[] = {
 	{
-		.ctl_name	= FS_DQSTATS,
 		.procname	= "quota",
 		.mode		= 0555,
 		.child		= fs_dqstats_table,
 	},
-	{ .ctl_name = 0 },
+	{ },
 };
 
 static ctl_table sys_table[] = {
 	{
-		.ctl_name	= CTL_FS,
 		.procname	= "fs",
 		.mode		= 0555,
 		.child		= fs_table,
 	},
-	{ .ctl_name = 0 },
+	{ },
 };
 
 static int __init dquot_init(void)
@@ -2607,12 +2529,6 @@ static int __init dquot_init(void)
 
 	register_shrinker(&dqcache_shrinker);
 
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-	if (genl_register_family(&quota_genl_family) != 0)
-		printk(KERN_ERR
-		       "VFS: Failed to create quota netlink interface.\n");
-#endif
-
 	return 0;
 }
 module_init(dquot_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95c5b42384b..ee91e275695 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -18,6 +18,8 @@
 #include <linux/capability.h>
 #include <linux/quotaops.h>
 #include <linux/types.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
 
 /* Check validity of generic quotactl commands */
 static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
@@ -525,3 +527,94 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
 	return ret;
 }
 #endif
+
+
+#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
+
+/* Netlink family structure for quota */
+static struct genl_family quota_genl_family = {
+	.id = GENL_ID_GENERATE,
+	.hdrsize = 0,
+	.name = "VFS_DQUOT",
+	.version = 1,
+	.maxattr = QUOTA_NL_A_MAX,
+};
+
+/**
+ * quota_send_warning - Send warning to userspace about exceeded quota
+ * @type: The quota type: USRQQUOTA, GRPQUOTA,...
+ * @id: The user or group id of the quota that was exceeded
+ * @dev: The device on which the fs is mounted (sb->s_dev)
+ * @warntype: The type of the warning: QUOTA_NL_...
+ *
+ * This can be used by filesystems (including those which don't use
+ * dquot) to send a message to userspace relating to quota limits.
+ *
+ */
+
+void quota_send_warning(short type, unsigned int id, dev_t dev,
+			const char warntype)
+{
+	static atomic_t seq;
+	struct sk_buff *skb;
+	void *msg_head;
+	int ret;
+	int msg_size = 4 * nla_total_size(sizeof(u32)) +
+		       2 * nla_total_size(sizeof(u64));
+
+	/* We have to allocate using GFP_NOFS as we are called from a
+	 * filesystem performing write and thus further recursion into
+	 * the fs to free some data could cause deadlocks. */
+	skb = genlmsg_new(msg_size, GFP_NOFS);
+	if (!skb) {
+		printk(KERN_ERR
+		  "VFS: Not enough memory to send quota warning.\n");
+		return;
+	}
+	msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
+			&quota_genl_family, 0, QUOTA_NL_C_WARNING);
+	if (!msg_head) {
+		printk(KERN_ERR
+		  "VFS: Cannot store netlink header in quota warning.\n");
+		goto err_out;
+	}
+	ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
+	if (ret)
+		goto attr_err_out;
+	genlmsg_end(skb, msg_head);
+
+	genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
+	return;
+attr_err_out:
+	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
+err_out:
+	kfree_skb(skb);
+}
+EXPORT_SYMBOL(quota_send_warning);
+
+static int __init quota_init(void)
+{
+	if (genl_register_family(&quota_genl_family) != 0)
+		printk(KERN_ERR
+		       "VFS: Failed to create quota netlink interface.\n");
+	return 0;
+};
+
+module_init(quota_init);
+#endif
+
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 0edcf42b177..2ae757e9c00 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -204,7 +204,7 @@ out:
 	return ret;
 }
 
-static struct quota_format_ops v1_format_ops = {
+static const struct quota_format_ops v1_format_ops = {
 	.check_quota_file	= v1_check_quota_file,
 	.read_file_info		= v1_read_file_info,
 	.write_file_info	= v1_write_file_info,
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index a5475fb1ae4..3dfc23e0213 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -23,14 +23,23 @@ MODULE_LICENSE("GPL");
 
 #define __QUOTA_V2_PARANOIA
 
-static void v2_mem2diskdqb(void *dp, struct dquot *dquot);
-static void v2_disk2memdqb(struct dquot *dquot, void *dp);
-static int v2_is_id(void *dp, struct dquot *dquot);
-
-static struct qtree_fmt_operations v2_qtree_ops = {
-	.mem2disk_dqblk = v2_mem2diskdqb,
-	.disk2mem_dqblk = v2_disk2memdqb,
-	.is_id = v2_is_id,
+static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot);
+static void v2r0_disk2memdqb(struct dquot *dquot, void *dp);
+static int v2r0_is_id(void *dp, struct dquot *dquot);
+static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot);
+static void v2r1_disk2memdqb(struct dquot *dquot, void *dp);
+static int v2r1_is_id(void *dp, struct dquot *dquot);
+
+static struct qtree_fmt_operations v2r0_qtree_ops = {
+	.mem2disk_dqblk = v2r0_mem2diskdqb,
+	.disk2mem_dqblk = v2r0_disk2memdqb,
+	.is_id = v2r0_is_id,
+};
+
+static struct qtree_fmt_operations v2r1_qtree_ops = {
+	.mem2disk_dqblk = v2r1_mem2diskdqb,
+	.disk2mem_dqblk = v2r1_disk2memdqb,
+	.is_id = v2r1_is_id,
 };
 
 #define QUOTABLOCK_BITS 10
@@ -46,23 +55,33 @@ static inline qsize_t v2_qbtos(qsize_t blocks)
 	return blocks << QUOTABLOCK_BITS;
 }
 
+static int v2_read_header(struct super_block *sb, int type,
+			  struct v2_disk_dqheader *dqhead)
+{
+	ssize_t size;
+
+	size = sb->s_op->quota_read(sb, type, (char *)dqhead,
+				    sizeof(struct v2_disk_dqheader), 0);
+	if (size != sizeof(struct v2_disk_dqheader)) {
+		printk(KERN_WARNING "quota_v2: Failed header read:"
+		       " expected=%zd got=%zd\n",
+			sizeof(struct v2_disk_dqheader), size);
+		return 0;
+	}
+	return 1;
+}
+
 /* Check whether given file is really vfsv0 quotafile */
 static int v2_check_quota_file(struct super_block *sb, int type)
 {
 	struct v2_disk_dqheader dqhead;
-	ssize_t size;
 	static const uint quota_magics[] = V2_INITQMAGICS;
 	static const uint quota_versions[] = V2_INITQVERSIONS;
  
-	size = sb->s_op->quota_read(sb, type, (char *)&dqhead,
-				    sizeof(struct v2_disk_dqheader), 0);
-	if (size != sizeof(struct v2_disk_dqheader)) {
-		printk("quota_v2: failed read expected=%zd got=%zd\n",
-			sizeof(struct v2_disk_dqheader), size);
+	if (!v2_read_header(sb, type, &dqhead))
 		return 0;
-	}
 	if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
-	    le32_to_cpu(dqhead.dqh_version) != quota_versions[type])
+	    le32_to_cpu(dqhead.dqh_version) > quota_versions[type])
 		return 0;
 	return 1;
 }
@@ -71,14 +90,20 @@ static int v2_check_quota_file(struct super_block *sb, int type)
 static int v2_read_file_info(struct super_block *sb, int type)
 {
 	struct v2_disk_dqinfo dinfo;
+	struct v2_disk_dqheader dqhead;
 	struct mem_dqinfo *info = sb_dqinfo(sb, type);
 	struct qtree_mem_dqinfo *qinfo;
 	ssize_t size;
+	unsigned int version;
+
+	if (!v2_read_header(sb, type, &dqhead))
+		return 0;
+	version = le32_to_cpu(dqhead.dqh_version);
 
 	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
-		printk(KERN_WARNING "Can't read info structure on device %s.\n",
+		printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
 			sb->s_id);
 		return -1;
 	}
@@ -89,9 +114,15 @@ static int v2_read_file_info(struct super_block *sb, int type)
 		return -1;
 	}
 	qinfo = info->dqi_priv;
-	/* limits are stored as unsigned 32-bit data */
-	info->dqi_maxblimit = 0xffffffff;
-	info->dqi_maxilimit = 0xffffffff;
+	if (version == 0) {
+		/* limits are stored as unsigned 32-bit data */
+		info->dqi_maxblimit = 0xffffffff;
+		info->dqi_maxilimit = 0xffffffff;
+	} else {
+		/* used space is stored as unsigned 64-bit value */
+		info->dqi_maxblimit = 0xffffffffffffffff;	/* 2^64-1 */
+		info->dqi_maxilimit = 0xffffffffffffffff;
+	}
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
 	info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
@@ -103,8 +134,13 @@ static int v2_read_file_info(struct super_block *sb, int type)
 	qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
 	qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
 	qinfo->dqi_qtree_depth = qtree_depth(qinfo);
-	qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk);
-	qinfo->dqi_ops = &v2_qtree_ops;
+	if (version == 0) {
+		qinfo->dqi_entry_size = sizeof(struct v2r0_disk_dqblk);
+		qinfo->dqi_ops = &v2r0_qtree_ops;
+	} else {
+		qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk);
+		qinfo->dqi_ops = &v2r1_qtree_ops;
+	}
 	return 0;
 }
 
@@ -135,9 +171,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	return 0;
 }
 
-static void v2_disk2memdqb(struct dquot *dquot, void *dp)
+static void v2r0_disk2memdqb(struct dquot *dquot, void *dp)
 {
-	struct v2_disk_dqblk *d = dp, empty;
+	struct v2r0_disk_dqblk *d = dp, empty;
 	struct mem_dqblk *m = &dquot->dq_dqb;
 
 	m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
@@ -149,15 +185,15 @@ static void v2_disk2memdqb(struct dquot *dquot, void *dp)
 	m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
 	m->dqb_btime = le64_to_cpu(d->dqb_btime);
 	/* We need to escape back all-zero structure */
-	memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+	memset(&empty, 0, sizeof(struct v2r0_disk_dqblk));
 	empty.dqb_itime = cpu_to_le64(1);
-	if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk)))
+	if (!memcmp(&empty, dp, sizeof(struct v2r0_disk_dqblk)))
 		m->dqb_itime = 0;
 }
 
-static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
+static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot)
 {
-	struct v2_disk_dqblk *d = dp;
+	struct v2r0_disk_dqblk *d = dp;
 	struct mem_dqblk *m = &dquot->dq_dqb;
 	struct qtree_mem_dqinfo *info =
 			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
@@ -175,9 +211,60 @@ static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
 		d->dqb_itime = cpu_to_le64(1);
 }
 
-static int v2_is_id(void *dp, struct dquot *dquot)
+static int v2r0_is_id(void *dp, struct dquot *dquot)
+{
+	struct v2r0_disk_dqblk *d = dp;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+	if (qtree_entry_unused(info, dp))
+		return 0;
+	return le32_to_cpu(d->dqb_id) == dquot->dq_id;
+}
+
+static void v2r1_disk2memdqb(struct dquot *dquot, void *dp)
+{
+	struct v2r1_disk_dqblk *d = dp, empty;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
+	m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
+	m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
+	m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
+	m->dqb_itime = le64_to_cpu(d->dqb_itime);
+	m->dqb_bhardlimit = v2_qbtos(le64_to_cpu(d->dqb_bhardlimit));
+	m->dqb_bsoftlimit = v2_qbtos(le64_to_cpu(d->dqb_bsoftlimit));
+	m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+	m->dqb_btime = le64_to_cpu(d->dqb_btime);
+	/* We need to escape back all-zero structure */
+	memset(&empty, 0, sizeof(struct v2r1_disk_dqblk));
+	empty.dqb_itime = cpu_to_le64(1);
+	if (!memcmp(&empty, dp, sizeof(struct v2r1_disk_dqblk)))
+		m->dqb_itime = 0;
+}
+
+static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot)
+{
+	struct v2r1_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+	d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
+	d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
+	d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
+	d->dqb_itime = cpu_to_le64(m->dqb_itime);
+	d->dqb_bhardlimit = cpu_to_le64(v2_stoqb(m->dqb_bhardlimit));
+	d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit));
+	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
+	d->dqb_btime = cpu_to_le64(m->dqb_btime);
+	d->dqb_id = cpu_to_le32(dquot->dq_id);
+	if (qtree_entry_unused(info, dp))
+		d->dqb_itime = cpu_to_le64(1);
+}
+
+static int v2r1_is_id(void *dp, struct dquot *dquot)
 {
-	struct v2_disk_dqblk *d = dp;
+	struct v2r1_disk_dqblk *d = dp;
 	struct qtree_mem_dqinfo *info =
 			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
 
@@ -207,7 +294,7 @@ static int v2_free_file_info(struct super_block *sb, int type)
 	return 0;
 }
 
-static struct quota_format_ops v2_format_ops = {
+static const struct quota_format_ops v2_format_ops = {
 	.check_quota_file	= v2_check_quota_file,
 	.read_file_info		= v2_read_file_info,
 	.write_file_info	= v2_write_file_info,
@@ -217,20 +304,32 @@ static struct quota_format_ops v2_format_ops = {
 	.release_dqblk		= v2_release_dquot,
 };
 
-static struct quota_format_type v2_quota_format = {
+static struct quota_format_type v2r0_quota_format = {
 	.qf_fmt_id	= QFMT_VFS_V0,
 	.qf_ops		= &v2_format_ops,
 	.qf_owner	= THIS_MODULE
 };
 
+static struct quota_format_type v2r1_quota_format = {
+	.qf_fmt_id	= QFMT_VFS_V1,
+	.qf_ops		= &v2_format_ops,
+	.qf_owner	= THIS_MODULE
+};
+
 static int __init init_v2_quota_format(void)
 {
-	return register_quota_format(&v2_quota_format);
+	int ret;
+
+	ret = register_quota_format(&v2r0_quota_format);
+	if (ret)
+		return ret;
+	return register_quota_format(&v2r1_quota_format);
 }
 
 static void __exit exit_v2_quota_format(void)
 {
-	unregister_quota_format(&v2_quota_format);
+	unregister_quota_format(&v2r0_quota_format);
+	unregister_quota_format(&v2r1_quota_format);
 }
 
 module_init(init_v2_quota_format);
diff --git a/fs/quota/quotaio_v2.h b/fs/quota/quotaio_v2.h
index 530fe580685..f1966b42c2f 100644
--- a/fs/quota/quotaio_v2.h
+++ b/fs/quota/quotaio_v2.h
@@ -17,8 +17,8 @@
 }
 
 #define V2_INITQVERSIONS {\
-	0,		/* USRQUOTA */\
-	0		/* GRPQUOTA */\
+	1,		/* USRQUOTA */\
+	1		/* GRPQUOTA */\
 }
 
 /* First generic header */
@@ -32,7 +32,7 @@ struct v2_disk_dqheader {
  * (as it appears on disk) - the file is a radix tree whose leaves point
  * to blocks of these structures.
  */
-struct v2_disk_dqblk {
+struct v2r0_disk_dqblk {
 	__le32 dqb_id;		/* id this quota applies to */
 	__le32 dqb_ihardlimit;	/* absolute limit on allocated inodes */
 	__le32 dqb_isoftlimit;	/* preferred inode limit */
@@ -44,6 +44,19 @@ struct v2_disk_dqblk {
 	__le64 dqb_itime;	/* time limit for excessive inode use */
 };
 
+struct v2r1_disk_dqblk {
+	__le32 dqb_id;		/* id this quota applies to */
+	__le32 dqb_pad;
+	__le64 dqb_ihardlimit;	/* absolute limit on allocated inodes */
+	__le64 dqb_isoftlimit;	/* preferred inode limit */
+	__le64 dqb_curinodes;	/* current # allocated inodes */
+	__le64 dqb_bhardlimit;	/* absolute limit on disk space (in QUOTABLOCK_SIZE) */
+	__le64 dqb_bsoftlimit;	/* preferred limit on disk space (in QUOTABLOCK_SIZE) */
+	__le64 dqb_curspace;	/* current space occupied (in bytes) */
+	__le64 dqb_btime;	/* time limit for excessive disk use */
+	__le64 dqb_itime;	/* time limit for excessive inode use */
+};
+
 /* Header with type and version specific information */
 struct v2_disk_dqinfo {
 	__le32 dqi_bgrace;	/* Time before block soft limit becomes hard limit */
diff --git a/fs/read_write.c b/fs/read_write.c
index 3ac28987f22..b7f4a1f94d4 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -826,8 +826,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 	if (!(out_file->f_mode & FMODE_WRITE))
 		goto fput_out;
 	retval = -EINVAL;
-	if (!out_file->f_op || !out_file->f_op->sendpage)
-		goto fput_out;
 	in_inode = in_file->f_path.dentry->d_inode;
 	out_inode = out_file->f_path.dentry->d_inode;
 	retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 513f431038f..ac7cd75c86f 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,6 +1,7 @@
 config REISERFS_FS
 	tristate "Reiserfs support"
 	select CRC32
+	select FS_JOURNAL_INFO
 	help
 	  Stores not just filenames but the files themselves in a balanced
 	  tree.  Uses journalling.
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 7c5ab6330dd..792b3cb2cd1 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -7,7 +7,11 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
 reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
 		 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
 		 hashes.o tail_conversion.o journal.o resize.o \
-		 item_ops.o ioctl.o procfs.o xattr.o
+		 item_ops.o ioctl.o xattr.o lock.o
+
+ifeq ($(CONFIG_REISERFS_PROC_INFO),y)
+reiserfs-objs += procfs.o
+endif
 
 ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
 reiserfs-objs += xattr_user.o xattr_trusted.o
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index e716161ab32..68549570718 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1249,14 +1249,18 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
 	else if (bitmap == 0)
 		block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
 
+	reiserfs_write_unlock(sb);
 	bh = sb_bread(sb, block);
+	reiserfs_write_lock(sb);
 	if (bh == NULL)
 		reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
 		                 "reading failed", __func__, block);
 	else {
 		if (buffer_locked(bh)) {
 			PROC_INFO_INC(sb, scan_bitmap.wait);
+			reiserfs_write_unlock(sb);
 			__wait_on_buffer(bh);
+			reiserfs_write_lock(sb);
 		}
 		BUG_ON(!buffer_uptodate(bh));
 		BUG_ON(atomic_read(&bh->b_count) == 0);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 6d2668fdc38..c094f58c744 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -20,7 +20,7 @@ const struct file_operations reiserfs_dir_operations = {
 	.read = generic_read_dir,
 	.readdir = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
-	.ioctl = reiserfs_ioctl,
+	.unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = reiserfs_compat_ioctl,
 #endif
@@ -174,14 +174,22 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 				// user space buffer is swapped out. At that time
 				// entry can move to somewhere else
 				memcpy(local_buf, d_name, d_reclen);
+
+				/*
+				 * Since filldir might sleep, we can release
+				 * the write lock here for other waiters
+				 */
+				reiserfs_write_unlock(inode->i_sb);
 				if (filldir
 				    (dirent, local_buf, d_reclen, d_off, d_ino,
 				     DT_UNKNOWN) < 0) {
+					reiserfs_write_lock(inode->i_sb);
 					if (local_buf != small_buf) {
 						kfree(local_buf);
 					}
 					goto end;
 				}
+				reiserfs_write_lock(inode->i_sb);
 				if (local_buf != small_buf) {
 					kfree(local_buf);
 				}
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 128d3f7c8aa..60c08044066 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -21,14 +21,6 @@
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
 
-#ifdef CONFIG_REISERFS_CHECK
-
-struct tree_balance *cur_tb = NULL;	/* detects whether more than one
-					   copy of tb exists as a means
-					   of checking whether schedule
-					   is interrupting do_balance */
-#endif
-
 static inline void buffer_info_init_left(struct tree_balance *tb,
                                          struct buffer_info *bi)
 {
@@ -1840,11 +1832,12 @@ static int check_before_balancing(struct tree_balance *tb)
 {
 	int retval = 0;
 
-	if (cur_tb) {
+	if (REISERFS_SB(tb->tb_sb)->cur_tb) {
 		reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
 			       "occurred based on cur_tb not being null at "
 			       "this point in code. do_balance cannot properly "
-			       "handle schedule occurring while it runs.");
+			       "handle concurrent tree accesses on a same "
+			       "mount point.");
 	}
 
 	/* double check that buffers that we will modify are unlocked. (fix_nodes should already have
@@ -1986,7 +1979,7 @@ static inline void do_balance_starts(struct tree_balance *tb)
 	     "check");*/
 	RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
 #ifdef CONFIG_REISERFS_CHECK
-	cur_tb = tb;
+	REISERFS_SB(tb->tb_sb)->cur_tb = tb;
 #endif
 }
 
@@ -1996,7 +1989,7 @@ static inline void do_balance_completed(struct tree_balance *tb)
 #ifdef CONFIG_REISERFS_CHECK
 	check_leaf_level(tb);
 	check_internal_levels(tb);
-	cur_tb = NULL;
+	REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
 #endif
 
 	/* reiserfs_free_block is no longer schedule safe.  So, we need to
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9f436668b7f..da2dba082e2 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -284,7 +284,7 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 const struct file_operations reiserfs_file_operations = {
 	.read = do_sync_read,
 	.write = reiserfs_file_write,
-	.ioctl = reiserfs_ioctl,
+	.unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = reiserfs_compat_ioctl,
 #endif
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 5e5a4e6fbaf..6591cb21edf 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -563,9 +563,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
 	return needed_nodes;
 }
 
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
 
 /* Set parameters for balancing.
  * Performs write of results of analysis of balancing into structure tb,
@@ -834,7 +831,7 @@ static int get_empty_nodes(struct tree_balance *tb, int h)
 		RFALSE(buffer_dirty(new_bh) ||
 		       buffer_journaled(new_bh) ||
 		       buffer_journal_dirty(new_bh),
-		       "PAP-8140: journlaled or dirty buffer %b for the new block",
+		       "PAP-8140: journaled or dirty buffer %b for the new block",
 		       new_bh);
 
 		/* Put empty buffers into the array. */
@@ -1022,7 +1019,11 @@ static int get_far_parent(struct tree_balance *tb,
 	/* Check whether the common parent is locked. */
 
 	if (buffer_locked(*pcom_father)) {
+
+		/* Release the write lock while the buffer is busy */
+		reiserfs_write_unlock(tb->tb_sb);
 		__wait_on_buffer(*pcom_father);
+		reiserfs_write_lock(tb->tb_sb);
 		if (FILESYSTEM_CHANGED_TB(tb)) {
 			brelse(*pcom_father);
 			return REPEAT_SEARCH;
@@ -1927,7 +1928,9 @@ static int get_direct_parent(struct tree_balance *tb, int h)
 		return REPEAT_SEARCH;
 
 	if (buffer_locked(bh)) {
+		reiserfs_write_unlock(tb->tb_sb);
 		__wait_on_buffer(bh);
+		reiserfs_write_lock(tb->tb_sb);
 		if (FILESYSTEM_CHANGED_TB(tb))
 			return REPEAT_SEARCH;
 	}
@@ -1965,7 +1968,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
 		     tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
 								       FL[h]);
 		son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
+		reiserfs_write_unlock(sb);
 		bh = sb_bread(sb, son_number);
+		reiserfs_write_lock(sb);
 		if (!bh)
 			return IO_ERROR;
 		if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2003,7 +2008,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
 		child_position =
 		    (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
 		son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
+		reiserfs_write_unlock(sb);
 		bh = sb_bread(sb, son_number);
+		reiserfs_write_lock(sb);
 		if (!bh)
 			return IO_ERROR;
 		if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2278,7 +2285,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
 				    REPEAT_SEARCH : CARRY_ON;
 			}
 #endif
+			reiserfs_write_unlock(tb->tb_sb);
 			__wait_on_buffer(locked);
+			reiserfs_write_lock(tb->tb_sb);
 			if (FILESYSTEM_CHANGED_TB(tb))
 				return REPEAT_SEARCH;
 		}
@@ -2349,12 +2358,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
 
 	/* if it possible in indirect_to_direct conversion */
 	if (buffer_locked(tbS0)) {
+		reiserfs_write_unlock(tb->tb_sb);
 		__wait_on_buffer(tbS0);
+		reiserfs_write_lock(tb->tb_sb);
 		if (FILESYSTEM_CHANGED_TB(tb))
 			return REPEAT_SEARCH;
 	}
 #ifdef CONFIG_REISERFS_CHECK
-	if (cur_tb) {
+	if (REISERFS_SB(tb->tb_sb)->cur_tb) {
 		print_cur_tb("fix_nodes");
 		reiserfs_panic(tb->tb_sb, "PAP-8305",
 			       "there is pending do_balance");
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a14d6cd9eed..3a28e7751b3 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -251,7 +251,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
 	struct cpu_key key;
 	struct buffer_head *bh;
 	struct item_head *ih, tmp_ih;
-	int fs_gen;
 	b_blocknr_t blocknr;
 	char *p = NULL;
 	int chars;
@@ -265,7 +264,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
 		     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
 		     3);
 
-      research:
 	result = search_for_position_by_key(inode->i_sb, &key, &path);
 	if (result != POSITION_FOUND) {
 		pathrelse(&path);
@@ -340,7 +338,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
 	}
 	// read file tail into part of page
 	offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
-	fs_gen = get_generation(inode->i_sb);
 	copy_item_head(&tmp_ih, ih);
 
 	/* we only want to kmap if we are reading the tail into the page.
@@ -348,13 +345,9 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
 	 ** sure we need to.  But, this means the item might move if
 	 ** kmap schedules
 	 */
-	if (!p) {
+	if (!p)
 		p = (char *)kmap(bh_result->b_page);
-		if (fs_changed(fs_gen, inode->i_sb)
-		    && item_moved(&tmp_ih, &path)) {
-			goto research;
-		}
-	}
+
 	p += offset;
 	memset(p, 0, inode->i_sb->s_blocksize);
 	do {
@@ -489,10 +482,14 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
 	   disappeared */
 	if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
 		int err;
-		lock_kernel();
+
+		reiserfs_write_lock(inode->i_sb);
+
 		err = reiserfs_commit_for_inode(inode);
 		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-		unlock_kernel();
+
+		reiserfs_write_unlock(inode->i_sb);
+
 		if (err < 0)
 			ret = err;
 	}
@@ -601,6 +598,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 	__le32 *item;
 	int done;
 	int fs_gen;
+	int lock_depth;
 	struct reiserfs_transaction_handle *th = NULL;
 	/* space reserved in transaction batch:
 	   . 3 balancings in direct->indirect conversion
@@ -616,12 +614,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 	loff_t new_offset =
 	    (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
 
-	/* bad.... */
-	reiserfs_write_lock(inode->i_sb);
+	lock_depth = reiserfs_write_lock_once(inode->i_sb);
 	version = get_inode_item_key_version(inode);
 
 	if (!file_capable(inode, block)) {
-		reiserfs_write_unlock(inode->i_sb);
+		reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 		return -EFBIG;
 	}
 
@@ -633,7 +630,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 		/* find number of block-th logical block of the file */
 		ret = _get_block_create_0(inode, block, bh_result,
 					  create | GET_BLOCK_READ_DIRECT);
-		reiserfs_write_unlock(inode->i_sb);
+		reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 		return ret;
 	}
 	/*
@@ -751,7 +748,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 		if (!dangle && th)
 			retval = reiserfs_end_persistent_transaction(th);
 
-		reiserfs_write_unlock(inode->i_sb);
+		reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 
 		/* the item was found, so new blocks were not added to the file
 		 ** there is no need to make sure the inode is updated with this
@@ -935,7 +932,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 			if (blocks_needed == 1) {
 				un = &unf_single;
 			} else {
-				un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);	// We need to avoid scheduling.
+				un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS);
 				if (!un) {
 					un = &unf_single;
 					blocks_needed = 1;
@@ -997,10 +994,16 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 			if (retval)
 				goto failure;
 		}
-		/* inserting indirect pointers for a hole can take a
-		 ** long time.  reschedule if needed
+		/*
+		 * inserting indirect pointers for a hole can take a
+		 * long time.  reschedule if needed and also release the write
+		 * lock for others.
 		 */
-		cond_resched();
+		if (need_resched()) {
+			reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+			schedule();
+			lock_depth = reiserfs_write_lock_once(inode->i_sb);
+		}
 
 		retval = search_for_position_by_key(inode->i_sb, &key, &path);
 		if (retval == IO_ERROR) {
@@ -1035,7 +1038,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 			retval = err;
 	}
 
-	reiserfs_write_unlock(inode->i_sb);
+	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 	reiserfs_check_path(&path);
 	return retval;
 }
@@ -2072,8 +2075,9 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
 	int error;
 	struct buffer_head *bh = NULL;
 	int err2;
+	int lock_depth;
 
-	reiserfs_write_lock(inode->i_sb);
+	lock_depth = reiserfs_write_lock_once(inode->i_sb);
 
 	if (inode->i_size > 0) {
 		error = grab_tail_page(inode, &page, &bh);
@@ -2142,14 +2146,17 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
 		page_cache_release(page);
 	}
 
-	reiserfs_write_unlock(inode->i_sb);
+	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+
 	return 0;
       out:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
 	}
-	reiserfs_write_unlock(inode->i_sb);
+
+	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+
 	return error;
 }
 
@@ -2608,7 +2615,10 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
 	int ret;
 	int old_ref = 0;
 
+	reiserfs_write_unlock(inode->i_sb);
 	reiserfs_wait_on_write_block(inode->i_sb);
+	reiserfs_write_lock(inode->i_sb);
+
 	fix_tail_page_for_writing(page);
 	if (reiserfs_transaction_running(inode->i_sb)) {
 		struct reiserfs_transaction_handle *th;
@@ -2664,6 +2674,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
 	int update_sd = 0;
 	struct reiserfs_transaction_handle *th;
 	unsigned start;
+	int lock_depth = 0;
+	bool locked = false;
 
 	if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
 		pos ++;
@@ -2690,9 +2702,11 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
 	 ** to do the i_size updates here.
 	 */
 	pos += copied;
+
 	if (pos > inode->i_size) {
 		struct reiserfs_transaction_handle myth;
-		reiserfs_write_lock(inode->i_sb);
+		lock_depth = reiserfs_write_lock_once(inode->i_sb);
+		locked = true;
 		/* If the file have grown beyond the border where it
 		   can have a tail, unmark it as needing a tail
 		   packing */
@@ -2703,10 +2717,9 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
 			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 
 		ret = journal_begin(&myth, inode->i_sb, 1);
-		if (ret) {
-			reiserfs_write_unlock(inode->i_sb);
+		if (ret)
 			goto journal_error;
-		}
+
 		reiserfs_update_inode_transaction(inode);
 		inode->i_size = pos;
 		/*
@@ -2718,34 +2731,36 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
 		reiserfs_update_sd(&myth, inode);
 		update_sd = 1;
 		ret = journal_end(&myth, inode->i_sb, 1);
-		reiserfs_write_unlock(inode->i_sb);
 		if (ret)
 			goto journal_error;
 	}
 	if (th) {
-		reiserfs_write_lock(inode->i_sb);
+		if (!locked) {
+			lock_depth = reiserfs_write_lock_once(inode->i_sb);
+			locked = true;
+		}
 		if (!update_sd)
 			mark_inode_dirty(inode);
 		ret = reiserfs_end_persistent_transaction(th);
-		reiserfs_write_unlock(inode->i_sb);
 		if (ret)
 			goto out;
 	}
 
       out:
+	if (locked)
+		reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 	unlock_page(page);
 	page_cache_release(page);
 	return ret == 0 ? copied : ret;
 
       journal_error:
+	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+	locked = false;
 	if (th) {
-		reiserfs_write_lock(inode->i_sb);
 		if (!update_sd)
 			reiserfs_update_sd(th, inode);
 		ret = reiserfs_end_persistent_transaction(th);
-		reiserfs_write_unlock(inode->i_sb);
 	}
-
 	goto out;
 }
 
@@ -2758,7 +2773,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
 	int update_sd = 0;
 	struct reiserfs_transaction_handle *th = NULL;
 
+	reiserfs_write_unlock(inode->i_sb);
 	reiserfs_wait_on_write_block(inode->i_sb);
+	reiserfs_write_lock(inode->i_sb);
+
 	if (reiserfs_transaction_running(inode->i_sb)) {
 		th = current->journal_info;
 	}
@@ -2770,7 +2788,6 @@ int reiserfs_commit_write(struct file *f, struct page *page,
 	 */
 	if (pos > inode->i_size) {
 		struct reiserfs_transaction_handle myth;
-		reiserfs_write_lock(inode->i_sb);
 		/* If the file have grown beyond the border where it
 		   can have a tail, unmark it as needing a tail
 		   packing */
@@ -2781,10 +2798,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
 			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 
 		ret = journal_begin(&myth, inode->i_sb, 1);
-		if (ret) {
-			reiserfs_write_unlock(inode->i_sb);
+		if (ret)
 			goto journal_error;
-		}
+
 		reiserfs_update_inode_transaction(inode);
 		inode->i_size = pos;
 		/*
@@ -2796,16 +2812,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
 		reiserfs_update_sd(&myth, inode);
 		update_sd = 1;
 		ret = journal_end(&myth, inode->i_sb, 1);
-		reiserfs_write_unlock(inode->i_sb);
 		if (ret)
 			goto journal_error;
 	}
 	if (th) {
-		reiserfs_write_lock(inode->i_sb);
 		if (!update_sd)
 			mark_inode_dirty(inode);
 		ret = reiserfs_end_persistent_transaction(th);
-		reiserfs_write_unlock(inode->i_sb);
 		if (ret)
 			goto out;
 	}
@@ -2815,11 +2828,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
 
       journal_error:
 	if (th) {
-		reiserfs_write_lock(inode->i_sb);
 		if (!update_sd)
 			reiserfs_update_sd(th, inode);
 		ret = reiserfs_end_persistent_transaction(th);
-		reiserfs_write_unlock(inode->i_sb);
 	}
 
 	return ret;
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 0ccc3fdda7b..ace77451ceb 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -13,44 +13,52 @@
 #include <linux/compat.h>
 
 /*
-** reiserfs_ioctl - handler for ioctl for inode
-** supported commands:
-**  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
-**                           and prevent packing file (argument arg has to be non-zero)
-**  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
-**  3) That's all for a while ...
-*/
-int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-		   unsigned long arg)
+ * reiserfs_ioctl - handler for ioctl for inode
+ * supported commands:
+ *  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
+ *                           and prevent packing file (argument arg has to be non-zero)
+ *  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
+ *  3) That's all for a while ...
+ */
+long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	unsigned int flags;
 	int err = 0;
 
+	reiserfs_write_lock(inode->i_sb);
+
 	switch (cmd) {
 	case REISERFS_IOC_UNPACK:
 		if (S_ISREG(inode->i_mode)) {
 			if (arg)
-				return reiserfs_unpack(inode, filp);
-			else
-				return 0;
+				err = reiserfs_unpack(inode, filp);
 		} else
-			return -ENOTTY;
-		/* following two cases are taken from fs/ext2/ioctl.c by Remy
-		   Card (card@masi.ibp.fr) */
+			err = -ENOTTY;
+		break;
+		/*
+		 * following two cases are taken from fs/ext2/ioctl.c by Remy
+		 * Card (card@masi.ibp.fr)
+		 */
 	case REISERFS_IOC_GETFLAGS:
-		if (!reiserfs_attrs(inode->i_sb))
-			return -ENOTTY;
+		if (!reiserfs_attrs(inode->i_sb)) {
+			err = -ENOTTY;
+			break;
+		}
 
 		flags = REISERFS_I(inode)->i_attrs;
 		i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
-		return put_user(flags, (int __user *)arg);
+		err = put_user(flags, (int __user *)arg);
+		break;
 	case REISERFS_IOC_SETFLAGS:{
-			if (!reiserfs_attrs(inode->i_sb))
-				return -ENOTTY;
+			if (!reiserfs_attrs(inode->i_sb)) {
+				err = -ENOTTY;
+				break;
+			}
 
 			err = mnt_want_write(filp->f_path.mnt);
 			if (err)
-				return err;
+				break;
 
 			if (!is_owner_or_cap(inode)) {
 				err = -EPERM;
@@ -90,16 +98,18 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 			mark_inode_dirty(inode);
 setflags_out:
 			mnt_drop_write(filp->f_path.mnt);
-			return err;
+			break;
 		}
 	case REISERFS_IOC_GETVERSION:
-		return put_user(inode->i_generation, (int __user *)arg);
+		err = put_user(inode->i_generation, (int __user *)arg);
+		break;
 	case REISERFS_IOC_SETVERSION:
 		if (!is_owner_or_cap(inode))
-			return -EPERM;
+			err = -EPERM;
+			break;
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
-			return err;
+			break;
 		if (get_user(inode->i_generation, (int __user *)arg)) {
 			err = -EFAULT;
 			goto setversion_out;
@@ -108,19 +118,20 @@ setflags_out:
 		mark_inode_dirty(inode);
 setversion_out:
 		mnt_drop_write(filp->f_path.mnt);
-		return err;
+		break;
 	default:
-		return -ENOTTY;
+		err = -ENOTTY;
 	}
+
+	reiserfs_write_unlock(inode->i_sb);
+
+	return err;
 }
 
 #ifdef CONFIG_COMPAT
 long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
 				unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
-
 	/* These are just misnamed, they actually get/put from/to user an int */
 	switch (cmd) {
 	case REISERFS_IOC32_UNPACK:
@@ -141,10 +152,8 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
 	default:
 		return -ENOIOCTLCMD;
 	}
-	lock_kernel();
-	ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
-	unlock_kernel();
-	return ret;
+
+	return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
 }
 #endif
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 90622200b39..2f8a7e7b8da 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -429,21 +429,6 @@ static void clear_prepared_bits(struct buffer_head *bh)
 	clear_buffer_journal_restore_dirty(bh);
 }
 
-/* utility function to force a BUG if it is called without the big
-** kernel lock held.  caller is the string printed just before calling BUG()
-*/
-void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
-{
-#ifdef CONFIG_SMP
-	if (current->lock_depth < 0) {
-		reiserfs_panic(sb, "journal-1", "%s called without kernel "
-			       "lock held", caller);
-	}
-#else
-	;
-#endif
-}
-
 /* return a cnode with same dev, block number and size in table, or null if not found */
 static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
 								  super_block
@@ -556,7 +541,8 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
 static inline void lock_journal(struct super_block *sb)
 {
 	PROC_INFO_INC(sb, journal.lock_journal);
-	mutex_lock(&SB_JOURNAL(sb)->j_mutex);
+
+	reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
 }
 
 /* unlock the current transaction */
@@ -708,7 +694,9 @@ static void check_barrier_completion(struct super_block *s,
 		disable_barrier(s);
 		set_buffer_uptodate(bh);
 		set_buffer_dirty(bh);
+		reiserfs_write_unlock(s);
 		sync_dirty_buffer(bh);
+		reiserfs_write_lock(s);
 	}
 }
 
@@ -996,8 +984,13 @@ static int reiserfs_async_progress_wait(struct super_block *s)
 {
 	DEFINE_WAIT(wait);
 	struct reiserfs_journal *j = SB_JOURNAL(s);
-	if (atomic_read(&j->j_async_throttle))
+
+	if (atomic_read(&j->j_async_throttle)) {
+		reiserfs_write_unlock(s);
 		congestion_wait(BLK_RW_ASYNC, HZ / 10);
+		reiserfs_write_lock(s);
+	}
+
 	return 0;
 }
 
@@ -1043,7 +1036,8 @@ static int flush_commit_list(struct super_block *s,
 	}
 
 	/* make sure nobody is trying to flush this one at the same time */
-	mutex_lock(&jl->j_commit_mutex);
+	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
+
 	if (!journal_list_still_alive(s, trans_id)) {
 		mutex_unlock(&jl->j_commit_mutex);
 		goto put_jl;
@@ -1061,12 +1055,17 @@ static int flush_commit_list(struct super_block *s,
 
 	if (!list_empty(&jl->j_bh_list)) {
 		int ret;
-		unlock_kernel();
+
+		/*
+		 * We might sleep in numerous places inside
+		 * write_ordered_buffers. Relax the write lock.
+		 */
+		reiserfs_write_unlock(s);
 		ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
 					    journal, jl, &jl->j_bh_list);
 		if (ret < 0 && retval == 0)
 			retval = ret;
-		lock_kernel();
+		reiserfs_write_lock(s);
 	}
 	BUG_ON(!list_empty(&jl->j_bh_list));
 	/*
@@ -1085,8 +1084,11 @@ static int flush_commit_list(struct super_block *s,
 		    SB_ONDISK_JOURNAL_SIZE(s);
 		tbh = journal_find_get_block(s, bn);
 		if (tbh) {
-			if (buffer_dirty(tbh))
-			    ll_rw_block(WRITE, 1, &tbh) ;
+			if (buffer_dirty(tbh)) {
+		            reiserfs_write_unlock(s);
+			    ll_rw_block(WRITE, 1, &tbh);
+			    reiserfs_write_lock(s);
+			}
 			put_bh(tbh) ;
 		}
 	}
@@ -1114,12 +1116,19 @@ static int flush_commit_list(struct super_block *s,
 		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
 		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
 		tbh = journal_find_get_block(s, bn);
+
+		reiserfs_write_unlock(s);
 		wait_on_buffer(tbh);
+		reiserfs_write_lock(s);
 		// since we're using ll_rw_blk above, it might have skipped over
 		// a locked buffer.  Double check here
 		//
-		if (buffer_dirty(tbh))	/* redundant, sync_dirty_buffer() checks */
+		/* redundant, sync_dirty_buffer() checks */
+		if (buffer_dirty(tbh)) {
+			reiserfs_write_unlock(s);
 			sync_dirty_buffer(tbh);
+			reiserfs_write_lock(s);
+		}
 		if (unlikely(!buffer_uptodate(tbh))) {
 #ifdef CONFIG_REISERFS_CHECK
 			reiserfs_warning(s, "journal-601",
@@ -1143,10 +1152,15 @@ static int flush_commit_list(struct super_block *s,
 			if (buffer_dirty(jl->j_commit_bh))
 				BUG();
 			mark_buffer_dirty(jl->j_commit_bh) ;
+			reiserfs_write_unlock(s);
 			sync_dirty_buffer(jl->j_commit_bh) ;
+			reiserfs_write_lock(s);
 		}
-	} else
+	} else {
+		reiserfs_write_unlock(s);
 		wait_on_buffer(jl->j_commit_bh);
+		reiserfs_write_lock(s);
+	}
 
 	check_barrier_completion(s, jl->j_commit_bh);
 
@@ -1286,7 +1300,9 @@ static int _update_journal_header_block(struct super_block *sb,
 
 	if (trans_id >= journal->j_last_flush_trans_id) {
 		if (buffer_locked((journal->j_header_bh))) {
+			reiserfs_write_unlock(sb);
 			wait_on_buffer((journal->j_header_bh));
+			reiserfs_write_lock(sb);
 			if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
 #ifdef CONFIG_REISERFS_CHECK
 				reiserfs_warning(sb, "journal-699",
@@ -1312,12 +1328,16 @@ static int _update_journal_header_block(struct super_block *sb,
 				disable_barrier(sb);
 				goto sync;
 			}
+			reiserfs_write_unlock(sb);
 			wait_on_buffer(journal->j_header_bh);
+			reiserfs_write_lock(sb);
 			check_barrier_completion(sb, journal->j_header_bh);
 		} else {
 		      sync:
 			set_buffer_dirty(journal->j_header_bh);
+			reiserfs_write_unlock(sb);
 			sync_dirty_buffer(journal->j_header_bh);
+			reiserfs_write_lock(sb);
 		}
 		if (!buffer_uptodate(journal->j_header_bh)) {
 			reiserfs_warning(sb, "journal-837",
@@ -1409,7 +1429,7 @@ static int flush_journal_list(struct super_block *s,
 
 	/* if flushall == 0, the lock is already held */
 	if (flushall) {
-		mutex_lock(&journal->j_flush_mutex);
+		reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
 	} else if (mutex_trylock(&journal->j_flush_mutex)) {
 		BUG();
 	}
@@ -1553,7 +1573,11 @@ static int flush_journal_list(struct super_block *s,
 					reiserfs_panic(s, "journal-1011",
 						       "cn->bh is NULL");
 				}
+
+				reiserfs_write_unlock(s);
 				wait_on_buffer(cn->bh);
+				reiserfs_write_lock(s);
+
 				if (!cn->bh) {
 					reiserfs_panic(s, "journal-1012",
 						       "cn->bh is NULL");
@@ -1769,7 +1793,7 @@ static int kupdate_transactions(struct super_block *s,
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
 	chunk.nr = 0;
 
-	mutex_lock(&journal->j_flush_mutex);
+	reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
 	if (!journal_list_still_alive(s, orig_trans_id)) {
 		goto done;
 	}
@@ -1973,11 +1997,19 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
 	reiserfs_mounted_fs_count--;
 	/* wait for all commits to finish */
 	cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
+
+	/*
+	 * We must release the write lock here because
+	 * the workqueue job (flush_async_commit) needs this lock
+	 */
+	reiserfs_write_unlock(sb);
 	flush_workqueue(commit_wq);
+
 	if (!reiserfs_mounted_fs_count) {
 		destroy_workqueue(commit_wq);
 		commit_wq = NULL;
 	}
+	reiserfs_write_lock(sb);
 
 	free_journal_ram(sb);
 
@@ -2243,7 +2275,11 @@ static int journal_read_transaction(struct super_block *sb,
 	/* read in the log blocks, memcpy to the corresponding real block */
 	ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
 	for (i = 0; i < get_desc_trans_len(desc); i++) {
+
+		reiserfs_write_unlock(sb);
 		wait_on_buffer(log_blocks[i]);
+		reiserfs_write_lock(sb);
+
 		if (!buffer_uptodate(log_blocks[i])) {
 			reiserfs_warning(sb, "journal-1212",
 					 "REPLAY FAILURE fsck required! "
@@ -2765,11 +2801,27 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 		goto free_and_return;
 	}
 
+	/*
+	 * We need to unlock here to avoid creating the following
+	 * dependency:
+	 * reiserfs_lock -> sysfs_mutex
+	 * Because the reiserfs mmap path creates the following dependency:
+	 * mm->mmap -> reiserfs_lock, hence we have
+	 * mm->mmap -> reiserfs_lock ->sysfs_mutex
+	 * This would ends up in a circular dependency with sysfs readdir path
+	 * which does sysfs_mutex -> mm->mmap_sem
+	 * This is fine because the reiserfs lock is useless in mount path,
+	 * at least until we call journal_begin. We keep it for paranoid
+	 * reasons.
+	 */
+	reiserfs_write_unlock(sb);
 	if (journal_init_dev(sb, journal, j_dev_name) != 0) {
+		reiserfs_write_lock(sb);
 		reiserfs_warning(sb, "sh-462",
 				 "unable to initialize jornal device");
 		goto free_and_return;
 	}
+	reiserfs_write_lock(sb);
 
 	rs = SB_DISK_SUPER_BLOCK(sb);
 
@@ -2881,8 +2933,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 	}
 
 	reiserfs_mounted_fs_count++;
-	if (reiserfs_mounted_fs_count <= 1)
+	if (reiserfs_mounted_fs_count <= 1) {
+		reiserfs_write_unlock(sb);
 		commit_wq = create_workqueue("reiserfs");
+		reiserfs_write_lock(sb);
+	}
 
 	INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
 	journal->j_work_sb = sb;
@@ -2964,8 +3019,11 @@ static void queue_log_writer(struct super_block *s)
 	init_waitqueue_entry(&wait, current);
 	add_wait_queue(&journal->j_join_wait, &wait);
 	set_current_state(TASK_UNINTERRUPTIBLE);
-	if (test_bit(J_WRITERS_QUEUED, &journal->j_state))
+	if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
+		reiserfs_write_unlock(s);
 		schedule();
+		reiserfs_write_lock(s);
+	}
 	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&journal->j_join_wait, &wait);
 }
@@ -2982,7 +3040,9 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
 	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	unsigned long bcount = journal->j_bcount;
 	while (1) {
+		reiserfs_write_unlock(sb);
 		schedule_timeout_uninterruptible(1);
+		reiserfs_write_lock(sb);
 		journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
 		while ((atomic_read(&journal->j_wcount) > 0 ||
 			atomic_read(&journal->j_jlock)) &&
@@ -3033,7 +3093,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
 
 	if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
 		unlock_journal(sb);
+		reiserfs_write_unlock(sb);
 		reiserfs_wait_on_write_block(sb);
+		reiserfs_write_lock(sb);
 		PROC_INFO_INC(sb, journal.journal_relock_writers);
 		goto relock;
 	}
@@ -3506,14 +3568,14 @@ static void flush_async_commits(struct work_struct *work)
 	struct reiserfs_journal_list *jl;
 	struct list_head *entry;
 
-	lock_kernel();
+	reiserfs_write_lock(sb);
 	if (!list_empty(&journal->j_journal_list)) {
 		/* last entry is the youngest, commit it and you get everything */
 		entry = journal->j_journal_list.prev;
 		jl = JOURNAL_LIST_ENTRY(entry);
 		flush_commit_list(sb, jl, 1);
 	}
-	unlock_kernel();
+	reiserfs_write_unlock(sb);
 }
 
 /*
@@ -4041,7 +4103,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 	 * the new transaction is fully setup, and we've already flushed the
 	 * ordered bh list
 	 */
-	mutex_lock(&jl->j_commit_mutex);
+	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
 
 	/* save the transaction id in case we need to commit it later */
 	commit_trans_id = jl->j_trans_id;
@@ -4156,7 +4218,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 		next = cn->next;
 		free_cnode(sb, cn);
 		cn = next;
+		reiserfs_write_unlock(sb);
 		cond_resched();
+		reiserfs_write_lock(sb);
 	}
 
 	/* we are done  with both the c_bh and d_bh, but
@@ -4203,10 +4267,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 	 * is lost.
 	 */
 	if (!list_empty(&jl->j_tail_bh_list)) {
-		unlock_kernel();
+		reiserfs_write_unlock(sb);
 		write_ordered_buffers(&journal->j_dirty_buffers_lock,
 				      journal, jl, &jl->j_tail_bh_list);
-		lock_kernel();
+		reiserfs_write_lock(sb);
 	}
 	BUG_ON(!list_empty(&jl->j_tail_bh_list));
 	mutex_unlock(&jl->j_commit_mutex);
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
new file mode 100644
index 00000000000..ee2cfc0fd8a
--- /dev/null
+++ b/fs/reiserfs/lock.c
@@ -0,0 +1,88 @@
+#include <linux/reiserfs_fs.h>
+#include <linux/mutex.h>
+
+/*
+ * The previous reiserfs locking scheme was heavily based on
+ * the tricky properties of the Bkl:
+ *
+ * - it was acquired recursively by a same task
+ * - the performances relied on the release-while-schedule() property
+ *
+ * Now that we replace it by a mutex, we still want to keep the same
+ * recursive property to avoid big changes in the code structure.
+ * We use our own lock_owner here because the owner field on a mutex
+ * is only available in SMP or mutex debugging, also we only need this field
+ * for this mutex, no need for a system wide mutex facility.
+ *
+ * Also this lock is often released before a call that could block because
+ * reiserfs performances were partialy based on the release while schedule()
+ * property of the Bkl.
+ */
+void reiserfs_write_lock(struct super_block *s)
+{
+	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+
+	if (sb_i->lock_owner != current) {
+		mutex_lock(&sb_i->lock);
+		sb_i->lock_owner = current;
+	}
+
+	/* No need to protect it, only the current task touches it */
+	sb_i->lock_depth++;
+}
+
+void reiserfs_write_unlock(struct super_block *s)
+{
+	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+
+	/*
+	 * Are we unlocking without even holding the lock?
+	 * Such a situation must raise a BUG() if we don't want
+	 * to corrupt the data.
+	 */
+	BUG_ON(sb_i->lock_owner != current);
+
+	if (--sb_i->lock_depth == -1) {
+		sb_i->lock_owner = NULL;
+		mutex_unlock(&sb_i->lock);
+	}
+}
+
+/*
+ * If we already own the lock, just exit and don't increase the depth.
+ * Useful when we don't want to lock more than once.
+ *
+ * We always return the lock_depth we had before calling
+ * this function.
+ */
+int reiserfs_write_lock_once(struct super_block *s)
+{
+	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+
+	if (sb_i->lock_owner != current) {
+		mutex_lock(&sb_i->lock);
+		sb_i->lock_owner = current;
+		return sb_i->lock_depth++;
+	}
+
+	return sb_i->lock_depth;
+}
+
+void reiserfs_write_unlock_once(struct super_block *s, int lock_depth)
+{
+	if (lock_depth == -1)
+		reiserfs_write_unlock(s);
+}
+
+/*
+ * Utility function to force a BUG if it is called without the superblock
+ * write lock held.  caller is the string printed just before calling BUG()
+ */
+void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
+{
+	struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
+
+	if (sb_i->lock_depth < 0)
+		reiserfs_panic(sb, "%s called without kernel lock held %d",
+			       caller);
+}
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 27157912863..e296ff72a6c 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -324,6 +324,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
 				      struct nameidata *nd)
 {
 	int retval;
+	int lock_depth;
 	struct inode *inode = NULL;
 	struct reiserfs_dir_entry de;
 	INITIALIZE_PATH(path_to_entry);
@@ -331,7 +332,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	reiserfs_write_lock(dir->i_sb);
+	/*
+	 * Might be called with or without the write lock, must be careful
+	 * to not recursively hold it in case we want to release the lock
+	 * before rescheduling.
+	 */
+	lock_depth = reiserfs_write_lock_once(dir->i_sb);
+
 	de.de_gen_number_bit_string = NULL;
 	retval =
 	    reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
@@ -341,7 +348,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
 		inode = reiserfs_iget(dir->i_sb,
 				      (struct cpu_key *)&(de.de_dir_id));
 		if (!inode || IS_ERR(inode)) {
-			reiserfs_write_unlock(dir->i_sb);
+			reiserfs_write_unlock_once(dir->i_sb, lock_depth);
 			return ERR_PTR(-EACCES);
 		}
 
@@ -350,7 +357,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
 		if (IS_PRIVATE(dir))
 			inode->i_flags |= S_PRIVATE;
 	}
-	reiserfs_write_unlock(dir->i_sb);
+	reiserfs_write_unlock_once(dir->i_sb, lock_depth);
 	if (retval == IO_ERROR) {
 		return ERR_PTR(-EIO);
 	}
@@ -725,6 +732,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	struct inode *inode;
 	struct reiserfs_transaction_handle th;
 	struct reiserfs_security_handle security;
+	int lock_depth;
 	/* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
 	int jbegin_count =
 	    JOURNAL_PER_BALANCE_CNT * 3 +
@@ -748,7 +756,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		return retval;
 	}
 	jbegin_count += retval;
-	reiserfs_write_lock(dir->i_sb);
+	lock_depth = reiserfs_write_lock_once(dir->i_sb);
 
 	retval = journal_begin(&th, dir->i_sb, jbegin_count);
 	if (retval) {
@@ -798,8 +806,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	d_instantiate(dentry, inode);
 	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
-      out_failed:
-	reiserfs_write_unlock(dir->i_sb);
+out_failed:
+	reiserfs_write_unlock_once(dir->i_sb, lock_depth);
 	return retval;
 }
 
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 536eacaeb71..adbc6f53851 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -349,10 +349,6 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
 
    .  */
 
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
-
 void __reiserfs_panic(struct super_block *sb, const char *id,
 		      const char *function, const char *fmt, ...)
 {
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 9229e5514a4..7a9981196c1 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -17,8 +17,6 @@
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 
-#ifdef CONFIG_REISERFS_PROC_INFO
-
 /*
  * LOCKING:
  *
@@ -48,14 +46,6 @@ static int show_version(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset,
-				    int count, int *eof, void *data)
-{
-	*start = buffer;
-	*eof = 1;
-	return 0;
-}
-
 #define SF( x ) ( r -> x )
 #define SFP( x ) SF( s_proc_info_data.x )
 #define SFPL( x ) SFP( x[ level ] )
@@ -538,19 +528,6 @@ int reiserfs_proc_info_done(struct super_block *sb)
 	return 0;
 }
 
-struct proc_dir_entry *reiserfs_proc_register_global(char *name,
-						     read_proc_t * func)
-{
-	return (proc_info_root) ? create_proc_read_entry(name, 0,
-							 proc_info_root,
-							 func, NULL) : NULL;
-}
-
-void reiserfs_proc_unregister_global(const char *name)
-{
-	remove_proc_entry(name, proc_info_root);
-}
-
 int reiserfs_proc_info_global_init(void)
 {
 	if (proc_info_root == NULL) {
@@ -572,48 +549,6 @@ int reiserfs_proc_info_global_done(void)
 	}
 	return 0;
 }
-
-/* REISERFS_PROC_INFO */
-#else
-
-int reiserfs_proc_info_init(struct super_block *sb)
-{
-	return 0;
-}
-int reiserfs_proc_info_done(struct super_block *sb)
-{
-	return 0;
-}
-
-struct proc_dir_entry *reiserfs_proc_register_global(char *name,
-						     read_proc_t * func)
-{
-	return NULL;
-}
-
-void reiserfs_proc_unregister_global(const char *name)
-{;
-}
-
-int reiserfs_proc_info_global_init(void)
-{
-	return 0;
-}
-int reiserfs_proc_info_global_done(void)
-{
-	return 0;
-}
-
-int reiserfs_global_version_in_proc(char *buffer, char **start,
-				    off_t offset,
-				    int count, int *eof, void *data)
-{
-	return 0;
-}
-
-/* REISERFS_PROC_INFO */
-#endif
-
 /*
  * Revision 1.1.8.2  2001/07/15 17:08:42  god
  *  . use get_super() in procfs.c
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 18b315d3d10..b3a94d20f0f 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -141,7 +141,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
 
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
+			reiserfs_write_unlock(s);
 			sync_dirty_buffer(bh);
+			reiserfs_write_lock(s);
 			// update bitmap_info stuff
 			bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
 			brelse(bh);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index d036ee5b1c8..5fa7118f04e 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -222,9 +222,6 @@ static inline int bin_search(const void *key,	/* Key to search for. */
 	return ITEM_NOT_FOUND;
 }
 
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
 
 /* Minimal possible key. It is never in the tree. */
 const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
@@ -519,25 +516,48 @@ static int is_tree_node(struct buffer_head *bh, int level)
 
 #define SEARCH_BY_KEY_READA 16
 
-/* The function is NOT SCHEDULE-SAFE! */
-static void search_by_key_reada(struct super_block *s,
+/*
+ * The function is NOT SCHEDULE-SAFE!
+ * It might unlock the write lock if we needed to wait for a block
+ * to be read. Note that in this case it won't recover the lock to avoid
+ * high contention resulting from too much lock requests, especially
+ * the caller (search_by_key) will perform other schedule-unsafe
+ * operations just after calling this function.
+ *
+ * @return true if we have unlocked
+ */
+static bool search_by_key_reada(struct super_block *s,
 				struct buffer_head **bh,
 				b_blocknr_t *b, int num)
 {
 	int i, j;
+	bool unlocked = false;
 
 	for (i = 0; i < num; i++) {
 		bh[i] = sb_getblk(s, b[i]);
 	}
+	/*
+	 * We are going to read some blocks on which we
+	 * have a reference. It's safe, though we might be
+	 * reading blocks concurrently changed if we release
+	 * the lock. But it's still fine because we check later
+	 * if the tree changed
+	 */
 	for (j = 0; j < i; j++) {
 		/*
 		 * note, this needs attention if we are getting rid of the BKL
 		 * you have to make sure the prepared bit isn't set on this buffer
 		 */
-		if (!buffer_uptodate(bh[j]))
+		if (!buffer_uptodate(bh[j])) {
+			if (!unlocked) {
+				reiserfs_write_unlock(s);
+				unlocked = true;
+			}
 			ll_rw_block(READA, 1, bh + j);
+		}
 		brelse(bh[j]);
 	}
+	return unlocked;
 }
 
 /**************************************************************************
@@ -625,11 +645,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,	/* Key to s
 		   have a pointer to it. */
 		if ((bh = last_element->pe_buffer =
 		     sb_getblk(sb, block_number))) {
+			bool unlocked = false;
+
 			if (!buffer_uptodate(bh) && reada_count > 1)
-				search_by_key_reada(sb, reada_bh,
+				/* may unlock the write lock */
+				unlocked = search_by_key_reada(sb, reada_bh,
 						    reada_blocks, reada_count);
+			/*
+			 * If we haven't already unlocked the write lock,
+			 * then we need to do that here before reading
+			 * the current block
+			 */
+			if (!buffer_uptodate(bh) && !unlocked) {
+				reiserfs_write_unlock(sb);
+				unlocked = true;
+			}
 			ll_rw_block(READ, 1, &bh);
 			wait_on_buffer(bh);
+
+			if (unlocked)
+				reiserfs_write_lock(sb);
 			if (!buffer_uptodate(bh))
 				goto io_error;
 		} else {
@@ -673,7 +708,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,	/* Key to s
 		       !key_in_buffer(search_path, key, sb),
 		       "PAP-5130: key is not in the buffer");
 #ifdef CONFIG_REISERFS_CHECK
-		if (cur_tb) {
+		if (REISERFS_SB(sb)->cur_tb) {
 			print_cur_tb("5140");
 			reiserfs_panic(sb, "PAP-5140",
 				       "schedule occurred in do_balance!");
@@ -1024,7 +1059,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
 			reiserfs_free_block(th, inode, block, 1);
 		    }
 
+		    reiserfs_write_unlock(sb);
 		    cond_resched();
+		    reiserfs_write_lock(sb);
 
 		    if (item_moved (&s_ih, path))  {
 			need_re_search = 1;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f0ad05f3802..b4a7dd03bdb 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -465,7 +465,7 @@ static void reiserfs_put_super(struct super_block *s)
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
-	lock_kernel();
+	reiserfs_write_lock(s);
 
 	if (s->s_dirt)
 		reiserfs_write_super(s);
@@ -499,10 +499,10 @@ static void reiserfs_put_super(struct super_block *s)
 
 	reiserfs_proc_info_done(s);
 
+	reiserfs_write_unlock(s);
+	mutex_destroy(&REISERFS_SB(s)->lock);
 	kfree(s->s_fs_info);
 	s->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static struct kmem_cache *reiserfs_inode_cachep;
@@ -554,25 +554,28 @@ static void reiserfs_dirty_inode(struct inode *inode)
 	struct reiserfs_transaction_handle th;
 
 	int err = 0;
+	int lock_depth;
+
 	if (inode->i_sb->s_flags & MS_RDONLY) {
 		reiserfs_warning(inode->i_sb, "clm-6006",
 				 "writing inode %lu on readonly FS",
 				 inode->i_ino);
 		return;
 	}
-	reiserfs_write_lock(inode->i_sb);
+	lock_depth = reiserfs_write_lock_once(inode->i_sb);
 
 	/* this is really only used for atime updates, so they don't have
 	 ** to be included in O_SYNC or fsync
 	 */
 	err = journal_begin(&th, inode->i_sb, 1);
-	if (err) {
-		reiserfs_write_unlock(inode->i_sb);
-		return;
-	}
+	if (err)
+		goto out;
+
 	reiserfs_update_sd(&th, inode);
 	journal_end(&th, inode->i_sb, 1);
-	reiserfs_write_unlock(inode->i_sb);
+
+out:
+	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 }
 
 #ifdef CONFIG_QUOTA
@@ -1168,11 +1171,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	unsigned int qfmt = 0;
 #ifdef CONFIG_QUOTA
 	int i;
+#endif
+
+	reiserfs_write_lock(s);
 
+#ifdef CONFIG_QUOTA
 	memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
 #endif
 
-	lock_kernel();
 	rs = SB_DISK_SUPER_BLOCK(s);
 
 	if (!reiserfs_parse_options
@@ -1295,12 +1301,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 out_ok:
 	replace_mount_options(s, new_opts);
-	unlock_kernel();
+	reiserfs_write_unlock(s);
 	return 0;
 
 out_err:
 	kfree(new_opts);
-	unlock_kernel();
+	reiserfs_write_unlock(s);
 	return err;
 }
 
@@ -1404,7 +1410,9 @@ static int read_super_block(struct super_block *s, int offset)
 static int reread_meta_blocks(struct super_block *s)
 {
 	ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
+	reiserfs_write_unlock(s);
 	wait_on_buffer(SB_BUFFER_WITH_SB(s));
+	reiserfs_write_lock(s);
 	if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
 		reiserfs_warning(s, "reiserfs-2504", "error reading the super");
 		return 1;
@@ -1613,7 +1621,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
 	if (!sbi) {
 		errval = -ENOMEM;
-		goto error;
+		goto error_alloc;
 	}
 	s->s_fs_info = sbi;
 	/* Set default values for options: non-aggressive tails, RO on errors */
@@ -1627,6 +1635,20 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	/* setup default block allocator options */
 	reiserfs_init_alloc_options(s);
 
+	mutex_init(&REISERFS_SB(s)->lock);
+	REISERFS_SB(s)->lock_depth = -1;
+
+	/*
+	 * This function is called with the bkl, which also was the old
+	 * locking used here.
+	 * do_journal_begin() will soon check if we hold the lock (ie: was the
+	 * bkl). This is likely because do_journal_begin() has several another
+	 * callers because at this time, it doesn't seem to be necessary to
+	 * protect against anything.
+	 * Anyway, let's be conservative and lock for now.
+	 */
+	reiserfs_write_lock(s);
+
 	jdev_name = NULL;
 	if (reiserfs_parse_options
 	    (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
@@ -1852,9 +1874,13 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	init_waitqueue_head(&(sbi->s_wait));
 	spin_lock_init(&sbi->bitmap_lock);
 
+	reiserfs_write_unlock(s);
+
 	return (0);
 
 error:
+	reiserfs_write_unlock(s);
+error_alloc:
 	if (jinit_done) {	/* kill the commit thread, free journal ram */
 		journal_release_error(NULL, s);
 	}
@@ -2196,8 +2222,6 @@ static int __init init_reiserfs_fs(void)
 	}
 
 	reiserfs_proc_info_global_init();
-	reiserfs_proc_register_global("version",
-				      reiserfs_global_version_in_proc);
 
 	ret = register_filesystem(&reiserfs_fs_type);
 
@@ -2205,7 +2229,6 @@ static int __init init_reiserfs_fs(void)
 		return 0;
 	}
 
-	reiserfs_proc_unregister_global("version");
 	reiserfs_proc_info_global_done();
 	destroy_inodecache();
 
@@ -2214,7 +2237,6 @@ static int __init init_reiserfs_fs(void)
 
 static void __exit exit_reiserfs_fs(void)
 {
-	reiserfs_proc_unregister_global("version");
 	reiserfs_proc_info_global_done();
 	unregister_filesystem(&reiserfs_fs_type);
 	destroy_inodecache();
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6925b835a43..58aa8e75f7f 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -975,7 +975,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 	int err = 0;
 
 	/* If we don't have the privroot located yet - go find it */
-	mutex_lock(&s->s_root->d_inode->i_mutex);
+	reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
 	dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
 				strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
@@ -1004,14 +1004,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
 		goto error;
 
 	if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
-		mutex_lock(&s->s_root->d_inode->i_mutex);
+		reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
 		err = create_privroot(REISERFS_SB(s)->priv_root);
 		mutex_unlock(&s->s_root->d_inode->i_mutex);
 	}
 
 	if (privroot->d_inode) {
 		s->s_xattr = reiserfs_xattr_handlers;
-		mutex_lock(&privroot->d_inode->i_mutex);
+		reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s);
 		if (!REISERFS_SB(s)->xattr_root) {
 			struct dentry *dentry;
 			dentry = lookup_one_len(XAROOT_NAME, privroot,
diff --git a/fs/splice.c b/fs/splice.c
index 7394e9e1753..39208663aaf 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -648,9 +648,11 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 	ret = buf->ops->confirm(pipe, buf);
 	if (!ret) {
 		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
-
-		ret = file->f_op->sendpage(file, buf->page, buf->offset,
-					   sd->len, &pos, more);
+		if (file->f_op && file->f_op->sendpage)
+			ret = file->f_op->sendpage(file, buf->page, buf->offset,
+						   sd->len, &pos, more);
+		else
+			ret = -EINVAL;
 	}
 
 	return ret;
@@ -1068,8 +1070,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (unlikely(ret < 0))
 		return ret;
 
-	splice_write = out->f_op->splice_write;
-	if (!splice_write)
+	if (out->f_op && out->f_op->splice_write)
+		splice_write = out->f_op->splice_write;
+	else
 		splice_write = default_file_splice_write;
 
 	return splice_write(pipe, out, ppos, len, flags);
@@ -1093,8 +1096,9 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 	if (unlikely(ret < 0))
 		return ret;
 
-	splice_read = in->f_op->splice_read;
-	if (!splice_read)
+	if (in->f_op && in->f_op->splice_read)
+		splice_read = in->f_op->splice_read;
+	else
 		splice_read = default_file_splice_read;
 
 	return splice_read(in, ppos, pipe, len, flags);
@@ -1316,7 +1320,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		if (off_in)
 			return -ESPIPE;
 		if (off_out) {
-			if (out->f_op->llseek == no_llseek)
+			if (!out->f_op || !out->f_op->llseek ||
+			    out->f_op->llseek == no_llseek)
 				return -EINVAL;
 			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
 				return -EFAULT;
@@ -1336,7 +1341,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		if (off_out)
 			return -ESPIPE;
 		if (off_in) {
-			if (in->f_op->llseek == no_llseek)
+			if (!in->f_op || !in->f_op->llseek ||
+			    in->f_op->llseek == no_llseek)
 				return -EINVAL;
 			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
 				return -EFAULT;
diff --git a/fs/sync.c b/fs/sync.c
index d104591b066..36752a68348 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -295,10 +295,11 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
  */
 int generic_write_sync(struct file *file, loff_t pos, loff_t count)
 {
-	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
+	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
 		return 0;
 	return vfs_fsync_range(file, file->f_path.dentry, pos,
-			       pos + count - 1, 1);
+			       pos + count - 1,
+			       (file->f_flags & __O_SYNC) ? 0 : 1);
 }
 EXPORT_SYMBOL(generic_write_sync);
 
@@ -452,9 +453,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
 
 	ret = 0;
 	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
-		ret = wait_on_page_writeback_range(mapping,
-					offset >> PAGE_CACHE_SHIFT,
-					endbyte >> PAGE_CACHE_SHIFT);
+		ret = filemap_fdatawait_range(mapping, offset, endbyte);
 		if (ret < 0)
 			goto out;
 	}
@@ -467,9 +466,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
 	}
 
 	if (flags & SYNC_FILE_RANGE_WAIT_AFTER) {
-		ret = wait_on_page_writeback_range(mapping,
-					offset >> PAGE_CACHE_SHIFT,
-					endbyte >> PAGE_CACHE_SHIFT);
+		ret = filemap_fdatawait_range(mapping, offset, endbyte);
 	}
 out:
 	return ret;
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e0201837d24..f05f2303a8b 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -25,7 +25,6 @@
 #include "sysfs.h"
 
 DEFINE_MUTEX(sysfs_mutex);
-DEFINE_MUTEX(sysfs_rename_mutex);
 DEFINE_SPINLOCK(sysfs_assoc_lock);
 
 static DEFINE_SPINLOCK(sysfs_ino_lock);
@@ -85,46 +84,6 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 }
 
 /**
- *	sysfs_get_dentry - get dentry for the given sysfs_dirent
- *	@sd: sysfs_dirent of interest
- *
- *	Get dentry for @sd.  Dentry is looked up if currently not
- *	present.  This function descends from the root looking up
- *	dentry for each step.
- *
- *	LOCKING:
- *	mutex_lock(sysfs_rename_mutex)
- *
- *	RETURNS:
- *	Pointer to found dentry on success, ERR_PTR() value on error.
- */
-struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd)
-{
-	struct dentry *dentry = dget(sysfs_sb->s_root);
-
-	while (dentry->d_fsdata != sd) {
-		struct sysfs_dirent *cur;
-		struct dentry *parent;
-
-		/* find the first ancestor which hasn't been looked up */
-		cur = sd;
-		while (cur->s_parent != dentry->d_fsdata)
-			cur = cur->s_parent;
-
-		/* look it up */
-		parent = dentry;
-		mutex_lock(&parent->d_inode->i_mutex);
-		dentry = lookup_one_noperm(cur->s_name, parent);
-		mutex_unlock(&parent->d_inode->i_mutex);
-		dput(parent);
-
-		if (IS_ERR(dentry))
-			break;
-	}
-	return dentry;
-}
-
-/**
  *	sysfs_get_active - get an active reference to sysfs_dirent
  *	@sd: sysfs_dirent to get an active reference to
  *
@@ -298,7 +257,61 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
 		goto repeat;
 }
 
-static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
+static int sysfs_dentry_delete(struct dentry *dentry)
+{
+	struct sysfs_dirent *sd = dentry->d_fsdata;
+	return !!(sd->s_flags & SYSFS_FLAG_REMOVED);
+}
+
+static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct sysfs_dirent *sd = dentry->d_fsdata;
+	int is_dir;
+
+	mutex_lock(&sysfs_mutex);
+
+	/* The sysfs dirent has been deleted */
+	if (sd->s_flags & SYSFS_FLAG_REMOVED)
+		goto out_bad;
+
+	/* The sysfs dirent has been moved? */
+	if (dentry->d_parent->d_fsdata != sd->s_parent)
+		goto out_bad;
+
+	/* The sysfs dirent has been renamed */
+	if (strcmp(dentry->d_name.name, sd->s_name) != 0)
+		goto out_bad;
+
+	mutex_unlock(&sysfs_mutex);
+out_valid:
+	return 1;
+out_bad:
+	/* Remove the dentry from the dcache hashes.
+	 * If this is a deleted dentry we use d_drop instead of d_delete
+	 * so sysfs doesn't need to cope with negative dentries.
+	 *
+	 * If this is a dentry that has simply been renamed we
+	 * use d_drop to remove it from the dcache lookup on its
+	 * old parent.  If this dentry persists later when a lookup
+	 * is performed at its new name the dentry will be readded
+	 * to the dcache hashes.
+	 */
+	is_dir = (sysfs_type(sd) == SYSFS_DIR);
+	mutex_unlock(&sysfs_mutex);
+	if (is_dir) {
+		/* If we have submounts we must allow the vfs caches
+		 * to lie about the state of the filesystem to prevent
+		 * leaks and other nasty things.
+		 */
+		if (have_submounts(dentry))
+			goto out_valid;
+		shrink_dcache_parent(dentry);
+	}
+	d_drop(dentry);
+	return 0;
+}
+
+static void sysfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 {
 	struct sysfs_dirent * sd = dentry->d_fsdata;
 
@@ -307,7 +320,9 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
 }
 
 static const struct dentry_operations sysfs_dentry_ops = {
-	.d_iput		= sysfs_d_iput,
+	.d_revalidate	= sysfs_dentry_revalidate,
+	.d_delete	= sysfs_dentry_delete,
+	.d_iput		= sysfs_dentry_iput,
 };
 
 struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
@@ -344,12 +359,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
 	return NULL;
 }
 
-static int sysfs_ilookup_test(struct inode *inode, void *arg)
-{
-	struct sysfs_dirent *sd = arg;
-	return inode->i_ino == sd->s_ino;
-}
-
 /**
  *	sysfs_addrm_start - prepare for sysfs_dirent add/remove
  *	@acxt: pointer to sysfs_addrm_cxt to be used
@@ -357,47 +366,20 @@ static int sysfs_ilookup_test(struct inode *inode, void *arg)
  *
  *	This function is called when the caller is about to add or
  *	remove sysfs_dirent under @parent_sd.  This function acquires
- *	sysfs_mutex, grabs inode for @parent_sd if available and lock
- *	i_mutex of it.  @acxt is used to keep and pass context to
+ *	sysfs_mutex.  @acxt is used to keep and pass context to
  *	other addrm functions.
  *
  *	LOCKING:
  *	Kernel thread context (may sleep).  sysfs_mutex is locked on
- *	return.  i_mutex of parent inode is locked on return if
- *	available.
+ *	return.
  */
 void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
 		       struct sysfs_dirent *parent_sd)
 {
-	struct inode *inode;
-
 	memset(acxt, 0, sizeof(*acxt));
 	acxt->parent_sd = parent_sd;
 
-	/* Lookup parent inode.  inode initialization is protected by
-	 * sysfs_mutex, so inode existence can be determined by
-	 * looking up inode while holding sysfs_mutex.
-	 */
 	mutex_lock(&sysfs_mutex);
-
-	inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test,
-			 parent_sd);
-	if (inode) {
-		WARN_ON(inode->i_state & I_NEW);
-
-		/* parent inode available */
-		acxt->parent_inode = inode;
-
-		/* sysfs_mutex is below i_mutex in lock hierarchy.
-		 * First, trylock i_mutex.  If fails, unlock
-		 * sysfs_mutex and lock them in order.
-		 */
-		if (!mutex_trylock(&inode->i_mutex)) {
-			mutex_unlock(&sysfs_mutex);
-			mutex_lock(&inode->i_mutex);
-			mutex_lock(&sysfs_mutex);
-		}
-	}
 }
 
 /**
@@ -422,18 +404,22 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
  */
 int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
+	struct sysfs_inode_attrs *ps_iattr;
+
 	if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
 		return -EEXIST;
 
 	sd->s_parent = sysfs_get(acxt->parent_sd);
 
-	if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode)
-		inc_nlink(acxt->parent_inode);
-
-	acxt->cnt++;
-
 	sysfs_link_sibling(sd);
 
+	/* Update timestamps on the parent */
+	ps_iattr = acxt->parent_sd->s_iattr;
+	if (ps_iattr) {
+		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
+		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
+	}
+
 	return 0;
 }
 
@@ -512,70 +498,22 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
  */
 void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
+	struct sysfs_inode_attrs *ps_iattr;
+
 	BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED);
 
 	sysfs_unlink_sibling(sd);
 
+	/* Update timestamps on the parent */
+	ps_iattr = acxt->parent_sd->s_iattr;
+	if (ps_iattr) {
+		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
+		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
+	}
+
 	sd->s_flags |= SYSFS_FLAG_REMOVED;
 	sd->s_sibling = acxt->removed;
 	acxt->removed = sd;
-
-	if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode)
-		drop_nlink(acxt->parent_inode);
-
-	acxt->cnt++;
-}
-
-/**
- *	sysfs_drop_dentry - drop dentry for the specified sysfs_dirent
- *	@sd: target sysfs_dirent
- *
- *	Drop dentry for @sd.  @sd must have been unlinked from its
- *	parent on entry to this function such that it can't be looked
- *	up anymore.
- */
-static void sysfs_drop_dentry(struct sysfs_dirent *sd)
-{
-	struct inode *inode;
-	struct dentry *dentry;
-
-	inode = ilookup(sysfs_sb, sd->s_ino);
-	if (!inode)
-		return;
-
-	/* Drop any existing dentries associated with sd.
-	 *
-	 * For the dentry to be properly freed we need to grab a
-	 * reference to the dentry under the dcache lock,  unhash it,
-	 * and then put it.  The playing with the dentry count allows
-	 * dput to immediately free the dentry  if it is not in use.
-	 */
-repeat:
-	spin_lock(&dcache_lock);
-	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
-		if (d_unhashed(dentry))
-			continue;
-		dget_locked(dentry);
-		spin_lock(&dentry->d_lock);
-		__d_drop(dentry);
-		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_lock);
-		dput(dentry);
-		goto repeat;
-	}
-	spin_unlock(&dcache_lock);
-
-	/* adjust nlink and update timestamp */
-	mutex_lock(&inode->i_mutex);
-
-	inode->i_ctime = CURRENT_TIME;
-	drop_nlink(inode);
-	if (sysfs_type(sd) == SYSFS_DIR)
-		drop_nlink(inode);
-
-	mutex_unlock(&inode->i_mutex);
-
-	iput(inode);
 }
 
 /**
@@ -584,25 +522,15 @@ repeat:
  *
  *	Finish up sysfs_dirent add/remove.  Resources acquired by
  *	sysfs_addrm_start() are released and removed sysfs_dirents are
- *	cleaned up.  Timestamps on the parent inode are updated.
+ *	cleaned up.
  *
  *	LOCKING:
- *	All mutexes acquired by sysfs_addrm_start() are released.
+ *	sysfs_mutex is released.
  */
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 {
 	/* release resources acquired by sysfs_addrm_start() */
 	mutex_unlock(&sysfs_mutex);
-	if (acxt->parent_inode) {
-		struct inode *inode = acxt->parent_inode;
-
-		/* if added/removed, update timestamps on the parent */
-		if (acxt->cnt)
-			inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-
-		mutex_unlock(&inode->i_mutex);
-		iput(inode);
-	}
 
 	/* kill removed sysfs_dirents */
 	while (acxt->removed) {
@@ -611,7 +539,6 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 		acxt->removed = sd->s_sibling;
 		sd->s_sibling = NULL;
 
-		sysfs_drop_dentry(sd);
 		sysfs_deactivate(sd);
 		unmap_bin_file(sd);
 		sysfs_put(sd);
@@ -751,10 +678,15 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 	/* instantiate and hash dentry */
-	dentry->d_op = &sysfs_dentry_ops;
-	dentry->d_fsdata = sysfs_get(sd);
-	d_instantiate(dentry, inode);
-	d_rehash(dentry);
+	ret = d_find_alias(inode);
+	if (!ret) {
+		dentry->d_op = &sysfs_dentry_ops;
+		dentry->d_fsdata = sysfs_get(sd);
+		d_add(dentry, inode);
+	} else {
+		d_move(ret, dentry);
+		iput(inode);
+	}
 
  out_unlock:
 	mutex_unlock(&sysfs_mutex);
@@ -763,7 +695,9 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 
 const struct inode_operations sysfs_dir_inode_operations = {
 	.lookup		= sysfs_lookup,
+	.permission	= sysfs_permission,
 	.setattr	= sysfs_setattr,
+	.getattr	= sysfs_getattr,
 	.setxattr	= sysfs_setxattr,
 };
 
@@ -826,141 +760,65 @@ void sysfs_remove_dir(struct kobject * kobj)
 	__sysfs_remove_dir(sd);
 }
 
-int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
+int sysfs_rename(struct sysfs_dirent *sd,
+	struct sysfs_dirent *new_parent_sd, const char *new_name)
 {
-	struct sysfs_dirent *sd = kobj->sd;
-	struct dentry *parent = NULL;
-	struct dentry *old_dentry = NULL, *new_dentry = NULL;
 	const char *dup_name = NULL;
 	int error;
 
-	mutex_lock(&sysfs_rename_mutex);
+	mutex_lock(&sysfs_mutex);
 
 	error = 0;
-	if (strcmp(sd->s_name, new_name) == 0)
+	if ((sd->s_parent == new_parent_sd) &&
+	    (strcmp(sd->s_name, new_name) == 0))
 		goto out;	/* nothing to rename */
 
-	/* get the original dentry */
-	old_dentry = sysfs_get_dentry(sd);
-	if (IS_ERR(old_dentry)) {
-		error = PTR_ERR(old_dentry);
-		old_dentry = NULL;
-		goto out;
-	}
-
-	parent = old_dentry->d_parent;
-
-	/* lock parent and get dentry for new name */
-	mutex_lock(&parent->d_inode->i_mutex);
-	mutex_lock(&sysfs_mutex);
-
 	error = -EEXIST;
-	if (sysfs_find_dirent(sd->s_parent, new_name))
-		goto out_unlock;
-
-	error = -ENOMEM;
-	new_dentry = d_alloc_name(parent, new_name);
-	if (!new_dentry)
-		goto out_unlock;
+	if (sysfs_find_dirent(new_parent_sd, new_name))
+		goto out;
 
 	/* rename sysfs_dirent */
-	error = -ENOMEM;
-	new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
-	if (!new_name)
-		goto out_unlock;
-
-	dup_name = sd->s_name;
-	sd->s_name = new_name;
+	if (strcmp(sd->s_name, new_name) != 0) {
+		error = -ENOMEM;
+		new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
+		if (!new_name)
+			goto out;
+
+		dup_name = sd->s_name;
+		sd->s_name = new_name;
+	}
 
-	/* rename */
-	d_add(new_dentry, NULL);
-	d_move(old_dentry, new_dentry);
+	/* Remove from old parent's list and insert into new parent's list. */
+	if (sd->s_parent != new_parent_sd) {
+		sysfs_unlink_sibling(sd);
+		sysfs_get(new_parent_sd);
+		sysfs_put(sd->s_parent);
+		sd->s_parent = new_parent_sd;
+		sysfs_link_sibling(sd);
+	}
 
 	error = 0;
- out_unlock:
+ out:
 	mutex_unlock(&sysfs_mutex);
-	mutex_unlock(&parent->d_inode->i_mutex);
 	kfree(dup_name);
-	dput(old_dentry);
-	dput(new_dentry);
- out:
-	mutex_unlock(&sysfs_rename_mutex);
 	return error;
 }
 
+int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
+{
+	return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name);
+}
+
 int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 {
 	struct sysfs_dirent *sd = kobj->sd;
 	struct sysfs_dirent *new_parent_sd;
-	struct dentry *old_parent, *new_parent = NULL;
-	struct dentry *old_dentry = NULL, *new_dentry = NULL;
-	int error;
 
-	mutex_lock(&sysfs_rename_mutex);
 	BUG_ON(!sd->s_parent);
-	new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ?
+	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
 		new_parent_kobj->sd : &sysfs_root;
 
-	error = 0;
-	if (sd->s_parent == new_parent_sd)
-		goto out;	/* nothing to move */
-
-	/* get dentries */
-	old_dentry = sysfs_get_dentry(sd);
-	if (IS_ERR(old_dentry)) {
-		error = PTR_ERR(old_dentry);
-		old_dentry = NULL;
-		goto out;
-	}
-	old_parent = old_dentry->d_parent;
-
-	new_parent = sysfs_get_dentry(new_parent_sd);
-	if (IS_ERR(new_parent)) {
-		error = PTR_ERR(new_parent);
-		new_parent = NULL;
-		goto out;
-	}
-
-again:
-	mutex_lock(&old_parent->d_inode->i_mutex);
-	if (!mutex_trylock(&new_parent->d_inode->i_mutex)) {
-		mutex_unlock(&old_parent->d_inode->i_mutex);
-		goto again;
-	}
-	mutex_lock(&sysfs_mutex);
-
-	error = -EEXIST;
-	if (sysfs_find_dirent(new_parent_sd, sd->s_name))
-		goto out_unlock;
-
-	error = -ENOMEM;
-	new_dentry = d_alloc_name(new_parent, sd->s_name);
-	if (!new_dentry)
-		goto out_unlock;
-
-	error = 0;
-	d_add(new_dentry, NULL);
-	d_move(old_dentry, new_dentry);
-
-	/* Remove from old parent's list and insert into new parent's list. */
-	sysfs_unlink_sibling(sd);
-	sysfs_get(new_parent_sd);
-	drop_nlink(old_parent->d_inode);
-	sysfs_put(sd->s_parent);
-	sd->s_parent = new_parent_sd;
-	inc_nlink(new_parent->d_inode);
-	sysfs_link_sibling(sd);
-
- out_unlock:
-	mutex_unlock(&sysfs_mutex);
-	mutex_unlock(&new_parent->d_inode->i_mutex);
-	mutex_unlock(&old_parent->d_inode->i_mutex);
- out:
-	dput(new_parent);
-	dput(old_dentry);
-	dput(new_dentry);
-	mutex_unlock(&sysfs_rename_mutex);
-	return error;
+	return sysfs_rename(sd, new_parent_sd, sd->s_name);
 }
 
 /* Relationship between s_mode and the DT_xxx types */
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index f5ea4680f15..dc30d9e3168 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -579,46 +579,23 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
  */
 int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
 {
-	struct sysfs_dirent *victim_sd = NULL;
-	struct dentry *victim = NULL;
-	struct inode * inode;
+	struct sysfs_dirent *sd;
 	struct iattr newattrs;
 	int rc;
 
-	rc = -ENOENT;
-	victim_sd = sysfs_get_dirent(kobj->sd, attr->name);
-	if (!victim_sd)
-		goto out;
+	mutex_lock(&sysfs_mutex);
 
-	mutex_lock(&sysfs_rename_mutex);
-	victim = sysfs_get_dentry(victim_sd);
-	mutex_unlock(&sysfs_rename_mutex);
-	if (IS_ERR(victim)) {
-		rc = PTR_ERR(victim);
-		victim = NULL;
+	rc = -ENOENT;
+	sd = sysfs_find_dirent(kobj->sd, attr->name);
+	if (!sd)
 		goto out;
-	}
-
-	inode = victim->d_inode;
-
-	mutex_lock(&inode->i_mutex);
 
-	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
-	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-	newattrs.ia_ctime = current_fs_time(inode->i_sb);
-	rc = sysfs_setattr(victim, &newattrs);
+	newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO);
+	newattrs.ia_valid = ATTR_MODE;
+	rc = sysfs_sd_setattr(sd, &newattrs);
 
-	if (rc == 0) {
-		fsnotify_change(victim, newattrs.ia_valid);
-		mutex_lock(&sysfs_mutex);
-		victim_sd->s_mode = newattrs.ia_mode;
-		mutex_unlock(&sysfs_mutex);
-	}
-
-	mutex_unlock(&inode->i_mutex);
  out:
-	dput(victim);
-	sysfs_put(victim_sd);
+	mutex_unlock(&sysfs_mutex);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(sysfs_chmod_file);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e28cecf179f..220b758523a 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -37,7 +37,9 @@ static struct backing_dev_info sysfs_backing_dev_info = {
 };
 
 static const struct inode_operations sysfs_inode_operations ={
+	.permission	= sysfs_permission,
 	.setattr	= sysfs_setattr,
+	.getattr	= sysfs_getattr,
 	.setxattr	= sysfs_setxattr,
 };
 
@@ -46,7 +48,7 @@ int __init sysfs_inode_init(void)
 	return bdi_init(&sysfs_backing_dev_info);
 }
 
-struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
+static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *attrs;
 	struct iattr *iattrs;
@@ -64,30 +66,15 @@ struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
 
 	return attrs;
 }
-int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
+
+int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr)
 {
-	struct inode * inode = dentry->d_inode;
-	struct sysfs_dirent * sd = dentry->d_fsdata;
 	struct sysfs_inode_attrs *sd_attrs;
 	struct iattr *iattrs;
 	unsigned int ia_valid = iattr->ia_valid;
-	int error;
-
-	if (!sd)
-		return -EINVAL;
 
 	sd_attrs = sd->s_iattr;
 
-	error = inode_change_ok(inode, iattr);
-	if (error)
-		return error;
-
-	iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
-
-	error = inode_setattr(inode, iattr);
-	if (error)
-		return error;
-
 	if (!sd_attrs) {
 		/* setting attributes for the first time, allocate now */
 		sd_attrs = sysfs_init_inode_attrs(sd);
@@ -103,42 +90,78 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 		if (ia_valid & ATTR_GID)
 			iattrs->ia_gid = iattr->ia_gid;
 		if (ia_valid & ATTR_ATIME)
-			iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
-					inode->i_sb->s_time_gran);
+			iattrs->ia_atime = iattr->ia_atime;
 		if (ia_valid & ATTR_MTIME)
-			iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
-					inode->i_sb->s_time_gran);
+			iattrs->ia_mtime = iattr->ia_mtime;
 		if (ia_valid & ATTR_CTIME)
-			iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
-					inode->i_sb->s_time_gran);
+			iattrs->ia_ctime = iattr->ia_ctime;
 		if (ia_valid & ATTR_MODE) {
 			umode_t mode = iattr->ia_mode;
-
-			if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
-				mode &= ~S_ISGID;
 			iattrs->ia_mode = sd->s_mode = mode;
 		}
 	}
+	return 0;
+}
+
+int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct sysfs_dirent *sd = dentry->d_fsdata;
+	int error;
+
+	if (!sd)
+		return -EINVAL;
+
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		return error;
+
+	iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
+
+	error = inode_setattr(inode, iattr);
+	if (error)
+		return error;
+
+	mutex_lock(&sysfs_mutex);
+	error = sysfs_sd_setattr(sd, iattr);
+	mutex_unlock(&sysfs_mutex);
+
 	return error;
 }
 
+static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len)
+{
+	struct sysfs_inode_attrs *iattrs;
+	void *old_secdata;
+	size_t old_secdata_len;
+
+	iattrs = sd->s_iattr;
+	if (!iattrs)
+		iattrs = sysfs_init_inode_attrs(sd);
+	if (!iattrs)
+		return -ENOMEM;
+
+	old_secdata = iattrs->ia_secdata;
+	old_secdata_len = iattrs->ia_secdata_len;
+
+	iattrs->ia_secdata = *secdata;
+	iattrs->ia_secdata_len = *secdata_len;
+
+	*secdata = old_secdata;
+	*secdata_len = old_secdata_len;
+	return 0;
+}
+
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		size_t size, int flags)
 {
 	struct sysfs_dirent *sd = dentry->d_fsdata;
-	struct sysfs_inode_attrs *iattrs;
 	void *secdata;
 	int error;
 	u32 secdata_len = 0;
 
 	if (!sd)
 		return -EINVAL;
-	if (!sd->s_iattr)
-		sd->s_iattr = sysfs_init_inode_attrs(sd);
-	if (!sd->s_iattr)
-		return -ENOMEM;
-
-	iattrs = sd->s_iattr;
 
 	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
 		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
@@ -150,12 +173,13 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 						&secdata, &secdata_len);
 		if (error)
 			goto out;
-		if (iattrs->ia_secdata)
-			security_release_secctx(iattrs->ia_secdata,
-						iattrs->ia_secdata_len);
-		iattrs->ia_secdata = secdata;
-		iattrs->ia_secdata_len = secdata_len;
 
+		mutex_lock(&sysfs_mutex);
+		error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len);
+		mutex_unlock(&sysfs_mutex);
+
+		if (secdata)
+			security_release_secctx(secdata, secdata_len);
 	} else
 		return -EINVAL;
 out:
@@ -170,7 +194,6 @@ static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
 
 static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
 {
-	inode->i_mode = iattr->ia_mode;
 	inode->i_uid = iattr->ia_uid;
 	inode->i_gid = iattr->ia_gid;
 	inode->i_atime = iattr->ia_atime;
@@ -178,17 +201,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
 	inode->i_ctime = iattr->ia_ctime;
 }
 
-
-/*
- * sysfs has a different i_mutex lock order behavior for i_mutex than other
- * filesystems; sysfs i_mutex is called in many places with subsystem locks
- * held. At the same time, many of the VFS locking rules do not apply to
- * sysfs at all (cross directory rename for example). To untangle this mess
- * (which gives false positives in lockdep), we're giving sysfs inodes their
- * own class for i_mutex.
- */
-static struct lock_class_key sysfs_inode_imutex_key;
-
 static int sysfs_count_nlink(struct sysfs_dirent *sd)
 {
 	struct sysfs_dirent *child;
@@ -201,38 +213,55 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
 	return nr + 2;
 }
 
+static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
+{
+	struct sysfs_inode_attrs *iattrs = sd->s_iattr;
+
+	inode->i_mode = sd->s_mode;
+	if (iattrs) {
+		/* sysfs_dirent has non-default attributes
+		 * get them from persistent copy in sysfs_dirent
+		 */
+		set_inode_attr(inode, &iattrs->ia_iattr);
+		security_inode_notifysecctx(inode,
+					    iattrs->ia_secdata,
+					    iattrs->ia_secdata_len);
+	}
+
+	if (sysfs_type(sd) == SYSFS_DIR)
+		inode->i_nlink = sysfs_count_nlink(sd);
+}
+
+int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct sysfs_dirent *sd = dentry->d_fsdata;
+	struct inode *inode = dentry->d_inode;
+
+	mutex_lock(&sysfs_mutex);
+	sysfs_refresh_inode(sd, inode);
+	mutex_unlock(&sysfs_mutex);
+
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
 static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 {
 	struct bin_attribute *bin_attr;
-	struct sysfs_inode_attrs *iattrs;
 
 	inode->i_private = sysfs_get(sd);
 	inode->i_mapping->a_ops = &sysfs_aops;
 	inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
 	inode->i_op = &sysfs_inode_operations;
-	inode->i_ino = sd->s_ino;
-	lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
 
-	iattrs = sd->s_iattr;
-	if (iattrs) {
-		/* sysfs_dirent has non-default attributes
-		 * get them for the new inode from persistent copy
-		 * in sysfs_dirent
-		 */
-		set_inode_attr(inode, &iattrs->ia_iattr);
-		if (iattrs->ia_secdata)
-			security_inode_notifysecctx(inode,
-						iattrs->ia_secdata,
-						iattrs->ia_secdata_len);
-	} else
-		set_default_inode_attr(inode, sd->s_mode);
+	set_default_inode_attr(inode, sd->s_mode);
+	sysfs_refresh_inode(sd, inode);
 
 	/* initialize inode according to type */
 	switch (sysfs_type(sd)) {
 	case SYSFS_DIR:
 		inode->i_op = &sysfs_dir_inode_operations;
 		inode->i_fop = &sysfs_dir_operations;
-		inode->i_nlink = sysfs_count_nlink(sd);
 		break;
 	case SYSFS_KOBJ_ATTR:
 		inode->i_size = PAGE_SIZE;
@@ -315,3 +344,14 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
 	else
 		return -ENOENT;
 }
+
+int sysfs_permission(struct inode *inode, int mask)
+{
+	struct sysfs_dirent *sd = inode->i_private;
+
+	mutex_lock(&sysfs_mutex);
+	sysfs_refresh_inode(sd, inode);
+	mutex_unlock(&sysfs_mutex);
+
+	return generic_permission(inode, mask, NULL);
+}
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c5081ad7702..c5eff49fa41 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -210,10 +210,13 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co
 }
 
 const struct inode_operations sysfs_symlink_inode_operations = {
-	.setxattr = sysfs_setxattr,
-	.readlink = generic_readlink,
-	.follow_link = sysfs_follow_link,
-	.put_link = sysfs_put_link,
+	.setxattr	= sysfs_setxattr,
+	.readlink	= generic_readlink,
+	.follow_link	= sysfs_follow_link,
+	.put_link	= sysfs_put_link,
+	.setattr	= sysfs_setattr,
+	.getattr	= sysfs_getattr,
+	.permission	= sysfs_permission,
 };
 
 
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index af4c4e7482a..ca52e7b9d8f 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -89,9 +89,7 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
  */
 struct sysfs_addrm_cxt {
 	struct sysfs_dirent	*parent_sd;
-	struct inode		*parent_inode;
 	struct sysfs_dirent	*removed;
-	int			cnt;
 };
 
 /*
@@ -105,7 +103,6 @@ extern struct kmem_cache *sysfs_dir_cachep;
  * dir.c
  */
 extern struct mutex sysfs_mutex;
-extern struct mutex sysfs_rename_mutex;
 extern spinlock_t sysfs_assoc_lock;
 
 extern const struct file_operations sysfs_dir_operations;
@@ -133,6 +130,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd);
 void sysfs_remove_subdir(struct sysfs_dirent *sd);
 
+int sysfs_rename(struct sysfs_dirent *sd,
+	struct sysfs_dirent *new_parent_sd, const char *new_name);
+
 static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
 {
 	if (sd) {
@@ -155,7 +155,10 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
  */
 struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
 void sysfs_delete_inode(struct inode *inode);
+int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
+int sysfs_permission(struct inode *inode, int mask);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
+int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		size_t size, int flags);
 int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index dbc093afd94..90492327b38 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -350,13 +350,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 		       le32_to_cpu(sup->fmt_version));
 		printk(KERN_DEBUG "\ttime_gran      %u\n",
 		       le32_to_cpu(sup->time_gran));
-		printk(KERN_DEBUG "\tUUID           %02X%02X%02X%02X-%02X%02X"
-		       "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
-		       sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3],
-		       sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7],
-		       sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11],
-		       sup->uuid[12], sup->uuid[13], sup->uuid[14],
-		       sup->uuid[15]);
+		printk(KERN_DEBUG "\tUUID           %pUB\n",
+		       sup->uuid);
 		break;
 	}
 	case UBIFS_MST_NODE:
@@ -2014,7 +2009,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 		inum = key_inum_flash(c, &dent->key);
 		fscki1 = read_add_inode(c, priv, inum);
 		if (IS_ERR(fscki1)) {
-			err = PTR_ERR(fscki);
+			err = PTR_ERR(fscki1);
 			ubifs_err("error %d while processing entry node and "
 				  "trying to find parent inode node %lu",
 				  err, (unsigned long)inum);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 1009adc8d60..39849f887e7 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1389,7 +1389,6 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			       unsigned long nr_segs, loff_t pos)
 {
 	int err;
-	ssize_t ret;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 
@@ -1397,17 +1396,7 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	if (err)
 		return err;
 
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-	if (ret < 0)
-		return ret;
-
-	if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) {
-		err = ubifs_sync_wbufs_by_inode(c, inode);
-		if (err)
-			return err;
-	}
-
-	return ret;
+	return generic_file_aio_write(iocb, iov, nr_segs, pos);
 }
 
 static int ubifs_set_page_dirty(struct page *page)
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index f94ddf7efba..868a55ee080 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -23,7 +23,7 @@
 /*
  * This file implements functions needed to recover from unclean un-mounts.
  * When UBIFS is mounted, it checks a flag on the master node to determine if
- * an un-mount was completed sucessfully. If not, the process of mounting
+ * an un-mount was completed successfully. If not, the process of mounting
  * incorparates additional checking and fixing of on-flash data structures.
  * UBIFS always cleans away all remnants of an unclean un-mount, so that
  * errors do not accumulate. However UBIFS defers recovery if it is mounted
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 333e181ee98..43f9d19a6f3 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1393,12 +1393,7 @@ static int mount_ubifs(struct ubifs_info *c)
 		c->leb_size, c->leb_size >> 10);
 	dbg_msg("data journal heads:  %d",
 		c->jhead_cnt - NONDATA_JHEADS_CNT);
-	dbg_msg("UUID:                %02X%02X%02X%02X-%02X%02X"
-	       "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X",
-	       c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3],
-	       c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
-	       c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
-	       c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
+	dbg_msg("UUID:                %pUB", c->uuid);
 	dbg_msg("big_lpt              %d", c->big_lpt);
 	dbg_msg("log LEBs:            %d (%d - %d)",
 		c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
@@ -1842,22 +1837,32 @@ const struct super_operations ubifs_super_operations = {
  * @name: UBI volume name
  * @mode: UBI volume open mode
  *
- * There are several ways to specify UBI volumes when mounting UBIFS:
- * o ubiX_Y    - UBI device number X, volume Y;
- * o ubiY      - UBI device number 0, volume Y;
+ * The primary method of mounting UBIFS is by specifying the UBI volume
+ * character device node path. However, UBIFS may also be mounted withoug any
+ * character device node using one of the following methods:
+ *
+ * o ubiX_Y    - mount UBI device number X, volume Y;
+ * o ubiY      - mount UBI device number 0, volume Y;
  * o ubiX:NAME - mount UBI device X, volume with name NAME;
  * o ubi:NAME  - mount UBI device 0, volume with name NAME.
  *
  * Alternative '!' separator may be used instead of ':' (because some shells
  * like busybox may interpret ':' as an NFS host name separator). This function
- * returns ubi volume object in case of success and a negative error code in
- * case of failure.
+ * returns UBI volume description object in case of success and a negative
+ * error code in case of failure.
  */
 static struct ubi_volume_desc *open_ubi(const char *name, int mode)
 {
+	struct ubi_volume_desc *ubi;
 	int dev, vol;
 	char *endptr;
 
+	/* First, try to open using the device node path method */
+	ubi = ubi_open_volume_path(name, mode);
+	if (!IS_ERR(ubi))
+		return ubi;
+
+	/* Try the "nodev" method */
 	if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
 		return ERR_PTR(-EINVAL);
 
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 1e068535b58..82372e332f0 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -440,7 +440,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 	    (bloc->logicalBlockNum + count) >
 		partmap->s_partition_len) {
 		udf_debug("%d < %d || %d + %d > %d\n",
-			  bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count,
+			  bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
 			  partmap->s_partition_len);
 		goto error_return;
 	}
diff --git a/fs/udf/file.c b/fs/udf/file.c
index b80cbd78833..f311d509b6a 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -196,6 +196,7 @@ static int udf_release_file(struct inode *inode, struct file *filp)
 		mutex_lock(&inode->i_mutex);
 		lock_kernel();
 		udf_discard_prealloc(inode);
+		udf_truncate_tail_extent(inode);
 		unlock_kernel();
 		mutex_unlock(&inode->i_mutex);
 	}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 6d24c2c63f9..f90231eb291 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -97,15 +97,17 @@ no_delete:
  */
 void udf_clear_inode(struct inode *inode)
 {
-	struct udf_inode_info *iinfo;
-	if (!(inode->i_sb->s_flags & MS_RDONLY)) {
-		lock_kernel();
-		udf_truncate_tail_extent(inode);
-		unlock_kernel();
-		write_inode_now(inode, 0);
-		invalidate_inode_buffers(inode);
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
+	    inode->i_size != iinfo->i_lenExtents) {
+		printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
+			"inode size %llu different from extent lenght %llu. "
+			"Filesystem need not be standards compliant.\n",
+			inode->i_sb->s_id, inode->i_ino, inode->i_mode,
+			(unsigned long long)inode->i_size,
+			(unsigned long long)iinfo->i_lenExtents);
 	}
-	iinfo = UDF_I(inode);
 	kfree(iinfo->i_ext.i_data);
 	iinfo->i_ext.i_data = NULL;
 }
@@ -198,7 +200,6 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 	int newblock;
 	struct buffer_head *dbh = NULL;
 	struct kernel_lb_addr eloc;
-	uint32_t elen;
 	uint8_t alloctype;
 	struct extent_position epos;
 
@@ -273,12 +274,11 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 	eloc.logicalBlockNum = *block;
 	eloc.partitionReferenceNum =
 				iinfo->i_location.partitionReferenceNum;
-	elen = inode->i_sb->s_blocksize;
-	iinfo->i_lenExtents = elen;
+	iinfo->i_lenExtents = inode->i_size;
 	epos.bh = NULL;
 	epos.block = iinfo->i_location;
 	epos.offset = udf_file_entry_alloc_offset(inode);
-	udf_add_aext(inode, &epos, &eloc, elen, 0);
+	udf_add_aext(inode, &epos, &eloc, inode->i_size, 0);
 	/* UniqueID stuff */
 
 	brelse(epos.bh);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 21dad8c608f..cd2115060fd 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -408,15 +408,6 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 	}
 
 add:
-	/* Is there any extent whose size we need to round up? */
-	if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) {
-		elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
-		if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-			epos.offset -= sizeof(struct short_ad);
-		else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-			epos.offset -= sizeof(struct long_ad);
-		udf_write_aext(dir, &epos, &eloc, elen, 1);
-	}
 	f_pos += nfidlen;
 
 	if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB &&
@@ -439,6 +430,7 @@ add:
 		udf_current_aext(dir, &epos, &eloc, &elen, 1);
 	}
 
+	/* Entry fits into current block? */
 	if (sb->s_blocksize - fibh->eoffset >= nfidlen) {
 		fibh->soffset = fibh->eoffset;
 		fibh->eoffset += nfidlen;
@@ -462,6 +454,16 @@ add:
 				(fibh->sbh->b_data + fibh->soffset);
 		}
 	} else {
+		/* Round up last extent in the file */
+		elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
+		if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+			epos.offset -= sizeof(struct short_ad);
+		else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+			epos.offset -= sizeof(struct long_ad);
+		udf_write_aext(dir, &epos, &eloc, elen, 1);
+		dinfo->i_lenExtents = (dinfo->i_lenExtents + sb->s_blocksize
+					- 1) & ~(sb->s_blocksize - 1);
+
 		fibh->soffset = fibh->eoffset - sb->s_blocksize;
 		fibh->eoffset += nfidlen - sb->s_blocksize;
 		if (fibh->sbh != fibh->ebh) {
@@ -508,6 +510,20 @@ add:
 		dir->i_size += nfidlen;
 		if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
 			dinfo->i_lenAlloc += nfidlen;
+		else {
+			/* Find the last extent and truncate it to proper size */
+			while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
+				(EXT_RECORDED_ALLOCATED >> 30))
+				;
+			elen -= dinfo->i_lenExtents - dir->i_size;
+			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+				epos.offset -= sizeof(struct short_ad);
+			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+				epos.offset -= sizeof(struct long_ad);
+			udf_write_aext(dir, &epos, &eloc, elen, 1);
+			dinfo->i_lenExtents = dir->i_size;
+		}
+
 		mark_inode_dirty(dir);
 		goto out_ok;
 	} else {
@@ -922,7 +938,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 		block = udf_get_pblock(inode->i_sb, block,
 				iinfo->i_location.partitionReferenceNum,
 				0);
-		epos.bh = udf_tread(inode->i_sb, block);
+		epos.bh = udf_tgetblk(inode->i_sb, block);
 		lock_buffer(epos.bh);
 		memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize);
 		set_buffer_uptodate(epos.bh);
@@ -999,6 +1015,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_size = elen;
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
 		iinfo->i_lenAlloc = inode->i_size;
+	else
+		udf_truncate_tail_extent(inode);
 	mark_inode_dirty(inode);
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 9d1b8c2e6c4..1e4543cbcd2 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1078,21 +1078,39 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 	return 0;
 }
 
-static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
+static void udf_find_vat_block(struct super_block *sb, int p_index,
+			       int type1_index, sector_t start_block)
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct udf_part_map *map = &sbi->s_partmaps[p_index];
+	sector_t vat_block;
 	struct kernel_lb_addr ino;
+
+	/*
+	 * VAT file entry is in the last recorded block. Some broken disks have
+	 * it a few blocks before so try a bit harder...
+	 */
+	ino.partitionReferenceNum = type1_index;
+	for (vat_block = start_block;
+	     vat_block >= map->s_partition_root &&
+	     vat_block >= start_block - 3 &&
+	     !sbi->s_vat_inode; vat_block--) {
+		ino.logicalBlockNum = vat_block - map->s_partition_root;
+		sbi->s_vat_inode = udf_iget(sb, &ino);
+	}
+}
+
+static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *map = &sbi->s_partmaps[p_index];
 	struct buffer_head *bh = NULL;
 	struct udf_inode_info *vati;
 	uint32_t pos;
 	struct virtualAllocationTable20 *vat20;
 	sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
 
-	/* VAT file entry is in the last recorded block */
-	ino.partitionReferenceNum = type1_index;
-	ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
-	sbi->s_vat_inode = udf_iget(sb, &ino);
+	udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
 	if (!sbi->s_vat_inode &&
 	    sbi->s_last_block != blocks - 1) {
 		printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the"
@@ -1100,9 +1118,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
 		       "block of the device (%lu).\n",
 		       (unsigned long)sbi->s_last_block,
 		       (unsigned long)blocks - 1);
-		ino.partitionReferenceNum = type1_index;
-		ino.logicalBlockNum = blocks - 1 - map->s_partition_root;
-		sbi->s_vat_inode = udf_iget(sb, &ino);
+		udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
 	}
 	if (!sbi->s_vat_inode)
 		return 1;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 6f671f1ac27..22af68f8b68 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -70,13 +70,13 @@ static inline unsigned long ufs_dir_pages(struct inode *inode)
 	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
 }
 
-ino_t ufs_inode_by_name(struct inode *dir, struct dentry *dentry)
+ino_t ufs_inode_by_name(struct inode *dir, struct qstr *qstr)
 {
 	ino_t res = 0;
 	struct ufs_dir_entry *de;
 	struct page *page;
 	
-	de = ufs_find_entry(dir, dentry, &page);
+	de = ufs_find_entry(dir, qstr, &page);
 	if (de) {
 		res = fs32_to_cpu(dir->i_sb, de->d_ino);
 		ufs_put_page(page);
@@ -249,12 +249,12 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
  * (as a parameter - res_dir). Page is returned mapped and unlocked.
  * Entry is guaranteed to be valid.
  */
-struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
+struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct qstr *qstr,
 				     struct page **res_page)
 {
 	struct super_block *sb = dir->i_sb;
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	const char *name = qstr->name;
+	int namelen = qstr->len;
 	unsigned reclen = UFS_DIR_REC_LEN(namelen);
 	unsigned long start, n;
 	unsigned long npages = ufs_dir_pages(dir);
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 23119fe7ad6..4c26d9e8bc9 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -56,7 +56,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
 		return ERR_PTR(-ENAMETOOLONG);
 
 	lock_kernel();
-	ino = ufs_inode_by_name(dir, dentry);
+	ino = ufs_inode_by_name(dir, &dentry->d_name);
 	if (ino) {
 		inode = ufs_iget(dir->i_sb, ino);
 		if (IS_ERR(inode)) {
@@ -237,7 +237,7 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
 	struct page *page;
 	int err = -ENOENT;
 
-	de = ufs_find_entry(dir, dentry, &page);
+	de = ufs_find_entry(dir, &dentry->d_name, &page);
 	if (!de)
 		goto out;
 
@@ -281,7 +281,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct ufs_dir_entry *old_de;
 	int err = -ENOENT;
 
-	old_de = ufs_find_entry(old_dir, old_dentry, &old_page);
+	old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
 	if (!old_de)
 		goto out;
 
@@ -301,7 +301,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out_dir;
 
 		err = -ENOENT;
-		new_de = ufs_find_entry(new_dir, new_dentry, &new_page);
+		new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
 		inode_inc_link_count(old_inode);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 5faed7954d0..143c20bfb04 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -66,6 +66,7 @@
  */
 
 
+#include <linux/exportfs.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
 
@@ -96,6 +97,56 @@
 #include "swab.h"
 #include "util.h"
 
+static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
+{
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct inode *inode;
+
+	if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg)
+		return ERR_PTR(-ESTALE);
+
+	inode = ufs_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (generation && inode->i_generation != generation) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return inode;
+}
+
+static struct dentry *ufs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+				       int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ufs_nfs_get_inode);
+}
+
+static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid,
+				       int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type, ufs_nfs_get_inode);
+}
+
+static struct dentry *ufs_get_parent(struct dentry *child)
+{
+	struct qstr dot_dot = {
+		.name	= "..",
+		.len	= 2,
+	};
+	ino_t ino;
+
+	ino = ufs_inode_by_name(child->d_inode, &dot_dot);
+	if (!ino)
+		return ERR_PTR(-ENOENT);
+	return d_obtain_alias(ufs_iget(child->d_inode->i_sb, ino));
+}
+
+static const struct export_operations ufs_export_ops = {
+	.fh_to_dentry	= ufs_fh_to_dentry,
+	.fh_to_parent	= ufs_fh_to_parent,
+	.get_parent	= ufs_get_parent,
+};
+
 #ifdef CONFIG_UFS_DEBUG
 /*
  * Print contents of ufs_super_block, useful for debugging
@@ -990,6 +1041,7 @@ magic_found:
 	 * Read ufs_super_block into internal data structures
 	 */
 	sb->s_op = &ufs_super_ops;
+	sb->s_export_op = &ufs_export_ops;
 	sb->dq_op = NULL; /***/
 	sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
 
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 644e77e1359..0b4c39bc0d9 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -86,9 +86,9 @@ extern void ufs_put_cylinder (struct super_block *, unsigned);
 /* dir.c */
 extern const struct inode_operations ufs_dir_inode_operations;
 extern int ufs_add_link (struct dentry *, struct inode *);
-extern ino_t ufs_inode_by_name(struct inode *, struct dentry *);
+extern ino_t ufs_inode_by_name(struct inode *, struct qstr *);
 extern int ufs_make_empty(struct inode *, struct inode *);
-extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct dentry *, struct page **);
+extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct qstr *, struct page **);
 extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *);
 extern int ufs_empty_dir (struct inode *);
 extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index c6ad7c7e3ee..05ac0fe9c4d 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -36,7 +36,7 @@ posix_acl_from_xattr(const void *value, size_t size)
 	if (count == 0)
 		return NULL;
 	
-	acl = posix_acl_alloc(count, GFP_KERNEL);
+	acl = posix_acl_alloc(count, GFP_NOFS);
 	if (!acl)
 		return ERR_PTR(-ENOMEM);
 	acl_e = acl->a_entries;
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7a59daed178..56641fe52a2 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -26,6 +26,8 @@ endif
 
 obj-$(CONFIG_XFS_FS)		+= xfs.o
 
+xfs-y				+= linux-2.6/xfs_trace.o
+
 xfs-$(CONFIG_XFS_QUOTA)		+= $(addprefix quota/, \
 				   xfs_dquot.o \
 				   xfs_dquot_item.o \
@@ -90,8 +92,7 @@ xfs-y				+= xfs_alloc.o \
 				   xfs_rw.o \
 				   xfs_dmops.o
 
-xfs-$(CONFIG_XFS_TRACE)		+= xfs_btree_trace.o \
-				   xfs_dir2_trace.o
+xfs-$(CONFIG_XFS_TRACE)		+= xfs_btree_trace.o
 
 # Objects in linux/
 xfs-y				+= $(addprefix $(XFS_LINUX)/, \
@@ -113,6 +114,3 @@ xfs-y				+= $(addprefix $(XFS_LINUX)/, \
 xfs-y				+= $(addprefix support/, \
 				   debug.o \
 				   uuid.o)
-
-xfs-$(CONFIG_XFS_TRACE)		+= support/ktrace.o
-
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b23a5450644..69e598b6986 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -21,6 +21,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
 
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c2e30eea74d..66abe36c121 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,6 +38,7 @@
 #include "xfs_rw.h"
 #include "xfs_iomap.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
@@ -76,7 +77,7 @@ xfs_ioend_wake(
 		wake_up(to_ioend_wq(ip));
 }
 
-STATIC void
+void
 xfs_count_page_state(
 	struct page		*page,
 	int			*delalloc,
@@ -98,48 +99,6 @@ xfs_count_page_state(
 	} while ((bh = bh->b_this_page) != head);
 }
 
-#if defined(XFS_RW_TRACE)
-void
-xfs_page_trace(
-	int		tag,
-	struct inode	*inode,
-	struct page	*page,
-	unsigned long	pgoff)
-{
-	xfs_inode_t	*ip;
-	loff_t		isize = i_size_read(inode);
-	loff_t		offset = page_offset(page);
-	int		delalloc = -1, unmapped = -1, unwritten = -1;
-
-	if (page_has_buffers(page))
-		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
-
-	ip = XFS_I(inode);
-	if (!ip->i_rwtrace)
-		return;
-
-	ktrace_enter(ip->i_rwtrace,
-		(void *)((unsigned long)tag),
-		(void *)ip,
-		(void *)inode,
-		(void *)page,
-		(void *)pgoff,
-		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
-		(void *)((unsigned long)((isize >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(isize & 0xffffffff)),
-		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(offset & 0xffffffff)),
-		(void *)((unsigned long)delalloc),
-		(void *)((unsigned long)unmapped),
-		(void *)((unsigned long)unwritten),
-		(void *)((unsigned long)current_pid()),
-		(void *)NULL);
-}
-#else
-#define xfs_page_trace(tag, inode, page, pgoff)
-#endif
-
 STATIC struct block_device *
 xfs_find_bdev_for_inode(
 	struct xfs_inode	*ip)
@@ -235,71 +194,36 @@ xfs_setfilesize(
 }
 
 /*
- * Buffered IO write completion for delayed allocate extents.
- */
-STATIC void
-xfs_end_bio_delalloc(
-	struct work_struct	*work)
-{
-	xfs_ioend_t		*ioend =
-		container_of(work, xfs_ioend_t, io_work);
-
-	xfs_setfilesize(ioend);
-	xfs_destroy_ioend(ioend);
-}
-
-/*
- * Buffered IO write completion for regular, written extents.
- */
-STATIC void
-xfs_end_bio_written(
-	struct work_struct	*work)
-{
-	xfs_ioend_t		*ioend =
-		container_of(work, xfs_ioend_t, io_work);
-
-	xfs_setfilesize(ioend);
-	xfs_destroy_ioend(ioend);
-}
-
-/*
- * IO write completion for unwritten extents.
- *
- * Issue transactions to convert a buffer range from unwritten
- * to written extents.
+ * IO write completion.
  */
 STATIC void
-xfs_end_bio_unwritten(
+xfs_end_io(
 	struct work_struct	*work)
 {
 	xfs_ioend_t		*ioend =
 		container_of(work, xfs_ioend_t, io_work);
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
-	xfs_off_t		offset = ioend->io_offset;
-	size_t			size = ioend->io_size;
-
-	if (likely(!ioend->io_error)) {
-		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-			int error;
-			error = xfs_iomap_write_unwritten(ip, offset, size);
-			if (error)
-				ioend->io_error = error;
-		}
-		xfs_setfilesize(ioend);
-	}
-	xfs_destroy_ioend(ioend);
-}
 
-/*
- * IO read completion for regular, written extents.
- */
-STATIC void
-xfs_end_bio_read(
-	struct work_struct	*work)
-{
-	xfs_ioend_t		*ioend =
-		container_of(work, xfs_ioend_t, io_work);
+	/*
+	 * For unwritten extents we need to issue transactions to convert a
+	 * range to normal written extens after the data I/O has finished.
+	 */
+	if (ioend->io_type == IOMAP_UNWRITTEN &&
+	    likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
+		int error;
+
+		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+						 ioend->io_size);
+		if (error)
+			ioend->io_error = error;
+	}
 
+	/*
+	 * We might have to update the on-disk file size after extending
+	 * writes.
+	 */
+	if (ioend->io_type != IOMAP_READ)
+		xfs_setfilesize(ioend);
 	xfs_destroy_ioend(ioend);
 }
 
@@ -314,10 +238,10 @@ xfs_finish_ioend(
 	int		wait)
 {
 	if (atomic_dec_and_test(&ioend->io_remaining)) {
-		struct workqueue_struct *wq = xfsdatad_workqueue;
-		if (ioend->io_work.func == xfs_end_bio_unwritten)
-			wq = xfsconvertd_workqueue;
+		struct workqueue_struct *wq;
 
+		wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
+			xfsconvertd_workqueue : xfsdatad_workqueue;
 		queue_work(wq, &ioend->io_work);
 		if (wait)
 			flush_workqueue(wq);
@@ -355,15 +279,7 @@ xfs_alloc_ioend(
 	ioend->io_offset = 0;
 	ioend->io_size = 0;
 
-	if (type == IOMAP_UNWRITTEN)
-		INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
-	else if (type == IOMAP_DELAY)
-		INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
-	else if (type == IOMAP_READ)
-		INIT_WORK(&ioend->io_work, xfs_end_bio_read);
-	else
-		INIT_WORK(&ioend->io_work, xfs_end_bio_written);
-
+	INIT_WORK(&ioend->io_work, xfs_end_io);
 	return ioend;
 }
 
@@ -380,7 +296,7 @@ xfs_map_blocks(
 	return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
 }
 
-STATIC_INLINE int
+STATIC int
 xfs_iomap_valid(
 	xfs_iomap_t		*iomapp,
 	loff_t			offset)
@@ -412,8 +328,9 @@ xfs_end_bio(
 
 STATIC void
 xfs_submit_ioend_bio(
-	xfs_ioend_t	*ioend,
-	struct bio	*bio)
+	struct writeback_control *wbc,
+	xfs_ioend_t		*ioend,
+	struct bio		*bio)
 {
 	atomic_inc(&ioend->io_remaining);
 	bio->bi_private = ioend;
@@ -426,7 +343,8 @@ xfs_submit_ioend_bio(
 	if (xfs_ioend_new_eof(ioend))
 		xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode));
 
-	submit_bio(WRITE, bio);
+	submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
+		   WRITE_SYNC_PLUG : WRITE, bio);
 	ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
 	bio_put(bio);
 }
@@ -505,6 +423,7 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
  */
 STATIC void
 xfs_submit_ioend(
+	struct writeback_control *wbc,
 	xfs_ioend_t		*ioend)
 {
 	xfs_ioend_t		*head = ioend;
@@ -533,19 +452,19 @@ xfs_submit_ioend(
  retry:
 				bio = xfs_alloc_ioend_bio(bh);
 			} else if (bh->b_blocknr != lastblock + 1) {
-				xfs_submit_ioend_bio(ioend, bio);
+				xfs_submit_ioend_bio(wbc, ioend, bio);
 				goto retry;
 			}
 
 			if (bio_add_buffer(bio, bh) != bh->b_size) {
-				xfs_submit_ioend_bio(ioend, bio);
+				xfs_submit_ioend_bio(wbc, ioend, bio);
 				goto retry;
 			}
 
 			lastblock = bh->b_blocknr;
 		}
 		if (bio)
-			xfs_submit_ioend_bio(ioend, bio);
+			xfs_submit_ioend_bio(wbc, ioend, bio);
 		xfs_finish_ioend(ioend, 0);
 	} while ((ioend = next) != NULL);
 }
@@ -904,16 +823,9 @@ xfs_convert_page(
 
 	if (startio) {
 		if (count) {
-			struct backing_dev_info *bdi;
-
-			bdi = inode->i_mapping->backing_dev_info;
 			wbc->nr_to_write--;
-			if (bdi_write_congested(bdi)) {
-				wbc->encountered_congestion = 1;
-				done = 1;
-			} else if (wbc->nr_to_write <= 0) {
+			if (wbc->nr_to_write <= 0)
 				done = 1;
-			}
 		}
 		xfs_start_page_writeback(page, !page_dirty, count);
 	}
@@ -1198,7 +1110,7 @@ xfs_page_state_convert(
 	}
 
 	if (iohead)
-		xfs_submit_ioend(iohead);
+		xfs_submit_ioend(wbc, iohead);
 
 	return page_dirty;
 
@@ -1249,7 +1161,7 @@ xfs_vm_writepage(
 	int			delalloc, unmapped, unwritten;
 	struct inode		*inode = page->mapping->host;
 
-	xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);
+	trace_xfs_writepage(inode, page, 0);
 
 	/*
 	 * We need a transaction if:
@@ -1354,7 +1266,7 @@ xfs_vm_releasepage(
 		.nr_to_write = 1,
 	};
 
-	xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0);
+	trace_xfs_releasepage(inode, page, 0);
 
 	if (!page_has_buffers(page))
 		return 0;
@@ -1535,7 +1447,7 @@ xfs_end_io_direct(
 		 * didn't map an unwritten extent so switch it's completion
 		 * handler.
 		 */
-		INIT_WORK(&ioend->io_work, xfs_end_bio_written);
+		ioend->io_type = IOMAP_NEW;
 		xfs_finish_ioend(ioend, 0);
 	}
 
@@ -1562,19 +1474,13 @@ xfs_vm_direct_IO(
 
 	bdev = xfs_find_bdev_for_inode(XFS_I(inode));
 
-	if (rw == WRITE) {
-		iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
-		ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
-			bdev, iov, offset, nr_segs,
-			xfs_get_blocks_direct,
-			xfs_end_io_direct);
-	} else {
-		iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
-		ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
-			bdev, iov, offset, nr_segs,
-			xfs_get_blocks_direct,
-			xfs_end_io_direct);
-	}
+	iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
+					IOMAP_UNWRITTEN : IOMAP_READ);
+
+	ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
+					    offset, nr_segs,
+					    xfs_get_blocks_direct,
+					    xfs_end_io_direct);
 
 	if (unlikely(ret != -EIOCBQUEUED && iocb->private))
 		xfs_destroy_ioend(iocb->private);
@@ -1634,8 +1540,7 @@ xfs_vm_invalidatepage(
 	struct page		*page,
 	unsigned long		offset)
 {
-	xfs_page_trace(XFS_INVALIDPAGE_ENTER,
-			page->mapping->host, page, offset);
+	trace_xfs_invalidatepage(page->mapping->host, page, offset);
 	block_invalidatepage(page, offset);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 221b3e66cee..4cfc6ea87df 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -45,4 +45,6 @@ extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
 extern void xfs_ioend_init(void);
 extern void xfs_ioend_wait(struct xfs_inode *);
 
+extern void xfs_count_page_state(struct page *, int *, int *, int *);
+
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 965df1227d6..b4c7d4248aa 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -39,6 +39,7 @@
 #include "xfs_ag.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
+#include "xfs_trace.h"
 
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
@@ -53,34 +54,6 @@ static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
 struct workqueue_struct *xfsconvertd_workqueue;
 
-#ifdef XFS_BUF_TRACE
-void
-xfs_buf_trace(
-	xfs_buf_t	*bp,
-	char		*id,
-	void		*data,
-	void		*ra)
-{
-	ktrace_enter(xfs_buf_trace_buf,
-		bp, id,
-		(void *)(unsigned long)bp->b_flags,
-		(void *)(unsigned long)bp->b_hold.counter,
-		(void *)(unsigned long)bp->b_sema.count,
-		(void *)current,
-		data, ra,
-		(void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
-		(void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
-		(void *)(unsigned long)bp->b_buffer_length,
-		NULL, NULL, NULL, NULL, NULL);
-}
-ktrace_t *xfs_buf_trace_buf;
-#define XFS_BUF_TRACE_SIZE	4096
-#define XB_TRACE(bp, id, data)	\
-	xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
-#else
-#define XB_TRACE(bp, id, data)	do { } while (0)
-#endif
-
 #ifdef XFS_BUF_LOCK_TRACKING
 # define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
 # define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
@@ -149,7 +122,7 @@ page_region_mask(
 	return mask;
 }
 
-STATIC_INLINE void
+STATIC void
 set_page_region(
 	struct page	*page,
 	size_t		offset,
@@ -161,7 +134,7 @@ set_page_region(
 		SetPageUptodate(page);
 }
 
-STATIC_INLINE int
+STATIC int
 test_page_region(
 	struct page	*page,
 	size_t		offset,
@@ -279,7 +252,8 @@ _xfs_buf_initialize(
 	init_waitqueue_head(&bp->b_waiters);
 
 	XFS_STATS_INC(xb_create);
-	XB_TRACE(bp, "initialize", target);
+
+	trace_xfs_buf_init(bp, _RET_IP_);
 }
 
 /*
@@ -332,7 +306,7 @@ void
 xfs_buf_free(
 	xfs_buf_t		*bp)
 {
-	XB_TRACE(bp, "free", 0);
+	trace_xfs_buf_free(bp, _RET_IP_);
 
 	ASSERT(list_empty(&bp->b_hash_list));
 
@@ -445,7 +419,6 @@ _xfs_buf_lookup_pages(
 	if (page_count == bp->b_page_count)
 		bp->b_flags |= XBF_DONE;
 
-	XB_TRACE(bp, "lookup_pages", (long)page_count);
 	return error;
 }
 
@@ -548,7 +521,6 @@ found:
 	if (down_trylock(&bp->b_sema)) {
 		if (!(flags & XBF_TRYLOCK)) {
 			/* wait for buffer ownership */
-			XB_TRACE(bp, "get_lock", 0);
 			xfs_buf_lock(bp);
 			XFS_STATS_INC(xb_get_locked_waited);
 		} else {
@@ -571,7 +543,8 @@ found:
 		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
 		bp->b_flags &= XBF_MAPPED;
 	}
-	XB_TRACE(bp, "got_lock", 0);
+
+	trace_xfs_buf_find(bp, flags, _RET_IP_);
 	XFS_STATS_INC(xb_get_locked);
 	return bp;
 }
@@ -582,7 +555,7 @@ found:
  *	although backing storage may not be.
  */
 xfs_buf_t *
-xfs_buf_get_flags(
+xfs_buf_get(
 	xfs_buftarg_t		*target,/* target for buffer		*/
 	xfs_off_t		ioff,	/* starting offset of range	*/
 	size_t			isize,	/* length of range		*/
@@ -627,7 +600,7 @@ xfs_buf_get_flags(
 	bp->b_bn = ioff;
 	bp->b_count_desired = bp->b_buffer_length;
 
-	XB_TRACE(bp, "get", (unsigned long)flags);
+	trace_xfs_buf_get(bp, flags, _RET_IP_);
 	return bp;
 
  no_buffer:
@@ -644,8 +617,6 @@ _xfs_buf_read(
 {
 	int			status;
 
-	XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
-
 	ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
 	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
 
@@ -661,7 +632,7 @@ _xfs_buf_read(
 }
 
 xfs_buf_t *
-xfs_buf_read_flags(
+xfs_buf_read(
 	xfs_buftarg_t		*target,
 	xfs_off_t		ioff,
 	size_t			isize,
@@ -671,21 +642,20 @@ xfs_buf_read_flags(
 
 	flags |= XBF_READ;
 
-	bp = xfs_buf_get_flags(target, ioff, isize, flags);
+	bp = xfs_buf_get(target, ioff, isize, flags);
 	if (bp) {
+		trace_xfs_buf_read(bp, flags, _RET_IP_);
+
 		if (!XFS_BUF_ISDONE(bp)) {
-			XB_TRACE(bp, "read", (unsigned long)flags);
 			XFS_STATS_INC(xb_get_read);
 			_xfs_buf_read(bp, flags);
 		} else if (flags & XBF_ASYNC) {
-			XB_TRACE(bp, "read_async", (unsigned long)flags);
 			/*
 			 * Read ahead call which is already satisfied,
 			 * drop the buffer
 			 */
 			goto no_buffer;
 		} else {
-			XB_TRACE(bp, "read_done", (unsigned long)flags);
 			/* We do not want read in the flags */
 			bp->b_flags &= ~XBF_READ;
 		}
@@ -718,7 +688,7 @@ xfs_buf_readahead(
 		return;
 
 	flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
-	xfs_buf_read_flags(target, ioff, isize, flags);
+	xfs_buf_read(target, ioff, isize, flags);
 }
 
 xfs_buf_t *
@@ -823,7 +793,7 @@ xfs_buf_get_noaddr(
 
 	xfs_buf_unlock(bp);
 
-	XB_TRACE(bp, "no_daddr", len);
+	trace_xfs_buf_get_noaddr(bp, _RET_IP_);
 	return bp;
 
  fail_free_mem:
@@ -845,8 +815,8 @@ void
 xfs_buf_hold(
 	xfs_buf_t		*bp)
 {
+	trace_xfs_buf_hold(bp, _RET_IP_);
 	atomic_inc(&bp->b_hold);
-	XB_TRACE(bp, "hold", 0);
 }
 
 /*
@@ -859,7 +829,7 @@ xfs_buf_rele(
 {
 	xfs_bufhash_t		*hash = bp->b_hash;
 
-	XB_TRACE(bp, "rele", bp->b_relse);
+	trace_xfs_buf_rele(bp, _RET_IP_);
 
 	if (unlikely(!hash)) {
 		ASSERT(!bp->b_relse);
@@ -909,21 +879,19 @@ xfs_buf_cond_lock(
 	int			locked;
 
 	locked = down_trylock(&bp->b_sema) == 0;
-	if (locked) {
+	if (locked)
 		XB_SET_OWNER(bp);
-	}
-	XB_TRACE(bp, "cond_lock", (long)locked);
+
+	trace_xfs_buf_cond_lock(bp, _RET_IP_);
 	return locked ? 0 : -EBUSY;
 }
 
-#if defined(DEBUG) || defined(XFS_BLI_TRACE)
 int
 xfs_buf_lock_value(
 	xfs_buf_t		*bp)
 {
 	return bp->b_sema.count;
 }
-#endif
 
 /*
  *	Locks a buffer object.
@@ -935,12 +903,14 @@ void
 xfs_buf_lock(
 	xfs_buf_t		*bp)
 {
-	XB_TRACE(bp, "lock", 0);
+	trace_xfs_buf_lock(bp, _RET_IP_);
+
 	if (atomic_read(&bp->b_io_remaining))
 		blk_run_address_space(bp->b_target->bt_mapping);
 	down(&bp->b_sema);
 	XB_SET_OWNER(bp);
-	XB_TRACE(bp, "locked", 0);
+
+	trace_xfs_buf_lock_done(bp, _RET_IP_);
 }
 
 /*
@@ -962,7 +932,8 @@ xfs_buf_unlock(
 
 	XB_CLEAR_OWNER(bp);
 	up(&bp->b_sema);
-	XB_TRACE(bp, "unlock", 0);
+
+	trace_xfs_buf_unlock(bp, _RET_IP_);
 }
 
 
@@ -974,17 +945,18 @@ void
 xfs_buf_pin(
 	xfs_buf_t		*bp)
 {
+	trace_xfs_buf_pin(bp, _RET_IP_);
 	atomic_inc(&bp->b_pin_count);
-	XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter);
 }
 
 void
 xfs_buf_unpin(
 	xfs_buf_t		*bp)
 {
+	trace_xfs_buf_unpin(bp, _RET_IP_);
+
 	if (atomic_dec_and_test(&bp->b_pin_count))
 		wake_up_all(&bp->b_waiters);
-	XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter);
 }
 
 int
@@ -1035,7 +1007,7 @@ xfs_buf_iodone_work(
 	 */
 	if ((bp->b_error == EOPNOTSUPP) &&
 	    (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
-		XB_TRACE(bp, "ordered_retry", bp->b_iodone);
+		trace_xfs_buf_ordered_retry(bp, _RET_IP_);
 		bp->b_flags &= ~XBF_ORDERED;
 		bp->b_flags |= _XFS_BARRIER_FAILED;
 		xfs_buf_iorequest(bp);
@@ -1050,12 +1022,12 @@ xfs_buf_ioend(
 	xfs_buf_t		*bp,
 	int			schedule)
 {
+	trace_xfs_buf_iodone(bp, _RET_IP_);
+
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 	if (bp->b_error == 0)
 		bp->b_flags |= XBF_DONE;
 
-	XB_TRACE(bp, "iodone", bp->b_iodone);
-
 	if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
 		if (schedule) {
 			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
@@ -1075,7 +1047,7 @@ xfs_buf_ioerror(
 {
 	ASSERT(error >= 0 && error <= 0xffff);
 	bp->b_error = (unsigned short)error;
-	XB_TRACE(bp, "ioerror", (unsigned long)error);
+	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
 }
 
 int
@@ -1083,7 +1055,7 @@ xfs_bawrite(
 	void			*mp,
 	struct xfs_buf		*bp)
 {
-	XB_TRACE(bp, "bawrite", 0);
+	trace_xfs_buf_bawrite(bp, _RET_IP_);
 
 	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
 
@@ -1102,7 +1074,7 @@ xfs_bdwrite(
 	void			*mp,
 	struct xfs_buf		*bp)
 {
-	XB_TRACE(bp, "bdwrite", 0);
+	trace_xfs_buf_bdwrite(bp, _RET_IP_);
 
 	bp->b_strat = xfs_bdstrat_cb;
 	bp->b_mount = mp;
@@ -1113,7 +1085,7 @@ xfs_bdwrite(
 	xfs_buf_delwri_queue(bp, 1);
 }
 
-STATIC_INLINE void
+STATIC void
 _xfs_buf_ioend(
 	xfs_buf_t		*bp,
 	int			schedule)
@@ -1253,7 +1225,7 @@ int
 xfs_buf_iorequest(
 	xfs_buf_t		*bp)
 {
-	XB_TRACE(bp, "iorequest", 0);
+	trace_xfs_buf_iorequest(bp, _RET_IP_);
 
 	if (bp->b_flags & XBF_DELWRI) {
 		xfs_buf_delwri_queue(bp, 1);
@@ -1287,11 +1259,13 @@ int
 xfs_buf_iowait(
 	xfs_buf_t		*bp)
 {
-	XB_TRACE(bp, "iowait", 0);
+	trace_xfs_buf_iowait(bp, _RET_IP_);
+
 	if (atomic_read(&bp->b_io_remaining))
 		blk_run_address_space(bp->b_target->bt_mapping);
 	wait_for_completion(&bp->b_iowait);
-	XB_TRACE(bp, "iowaited", (long)bp->b_error);
+
+	trace_xfs_buf_iowait_done(bp, _RET_IP_);
 	return bp->b_error;
 }
 
@@ -1604,7 +1578,8 @@ xfs_buf_delwri_queue(
 	struct list_head	*dwq = &bp->b_target->bt_delwrite_queue;
 	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
 
-	XB_TRACE(bp, "delwri_q", (long)unlock);
+	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
+
 	ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
 
 	spin_lock(dwlk);
@@ -1644,7 +1619,7 @@ xfs_buf_delwri_dequeue(
 	if (dequeued)
 		xfs_buf_rele(bp);
 
-	XB_TRACE(bp, "delwri_dq", (long)dequeued);
+	trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
 }
 
 STATIC void
@@ -1692,7 +1667,7 @@ xfs_buf_delwri_split(
 	INIT_LIST_HEAD(list);
 	spin_lock(dwlk);
 	list_for_each_entry_safe(bp, n, dwq, b_list) {
-		XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp));
+		trace_xfs_buf_delwri_split(bp, _RET_IP_);
 		ASSERT(bp->b_flags & XBF_DELWRI);
 
 		if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1816,14 +1791,10 @@ xfs_flush_buftarg(
 int __init
 xfs_buf_init(void)
 {
-#ifdef XFS_BUF_TRACE
-	xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
-#endif
-
 	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
 						KM_ZONE_HWALIGN, NULL);
 	if (!xfs_buf_zone)
-		goto out_free_trace_buf;
+		goto out;
 
 	xfslogd_workqueue = create_workqueue("xfslogd");
 	if (!xfslogd_workqueue)
@@ -1846,10 +1817,7 @@ xfs_buf_init(void)
 	destroy_workqueue(xfslogd_workqueue);
  out_free_buf_zone:
 	kmem_zone_destroy(xfs_buf_zone);
- out_free_trace_buf:
-#ifdef XFS_BUF_TRACE
-	ktrace_free(xfs_buf_trace_buf);
-#endif
+ out:
 	return -ENOMEM;
 }
 
@@ -1861,9 +1829,6 @@ xfs_buf_terminate(void)
 	destroy_workqueue(xfsdatad_workqueue);
 	destroy_workqueue(xfslogd_workqueue);
 	kmem_zone_destroy(xfs_buf_zone);
-#ifdef XFS_BUF_TRACE
-	ktrace_free(xfs_buf_trace_buf);
-#endif
 }
 
 #ifdef CONFIG_KDB_MODULES
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 9b4d666ad31..a509f4addc2 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -95,6 +95,28 @@ typedef enum {
 	_XFS_BARRIER_FAILED = (1 << 23),
 } xfs_buf_flags_t;
 
+#define XFS_BUF_FLAGS \
+	{ XBF_READ,		"READ" }, \
+	{ XBF_WRITE,		"WRITE" }, \
+	{ XBF_MAPPED,		"MAPPED" }, \
+	{ XBF_ASYNC,		"ASYNC" }, \
+	{ XBF_DONE,		"DONE" }, \
+	{ XBF_DELWRI,		"DELWRI" }, \
+	{ XBF_STALE,		"STALE" }, \
+	{ XBF_FS_MANAGED,	"FS_MANAGED" }, \
+	{ XBF_ORDERED,		"ORDERED" }, \
+	{ XBF_READ_AHEAD,	"READ_AHEAD" }, \
+	{ XBF_LOCK,		"LOCK" },  	/* should never be set */\
+	{ XBF_TRYLOCK,		"TRYLOCK" }, 	/* ditto */\
+	{ XBF_DONT_BLOCK,	"DONT_BLOCK" },	/* ditto */\
+	{ _XBF_PAGE_CACHE,	"PAGE_CACHE" }, \
+	{ _XBF_PAGES,		"PAGES" }, \
+	{ _XBF_RUN_QUEUES,	"RUN_QUEUES" }, \
+	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
+	{ _XBF_PAGE_LOCKED,	"PAGE_LOCKED" }, \
+	{ _XFS_BARRIER_FAILED,	"BARRIER_FAILED" }
+
+
 typedef enum {
 	XBT_FORCE_SLEEP = 0,
 	XBT_FORCE_FLUSH = 1,
@@ -186,15 +208,10 @@ extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
 #define xfs_incore(buftarg,blkno,len,lockit) \
 	_xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
 
-extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t,
+extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t,
 				xfs_buf_flags_t);
-#define xfs_buf_get(target, blkno, len, flags) \
-	xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
-
-extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t,
+extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
 				xfs_buf_flags_t);
-#define xfs_buf_read(target, blkno, len, flags) \
-	xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
 
 extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
 extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *);
@@ -248,13 +265,6 @@ extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
 extern int xfs_buf_init(void);
 extern void xfs_buf_terminate(void);
 
-#ifdef XFS_BUF_TRACE
-extern ktrace_t *xfs_buf_trace_buf;
-extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
-#else
-#define xfs_buf_trace(bp,id,ptr,ra)	do { } while (0)
-#endif
-
 #define xfs_buf_target_name(target)	\
 	({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
 
@@ -370,10 +380,6 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
 
 #define xfs_bpin(bp)		xfs_buf_pin(bp)
 #define xfs_bunpin(bp)		xfs_buf_unpin(bp)
-
-#define xfs_buftrace(id, bp)	\
-	    xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
-
 #define xfs_biodone(bp)		xfs_buf_ioend(bp, 0)
 
 #define xfs_biomove(bp, off, len, data, rw) \
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index eff61e2732a..e4caeb28ce2 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -52,7 +52,7 @@ xfs_file_aio_read(
 	loff_t			pos)
 {
 	struct file		*file = iocb->ki_filp;
-	int			ioflags = IO_ISAIO;
+	int			ioflags = 0;
 
 	BUG_ON(iocb->ki_pos != pos);
 	if (unlikely(file->f_flags & O_DIRECT))
@@ -71,7 +71,7 @@ xfs_file_aio_write(
 	loff_t			pos)
 {
 	struct file		*file = iocb->ki_filp;
-	int			ioflags = IO_ISAIO;
+	int			ioflags = 0;
 
 	BUG_ON(iocb->ki_pos != pos);
 	if (unlikely(file->f_flags & O_DIRECT))
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 08be36d7326..7501b85fd86 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -19,6 +19,7 @@
 #include "xfs_vnodeops.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
+#include "xfs_trace.h"
 
 int  fs_noerr(void) { return 0; }
 int  fs_nosys(void) { return ENOSYS; }
@@ -51,6 +52,8 @@ xfs_flushinval_pages(
 	struct address_space *mapping = VFS_I(ip)->i_mapping;
 	int		ret = 0;
 
+	trace_xfs_pagecache_inval(ip, first, last);
+
 	if (mapping->nrpages) {
 		xfs_iflags_clear(ip, XFS_ITRUNCATED);
 		ret = filemap_write_and_wait(mapping);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 5bb523d7f37..a034cf62443 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -51,6 +51,7 @@
 #include "xfs_quota.h"
 #include "xfs_inode_item.h"
 #include "xfs_export.h"
+#include "xfs_trace.h"
 
 #include <linux/capability.h>
 #include <linux/dcache.h>
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index eafcc7c1870..be1527b1670 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -46,6 +46,7 @@
 #include "xfs_attr.h"
 #include "xfs_ioctl.h"
 #include "xfs_ioctl32.h"
+#include "xfs_trace.h"
 
 #define  _NATIVE_IOC(cmd, type) \
 	  _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index cd42ef78f6b..1d5b298ba8b 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -47,6 +47,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
@@ -573,8 +574,8 @@ xfs_vn_fallocate(
 	bf.l_len = len;
 
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
-	error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
-				      0, XFS_ATTR_NOLOCK);
+	error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
+				       0, XFS_ATTR_NOLOCK);
 	if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
 	    offset + len > i_size_read(inode))
 		new_size = offset + len;
@@ -585,7 +586,7 @@ xfs_vn_fallocate(
 
 		iattr.ia_valid = ATTR_SIZE;
 		iattr.ia_size = new_size;
-		error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
+		error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
 	}
 
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 6127e24062d..5af0c81ca1a 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -40,7 +40,6 @@
 #include <sv.h>
 #include <time.h>
 
-#include <support/ktrace.h>
 #include <support/debug.h>
 #include <support/uuid.h>
 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 072050f8d34..0d32457abef 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -48,73 +48,12 @@
 #include "xfs_utils.h"
 #include "xfs_iomap.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 #include <linux/capability.h>
 #include <linux/writeback.h>
 
 
-#if defined(XFS_RW_TRACE)
-void
-xfs_rw_enter_trace(
-	int			tag,
-	xfs_inode_t		*ip,
-	void			*data,
-	size_t			segs,
-	loff_t			offset,
-	int			ioflags)
-{
-	if (ip->i_rwtrace == NULL)
-		return;
-	ktrace_enter(ip->i_rwtrace,
-		(void *)(unsigned long)tag,
-		(void *)ip,
-		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
-		(void *)data,
-		(void *)((unsigned long)segs),
-		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(offset & 0xffffffff)),
-		(void *)((unsigned long)ioflags),
-		(void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(ip->i_new_size & 0xffffffff)),
-		(void *)((unsigned long)current_pid()),
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL);
-}
-
-void
-xfs_inval_cached_trace(
-	xfs_inode_t	*ip,
-	xfs_off_t	offset,
-	xfs_off_t	len,
-	xfs_off_t	first,
-	xfs_off_t	last)
-{
-
-	if (ip->i_rwtrace == NULL)
-		return;
-	ktrace_enter(ip->i_rwtrace,
-		(void *)(__psint_t)XFS_INVAL_CACHED,
-		(void *)ip,
-		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(offset & 0xffffffff)),
-		(void *)((unsigned long)((len >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(len & 0xffffffff)),
-		(void *)((unsigned long)((first >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(first & 0xffffffff)),
-		(void *)((unsigned long)((last >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(last & 0xffffffff)),
-		(void *)((unsigned long)current_pid()),
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL);
-}
-#endif
-
 /*
  *	xfs_iozero
  *
@@ -250,13 +189,10 @@ xfs_read(
 		}
 	}
 
-	xfs_rw_enter_trace(XFS_READ_ENTER, ip,
-				(void *)iovp, segs, *offset, ioflags);
+	trace_xfs_file_read(ip, size, *offset, ioflags);
 
 	iocb->ki_pos = *offset;
 	ret = generic_file_aio_read(iocb, iovp, segs, *offset);
-	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
-		ret = wait_on_sync_kiocb(iocb);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -294,8 +230,9 @@ xfs_splice_read(
 			return -error;
 		}
 	}
-	xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, ip,
-			   pipe, count, *ppos, ioflags);
+
+	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+
 	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
@@ -344,8 +281,8 @@ xfs_splice_write(
 		ip->i_new_size = new_size;
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
-	xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, ip,
-			   pipe, count, *ppos, ioflags);
+	trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
+
 	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_write_bytes, ret);
@@ -712,8 +649,6 @@ start:
 	if ((ioflags & IO_ISDIRECT)) {
 		if (mapping->nrpages) {
 			WARN_ON(need_i_mutex == 0);
-			xfs_inval_cached_trace(xip, pos, -1,
-					(pos & PAGE_CACHE_MASK), -1);
 			error = xfs_flushinval_pages(xip,
 					(pos & PAGE_CACHE_MASK),
 					-1, FI_REMAPF_LOCKED);
@@ -730,8 +665,7 @@ start:
 			need_i_mutex = 0;
 		}
 
- 		xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs,
-				*offset, ioflags);
+		trace_xfs_file_direct_write(xip, count, *offset, ioflags);
 		ret = generic_file_direct_write(iocb, iovp,
 				&segs, pos, offset, count, ocount);
 
@@ -754,8 +688,7 @@ start:
 		ssize_t ret2 = 0;
 
 write_retry:
-		xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
-				*offset, ioflags);
+		trace_xfs_file_buffered_write(xip, count, *offset, ioflags);
 		ret2 = generic_file_buffered_write(iocb, iovp, segs,
 				pos, offset, count, ret);
 		/*
@@ -774,9 +707,6 @@ write_retry:
 
 	current->backing_dev_info = NULL;
 
-	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
-		ret = wait_on_sync_kiocb(iocb);
-
 	isize = i_size_read(inode);
 	if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
 		*offset = isize;
@@ -811,7 +741,7 @@ write_retry:
 	XFS_STATS_ADD(xs_write_bytes, ret);
 
 	/* Handle various SYNC-type writes */
-	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
 		loff_t end = pos + ret - 1;
 		int error2;
 
@@ -863,7 +793,7 @@ int
 xfs_bdstrat_cb(struct xfs_buf *bp)
 {
 	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
-		xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
 		/*
 		 * Metadata write that didn't get logged but
 		 * written delayed anyway. These aren't associated
@@ -896,7 +826,7 @@ xfsbdstrat(
 		return;
 	}
 
-	xfs_buftrace("XFSBDSTRAT IOERROR", bp);
+	trace_xfs_bdstrat_shut(bp, _RET_IP_);
 	xfs_bioerror_relse(bp);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index e6be37dbd0e..d1f7789c7ff 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -20,52 +20,7 @@
 
 struct xfs_mount;
 struct xfs_inode;
-struct xfs_bmbt_irec;
 struct xfs_buf;
-struct xfs_iomap;
-
-#if defined(XFS_RW_TRACE)
-/*
- * Defines for the trace mechanisms in xfs_lrw.c.
- */
-#define	XFS_RW_KTRACE_SIZE	128
-
-#define	XFS_READ_ENTER		1
-#define	XFS_WRITE_ENTER		2
-#define XFS_IOMAP_READ_ENTER	3
-#define	XFS_IOMAP_WRITE_ENTER	4
-#define	XFS_IOMAP_READ_MAP	5
-#define	XFS_IOMAP_WRITE_MAP	6
-#define	XFS_IOMAP_WRITE_NOSPACE	7
-#define	XFS_ITRUNC_START	8
-#define	XFS_ITRUNC_FINISH1	9
-#define	XFS_ITRUNC_FINISH2	10
-#define	XFS_CTRUNC1		11
-#define	XFS_CTRUNC2		12
-#define	XFS_CTRUNC3		13
-#define	XFS_CTRUNC4		14
-#define	XFS_CTRUNC5		15
-#define	XFS_CTRUNC6		16
-#define	XFS_BUNMAP		17
-#define	XFS_INVAL_CACHED	18
-#define	XFS_DIORD_ENTER		19
-#define	XFS_DIOWR_ENTER		20
-#define	XFS_WRITEPAGE_ENTER	22
-#define	XFS_RELEASEPAGE_ENTER	23
-#define	XFS_INVALIDPAGE_ENTER	24
-#define	XFS_IOMAP_ALLOC_ENTER	25
-#define	XFS_IOMAP_ALLOC_MAP	26
-#define	XFS_IOMAP_UNWRITTEN	27
-#define XFS_SPLICE_READ_ENTER	28
-#define XFS_SPLICE_WRITE_ENTER	29
-extern void xfs_rw_enter_trace(int, struct xfs_inode *,
-		void *, size_t, loff_t, int);
-extern void xfs_inval_cached_trace(struct xfs_inode *,
-		xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t);
-#else
-#define xfs_rw_enter_trace(tag, ip, data, size, offset, ioflags)
-#define xfs_inval_cached_trace(ip, offset, len, first, last)
-#endif
 
 /* errors from xfsbdstrat() must be extracted from the buffer */
 extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 18a4b8e11df..09783cc444a 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -15,6 +15,7 @@
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
+
 #include "xfs.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
@@ -52,11 +53,11 @@
 #include "xfs_trans_priv.h"
 #include "xfs_filestream.h"
 #include "xfs_da_btree.h"
-#include "xfs_dir2_trace.h"
 #include "xfs_extfree_item.h"
 #include "xfs_mru_cache.h"
 #include "xfs_inode_item.h"
 #include "xfs_sync.h"
+#include "xfs_trace.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
@@ -930,13 +931,39 @@ xfs_fs_alloc_inode(
  */
 STATIC void
 xfs_fs_destroy_inode(
-	struct inode	*inode)
+	struct inode		*inode)
 {
-	xfs_inode_t		*ip = XFS_I(inode);
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	xfs_itrace_entry(ip);
 
 	XFS_STATS_INC(vn_reclaim);
-	if (xfs_reclaim(ip))
-		panic("%s: cannot reclaim 0x%p\n", __func__, inode);
+
+	/* bad inode, get out here ASAP */
+	if (is_bad_inode(inode))
+		goto out_reclaim;
+
+	xfs_ioend_wait(ip);
+
+	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+
+	/*
+	 * We should never get here with one of the reclaim flags already set.
+	 */
+	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
+
+	/*
+	 * If we have nothing to flush with this inode then complete the
+	 * teardown now, otherwise delay the flush operation.
+	 */
+	if (!xfs_inode_clean(ip)) {
+		xfs_inode_set_reclaim_tag(ip);
+		return;
+	}
+
+out_reclaim:
+	xfs_ireclaim(ip);
 }
 
 /*
@@ -973,7 +1000,6 @@ xfs_fs_inode_init_once(
 
 	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
 		     "xfsino", ip->i_ino);
-	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 }
 
 /*
@@ -1075,6 +1101,20 @@ xfs_fs_clear_inode(
 	XFS_STATS_INC(vn_remove);
 	XFS_STATS_DEC(vn_active);
 
+	/*
+	 * The iolock is used by the file system to coordinate reads,
+	 * writes, and block truncates.  Up to this point the lock
+	 * protected concurrent accesses by users of the inode.  But
+	 * from here forward we're doing some final processing of the
+	 * inode because we're done with it, and although we reuse the
+	 * iolock for protection it is really a distinct lock class
+	 * (in the lockdep sense) from before.  To keep lockdep happy
+	 * (and basically indicate what we are doing), we explicitly
+	 * re-init the iolock here.
+	 */
+	ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
 	xfs_inactive(ip);
 }
 
@@ -1092,8 +1132,6 @@ xfs_fs_put_super(
 	struct super_block	*sb)
 {
 	struct xfs_mount	*mp = XFS_M(sb);
-	struct xfs_inode	*rip = mp->m_rootip;
-	int			unmount_event_flags = 0;
 
 	xfs_syncd_stop(mp);
 
@@ -1109,20 +1147,7 @@ xfs_fs_put_super(
 		xfs_sync_attr(mp, 0);
 	}
 
-#ifdef HAVE_DMAPI
-	if (mp->m_flags & XFS_MOUNT_DMAPI) {
-		unmount_event_flags =
-			(mp->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ?
-				0 : DM_FLAGS_UNWANTED;
-		/*
-		 * Ignore error from dmapi here, first unmount is not allowed
-		 * to fail anyway, and second we wouldn't want to fail a
-		 * unmount because of dmapi.
-		 */
-		XFS_SEND_PREUNMOUNT(mp, rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL,
-				NULL, NULL, 0, 0, unmount_event_flags);
-	}
-#endif
+	XFS_SEND_PREUNMOUNT(mp);
 
 	/*
 	 * Blow away any referenced inode in the filestreams cache.
@@ -1133,10 +1158,7 @@ xfs_fs_put_super(
 
 	XFS_bflush(mp->m_ddev_targp);
 
-	if (mp->m_flags & XFS_MOUNT_DMAPI) {
-		XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0,
-				unmount_event_flags);
-	}
+	XFS_SEND_UNMOUNT(mp);
 
 	xfs_unmountfs(mp);
 	xfs_freesb(mp);
@@ -1504,8 +1526,6 @@ xfs_fs_fill_super(
 		goto fail_vnrele;
 
 	kfree(mtpt);
-
-	xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
 	return 0;
 
  out_filestream_unmount:
@@ -1581,94 +1601,6 @@ static struct file_system_type xfs_fs_type = {
 };
 
 STATIC int __init
-xfs_alloc_trace_bufs(void)
-{
-#ifdef XFS_ALLOC_TRACE
-	xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_MAYFAIL);
-	if (!xfs_alloc_trace_buf)
-		goto out;
-#endif
-#ifdef XFS_BMAP_TRACE
-	xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_MAYFAIL);
-	if (!xfs_bmap_trace_buf)
-		goto out_free_alloc_trace;
-#endif
-#ifdef XFS_BTREE_TRACE
-	xfs_allocbt_trace_buf = ktrace_alloc(XFS_ALLOCBT_TRACE_SIZE,
-					     KM_MAYFAIL);
-	if (!xfs_allocbt_trace_buf)
-		goto out_free_bmap_trace;
-
-	xfs_inobt_trace_buf = ktrace_alloc(XFS_INOBT_TRACE_SIZE, KM_MAYFAIL);
-	if (!xfs_inobt_trace_buf)
-		goto out_free_allocbt_trace;
-
-	xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL);
-	if (!xfs_bmbt_trace_buf)
-		goto out_free_inobt_trace;
-#endif
-#ifdef XFS_ATTR_TRACE
-	xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL);
-	if (!xfs_attr_trace_buf)
-		goto out_free_bmbt_trace;
-#endif
-#ifdef XFS_DIR2_TRACE
-	xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_MAYFAIL);
-	if (!xfs_dir2_trace_buf)
-		goto out_free_attr_trace;
-#endif
-
-	return 0;
-
-#ifdef XFS_DIR2_TRACE
- out_free_attr_trace:
-#endif
-#ifdef XFS_ATTR_TRACE
-	ktrace_free(xfs_attr_trace_buf);
- out_free_bmbt_trace:
-#endif
-#ifdef XFS_BTREE_TRACE
-	ktrace_free(xfs_bmbt_trace_buf);
- out_free_inobt_trace:
-	ktrace_free(xfs_inobt_trace_buf);
- out_free_allocbt_trace:
-	ktrace_free(xfs_allocbt_trace_buf);
- out_free_bmap_trace:
-#endif
-#ifdef XFS_BMAP_TRACE
-	ktrace_free(xfs_bmap_trace_buf);
- out_free_alloc_trace:
-#endif
-#ifdef XFS_ALLOC_TRACE
-	ktrace_free(xfs_alloc_trace_buf);
- out:
-#endif
-	return -ENOMEM;
-}
-
-STATIC void
-xfs_free_trace_bufs(void)
-{
-#ifdef XFS_DIR2_TRACE
-	ktrace_free(xfs_dir2_trace_buf);
-#endif
-#ifdef XFS_ATTR_TRACE
-	ktrace_free(xfs_attr_trace_buf);
-#endif
-#ifdef XFS_BTREE_TRACE
-	ktrace_free(xfs_bmbt_trace_buf);
-	ktrace_free(xfs_inobt_trace_buf);
-	ktrace_free(xfs_allocbt_trace_buf);
-#endif
-#ifdef XFS_BMAP_TRACE
-	ktrace_free(xfs_bmap_trace_buf);
-#endif
-#ifdef XFS_ALLOC_TRACE
-	ktrace_free(xfs_alloc_trace_buf);
-#endif
-}
-
-STATIC int __init
 xfs_init_zones(void)
 {
 
@@ -1809,7 +1741,6 @@ init_xfs_fs(void)
 	printk(KERN_INFO XFS_VERSION_STRING " with "
 			 XFS_BUILD_OPTIONS " enabled\n");
 
-	ktrace_init(64);
 	xfs_ioend_init();
 	xfs_dir_startup();
 
@@ -1817,13 +1748,9 @@ init_xfs_fs(void)
 	if (error)
 		goto out;
 
-	error = xfs_alloc_trace_bufs();
-	if (error)
-		goto out_destroy_zones;
-
 	error = xfs_mru_cache_init();
 	if (error)
-		goto out_free_trace_buffers;
+		goto out_destroy_zones;
 
 	error = xfs_filestream_init();
 	if (error)
@@ -1858,8 +1785,6 @@ init_xfs_fs(void)
 	xfs_filestream_uninit();
  out_mru_cache_uninit:
 	xfs_mru_cache_uninit();
- out_free_trace_buffers:
-	xfs_free_trace_bufs();
  out_destroy_zones:
 	xfs_destroy_zones();
  out:
@@ -1876,9 +1801,7 @@ exit_xfs_fs(void)
 	xfs_buf_terminate();
 	xfs_filestream_uninit();
 	xfs_mru_cache_uninit();
-	xfs_free_trace_bufs();
 	xfs_destroy_zones();
-	ktrace_uninit();
 }
 
 module_init(init_xfs_fs);
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 18175ebd58e..233d4b9881b 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -56,12 +56,6 @@ extern void xfs_qm_exit(void);
 # define XFS_BIGFS_STRING
 #endif
 
-#ifdef CONFIG_XFS_TRACE
-# define XFS_TRACE_STRING	"tracing, "
-#else
-# define XFS_TRACE_STRING
-#endif
-
 #ifdef CONFIG_XFS_DMAPI
 # define XFS_DMAPI_STRING	"dmapi support, "
 #else
@@ -78,7 +72,6 @@ extern void xfs_qm_exit(void);
 				XFS_SECURITY_STRING \
 				XFS_REALTIME_STRING \
 				XFS_BIGFS_STRING \
-				XFS_TRACE_STRING \
 				XFS_DMAPI_STRING \
 				XFS_DBG_STRING /* DBG must be last */
 
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 961df0a22c7..6fed97a8cd3 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -44,6 +44,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_rw.h"
 #include "xfs_quota.h"
+#include "xfs_trace.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -663,10 +664,9 @@ xfs_syncd_stop(
 	kthread_stop(mp->m_sync_task);
 }
 
-int
+STATIC int
 xfs_reclaim_inode(
 	xfs_inode_t	*ip,
-	int		locked,
 	int		sync_mode)
 {
 	xfs_perag_t	*pag = xfs_get_perag(ip->i_mount, ip->i_ino);
@@ -682,10 +682,6 @@ xfs_reclaim_inode(
 	    !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
 		spin_unlock(&ip->i_flags_lock);
 		write_unlock(&pag->pag_ici_lock);
-		if (locked) {
-			xfs_ifunlock(ip);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
 		return -EAGAIN;
 	}
 	__xfs_iflags_set(ip, XFS_IRECLAIM);
@@ -704,10 +700,8 @@ xfs_reclaim_inode(
 	 * We get the flush lock regardless, though, just to make sure
 	 * we don't free it while it is being flushed.
 	 */
-	if (!locked) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_iflock(ip);
-	}
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_iflock(ip);
 
 	/*
 	 * In the case of a forced shutdown we rely on xfs_iflush() to
@@ -778,7 +772,7 @@ xfs_reclaim_inode_now(
 	}
 	read_unlock(&pag->pag_ici_lock);
 
-	return xfs_reclaim_inode(ip, 0, flags);
+	return xfs_reclaim_inode(ip, flags);
 }
 
 int
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 27920eb7a82..a500b4d9183 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -44,7 +44,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
 
 void xfs_flush_inodes(struct xfs_inode *ip);
 
-int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index c5bc67c4e3b..7bb5092d6ae 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -55,170 +55,140 @@ xfs_stats_clear_proc_handler(
 
 static ctl_table xfs_table[] = {
 	{
-		.ctl_name	= XFS_SGID_INHERIT,
 		.procname	= "irix_sgid_inherit",
 		.data		= &xfs_params.sgid_inherit.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.sgid_inherit.min,
 		.extra2		= &xfs_params.sgid_inherit.max
 	},
 	{
-		.ctl_name	= XFS_SYMLINK_MODE,
 		.procname	= "irix_symlink_mode",
 		.data		= &xfs_params.symlink_mode.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.symlink_mode.min,
 		.extra2		= &xfs_params.symlink_mode.max
 	},
 	{
-		.ctl_name	= XFS_PANIC_MASK,
 		.procname	= "panic_mask",
 		.data		= &xfs_params.panic_mask.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.panic_mask.min,
 		.extra2		= &xfs_params.panic_mask.max
 	},
 
 	{
-		.ctl_name	= XFS_ERRLEVEL,
 		.procname	= "error_level",
 		.data		= &xfs_params.error_level.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.error_level.min,
 		.extra2		= &xfs_params.error_level.max
 	},
 	{
-		.ctl_name	= XFS_SYNCD_TIMER,
 		.procname	= "xfssyncd_centisecs",
 		.data		= &xfs_params.syncd_timer.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.syncd_timer.min,
 		.extra2		= &xfs_params.syncd_timer.max
 	},
 	{
-		.ctl_name	= XFS_INHERIT_SYNC,
 		.procname	= "inherit_sync",
 		.data		= &xfs_params.inherit_sync.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.inherit_sync.min,
 		.extra2		= &xfs_params.inherit_sync.max
 	},
 	{
-		.ctl_name	= XFS_INHERIT_NODUMP,
 		.procname	= "inherit_nodump",
 		.data		= &xfs_params.inherit_nodump.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.inherit_nodump.min,
 		.extra2		= &xfs_params.inherit_nodump.max
 	},
 	{
-		.ctl_name	= XFS_INHERIT_NOATIME,
 		.procname	= "inherit_noatime",
 		.data		= &xfs_params.inherit_noatim.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.inherit_noatim.min,
 		.extra2		= &xfs_params.inherit_noatim.max
 	},
 	{
-		.ctl_name	= XFS_BUF_TIMER,
 		.procname	= "xfsbufd_centisecs",
 		.data		= &xfs_params.xfs_buf_timer.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.xfs_buf_timer.min,
 		.extra2		= &xfs_params.xfs_buf_timer.max
 	},
 	{
-		.ctl_name	= XFS_BUF_AGE,
 		.procname	= "age_buffer_centisecs",
 		.data		= &xfs_params.xfs_buf_age.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.xfs_buf_age.min,
 		.extra2		= &xfs_params.xfs_buf_age.max
 	},
 	{
-		.ctl_name	= XFS_INHERIT_NOSYM,
 		.procname	= "inherit_nosymlinks",
 		.data		= &xfs_params.inherit_nosym.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.inherit_nosym.min,
 		.extra2		= &xfs_params.inherit_nosym.max
 	},
 	{
-		.ctl_name	= XFS_ROTORSTEP,
 		.procname	= "rotorstep",
 		.data		= &xfs_params.rotorstep.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.rotorstep.min,
 		.extra2		= &xfs_params.rotorstep.max
 	},
 	{
-		.ctl_name	= XFS_INHERIT_NODFRG,
 		.procname	= "inherit_nodefrag",
 		.data		= &xfs_params.inherit_nodfrg.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.inherit_nodfrg.min,
 		.extra2		= &xfs_params.inherit_nodfrg.max
 	},
 	{
-		.ctl_name	= XFS_FILESTREAM_TIMER,
 		.procname	= "filestream_centisecs",
 		.data		= &xfs_params.fstrm_timer.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xfs_params.fstrm_timer.min,
 		.extra2		= &xfs_params.fstrm_timer.max,
 	},
 	/* please keep this the last entry */
 #ifdef CONFIG_PROC_FS
 	{
-		.ctl_name	= XFS_STATS_CLEAR,
 		.procname	= "stats_clear",
 		.data		= &xfs_params.stats_clear.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &xfs_stats_clear_proc_handler,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= xfs_stats_clear_proc_handler,
 		.extra1		= &xfs_params.stats_clear.min,
 		.extra2		= &xfs_params.stats_clear.max
 	},
@@ -229,7 +199,6 @@ static ctl_table xfs_table[] = {
 
 static ctl_table xfs_dir_table[] = {
 	{
-		.ctl_name	= FS_XFS,
 		.procname	= "xfs",
 		.mode		= 0555,
 		.child		= xfs_table
@@ -239,7 +208,6 @@ static ctl_table xfs_dir_table[] = {
 
 static ctl_table xfs_root_table[] = {
 	{
-		.ctl_name	= CTL_FS,
 		.procname	= "fs",
 		.mode		= 0555,
 		.child		= xfs_dir_table
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
new file mode 100644
index 00000000000..856eb3c8d60
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2009, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+#include "xfs_ialloc.h"
+#include "xfs_itable.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_log_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_quota.h"
+#include "xfs_iomap.h"
+#include "xfs_aops.h"
+#include "quota/xfs_dquot_item.h"
+#include "quota/xfs_dquot.h"
+
+/*
+ * Format fsblock number into a static buffer & return it.
+ */
+STATIC char *xfs_fmtfsblock(xfs_fsblock_t bno)
+{
+	static char rval[50];
+
+	if (bno == NULLFSBLOCK)
+		sprintf(rval, "NULLFSBLOCK");
+	else if (isnullstartblock(bno))
+		sprintf(rval, "NULLSTARTBLOCK(%lld)", startblockval(bno));
+	else
+		sprintf(rval, "%lld", (xfs_dfsbno_t)bno);
+	return rval;
+}
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "xfs_trace.h"
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
new file mode 100644
index 00000000000..c40834bdee5
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -0,0 +1,1369 @@
+/*
+ * Copyright (c) 2009, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM xfs
+
+#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_XFS_H
+
+#include <linux/tracepoint.h>
+
+struct xfs_agf;
+struct xfs_alloc_arg;
+struct xfs_attr_list_context;
+struct xfs_buf_log_item;
+struct xfs_da_args;
+struct xfs_da_node_entry;
+struct xfs_dquot;
+struct xlog_ticket;
+struct log;
+
+#define DEFINE_ATTR_LIST_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct xfs_attr_list_context *ctx), \
+	TP_ARGS(ctx), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_ino_t, ino) \
+		__field(u32, hashval) \
+		__field(u32, blkno) \
+		__field(u32, offset) \
+		__field(void *, alist) \
+		__field(int, bufsize) \
+		__field(int, count) \
+		__field(int, firstu) \
+		__field(int, dupcnt) \
+		__field(int, flags) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; \
+		__entry->ino = ctx->dp->i_ino; \
+		__entry->hashval = ctx->cursor->hashval; \
+		__entry->blkno = ctx->cursor->blkno; \
+		__entry->offset = ctx->cursor->offset; \
+		__entry->alist = ctx->alist; \
+		__entry->bufsize = ctx->bufsize; \
+		__entry->count = ctx->count; \
+		__entry->firstu = ctx->firstu; \
+		__entry->flags = ctx->flags; \
+	), \
+	TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " \
+		  "alist 0x%p size %u count %u firstu %u flags %d %s", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		   __entry->ino, \
+		   __entry->hashval, \
+		   __entry->blkno, \
+		   __entry->offset, \
+		   __entry->dupcnt, \
+		   __entry->alist, \
+		   __entry->bufsize, \
+		   __entry->count, \
+		   __entry->firstu, \
+		   __entry->flags, \
+		   __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) \
+	) \
+)
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
+
+TRACE_EVENT(xfs_attr_list_node_descend,
+	TP_PROTO(struct xfs_attr_list_context *ctx,
+		 struct xfs_da_node_entry *btree),
+	TP_ARGS(ctx, btree),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(u32, hashval)
+		__field(u32, blkno)
+		__field(u32, offset)
+		__field(void *, alist)
+		__field(int, bufsize)
+		__field(int, count)
+		__field(int, firstu)
+		__field(int, dupcnt)
+		__field(int, flags)
+		__field(u32, bt_hashval)
+		__field(u32, bt_before)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
+		__entry->ino = ctx->dp->i_ino;
+		__entry->hashval = ctx->cursor->hashval;
+		__entry->blkno = ctx->cursor->blkno;
+		__entry->offset = ctx->cursor->offset;
+		__entry->alist = ctx->alist;
+		__entry->bufsize = ctx->bufsize;
+		__entry->count = ctx->count;
+		__entry->firstu = ctx->firstu;
+		__entry->flags = ctx->flags;
+		__entry->bt_hashval = be32_to_cpu(btree->hashval);
+		__entry->bt_before = be32_to_cpu(btree->before);
+	),
+	TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
+		  "alist 0x%p size %u count %u firstu %u flags %d %s "
+		  "node hashval %u, node before %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		   __entry->ino,
+		   __entry->hashval,
+		   __entry->blkno,
+		   __entry->offset,
+		   __entry->dupcnt,
+		   __entry->alist,
+		   __entry->bufsize,
+		   __entry->count,
+		   __entry->firstu,
+		   __entry->flags,
+		   __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
+		   __entry->bt_hashval,
+		   __entry->bt_before)
+);
+
+TRACE_EVENT(xfs_iext_insert,
+	TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx,
+		 struct xfs_bmbt_irec *r, int state, unsigned long caller_ip),
+	TP_ARGS(ip, idx, r, state, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_extnum_t, idx)
+		__field(xfs_fileoff_t, startoff)
+		__field(xfs_fsblock_t, startblock)
+		__field(xfs_filblks_t, blockcount)
+		__field(xfs_exntst_t, state)
+		__field(int, bmap_state)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->idx = idx;
+		__entry->startoff = r->br_startoff;
+		__entry->startblock = r->br_startblock;
+		__entry->blockcount = r->br_blockcount;
+		__entry->state = r->br_state;
+		__entry->bmap_state = state;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
+		  "offset %lld block %s count %lld flag %d caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
+		  (long)__entry->idx,
+		  __entry->startoff,
+		  xfs_fmtfsblock(__entry->startblock),
+		  __entry->blockcount,
+		  __entry->state,
+		  (char *)__entry->caller_ip)
+);
+
+#define DEFINE_BMAP_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \
+		 unsigned long caller_ip), \
+	TP_ARGS(ip, idx, state, caller_ip), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_ino_t, ino) \
+		__field(xfs_extnum_t, idx) \
+		__field(xfs_fileoff_t, startoff) \
+		__field(xfs_fsblock_t, startblock) \
+		__field(xfs_filblks_t, blockcount) \
+		__field(xfs_exntst_t, state) \
+		__field(int, bmap_state) \
+		__field(unsigned long, caller_ip) \
+	), \
+	TP_fast_assign( \
+		struct xfs_ifork	*ifp = (state & BMAP_ATTRFORK) ? \
+						ip->i_afp : &ip->i_df; \
+		struct xfs_bmbt_irec	r; \
+	\
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); \
+		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
+		__entry->ino = ip->i_ino; \
+		__entry->idx = idx; \
+		__entry->startoff = r.br_startoff; \
+		__entry->startblock = r.br_startblock; \
+		__entry->blockcount = r.br_blockcount; \
+		__entry->state = r.br_state; \
+		__entry->bmap_state = state; \
+		__entry->caller_ip = caller_ip; \
+	), \
+	TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " \
+		  "offset %lld block %s count %lld flag %d caller %pf", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->ino, \
+		  __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), \
+		  (long)__entry->idx, \
+		  __entry->startoff, \
+		  xfs_fmtfsblock(__entry->startblock), \
+		  __entry->blockcount, \
+		  __entry->state, \
+		  (char *)__entry->caller_ip) \
+)
+
+DEFINE_BMAP_EVENT(xfs_iext_remove);
+DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
+DEFINE_BMAP_EVENT(xfs_bmap_post_update);
+DEFINE_BMAP_EVENT(xfs_extlist);
+
+#define DEFINE_BUF_EVENT(tname) \
+TRACE_EVENT(tname, \
+	TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \
+	TP_ARGS(bp, caller_ip), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_daddr_t, bno) \
+		__field(size_t, buffer_length) \
+		__field(int, hold) \
+		__field(int, pincount) \
+		__field(unsigned, lockval) \
+		__field(unsigned, flags) \
+		__field(unsigned long, caller_ip) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = bp->b_target->bt_dev; \
+		__entry->bno = bp->b_bn; \
+		__entry->buffer_length = bp->b_buffer_length; \
+		__entry->hold = atomic_read(&bp->b_hold); \
+		__entry->pincount = atomic_read(&bp->b_pin_count); \
+		__entry->lockval = xfs_buf_lock_value(bp); \
+		__entry->flags = bp->b_flags; \
+		__entry->caller_ip = caller_ip; \
+	), \
+	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \
+		  "lock %d flags %s caller %pf", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  (unsigned long long)__entry->bno, \
+		  __entry->buffer_length, \
+		  __entry->hold, \
+		  __entry->pincount, \
+		  __entry->lockval, \
+		  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), \
+		  (void *)__entry->caller_ip) \
+)
+DEFINE_BUF_EVENT(xfs_buf_init);
+DEFINE_BUF_EVENT(xfs_buf_free);
+DEFINE_BUF_EVENT(xfs_buf_hold);
+DEFINE_BUF_EVENT(xfs_buf_rele);
+DEFINE_BUF_EVENT(xfs_buf_pin);
+DEFINE_BUF_EVENT(xfs_buf_unpin);
+DEFINE_BUF_EVENT(xfs_buf_iodone);
+DEFINE_BUF_EVENT(xfs_buf_iorequest);
+DEFINE_BUF_EVENT(xfs_buf_bawrite);
+DEFINE_BUF_EVENT(xfs_buf_bdwrite);
+DEFINE_BUF_EVENT(xfs_buf_lock);
+DEFINE_BUF_EVENT(xfs_buf_lock_done);
+DEFINE_BUF_EVENT(xfs_buf_cond_lock);
+DEFINE_BUF_EVENT(xfs_buf_unlock);
+DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
+DEFINE_BUF_EVENT(xfs_buf_iowait);
+DEFINE_BUF_EVENT(xfs_buf_iowait_done);
+DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
+DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
+DEFINE_BUF_EVENT(xfs_buf_delwri_split);
+DEFINE_BUF_EVENT(xfs_buf_get_noaddr);
+DEFINE_BUF_EVENT(xfs_bdstrat_shut);
+DEFINE_BUF_EVENT(xfs_buf_item_relse);
+DEFINE_BUF_EVENT(xfs_buf_item_iodone);
+DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
+DEFINE_BUF_EVENT(xfs_buf_error_relse);
+DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
+DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
+
+/* not really buffer traces, but the buf provides useful information */
+DEFINE_BUF_EVENT(xfs_btree_corrupt);
+DEFINE_BUF_EVENT(xfs_da_btree_corrupt);
+DEFINE_BUF_EVENT(xfs_reset_dqcounts);
+DEFINE_BUF_EVENT(xfs_inode_item_push);
+
+/* pass flags explicitly */
+#define DEFINE_BUF_FLAGS_EVENT(tname) \
+TRACE_EVENT(tname, \
+	TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \
+	TP_ARGS(bp, flags, caller_ip), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_daddr_t, bno) \
+		__field(size_t, buffer_length) \
+		__field(int, hold) \
+		__field(int, pincount) \
+		__field(unsigned, lockval) \
+		__field(unsigned, flags) \
+		__field(unsigned long, caller_ip) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = bp->b_target->bt_dev; \
+		__entry->bno = bp->b_bn; \
+		__entry->buffer_length = bp->b_buffer_length; \
+		__entry->flags = flags; \
+		__entry->hold = atomic_read(&bp->b_hold); \
+		__entry->pincount = atomic_read(&bp->b_pin_count); \
+		__entry->lockval = xfs_buf_lock_value(bp); \
+		__entry->caller_ip = caller_ip; \
+	), \
+	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \
+		  "lock %d flags %s caller %pf", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  (unsigned long long)__entry->bno, \
+		  __entry->buffer_length, \
+		  __entry->hold, \
+		  __entry->pincount, \
+		  __entry->lockval, \
+		  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), \
+		  (void *)__entry->caller_ip) \
+)
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
+
+TRACE_EVENT(xfs_buf_ioerror,
+	TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip),
+	TP_ARGS(bp, error, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_daddr_t, bno)
+		__field(size_t, buffer_length)
+		__field(unsigned, flags)
+		__field(int, hold)
+		__field(int, pincount)
+		__field(unsigned, lockval)
+		__field(int, error)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = bp->b_target->bt_dev;
+		__entry->bno = bp->b_bn;
+		__entry->buffer_length = bp->b_buffer_length;
+		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->pincount = atomic_read(&bp->b_pin_count);
+		__entry->lockval = xfs_buf_lock_value(bp);
+		__entry->error = error;
+		__entry->flags = bp->b_flags;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+		  "lock %d error %d flags %s caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->bno,
+		  __entry->buffer_length,
+		  __entry->hold,
+		  __entry->pincount,
+		  __entry->lockval,
+		  __entry->error,
+		  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+		  (void *)__entry->caller_ip)
+);
+
+#define DEFINE_BUF_ITEM_EVENT(tname) \
+TRACE_EVENT(tname, \
+	TP_PROTO(struct xfs_buf_log_item *bip), \
+	TP_ARGS(bip), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_daddr_t, buf_bno) \
+		__field(size_t, buf_len) \
+		__field(int, buf_hold) \
+		__field(int, buf_pincount) \
+		__field(int, buf_lockval) \
+		__field(unsigned, buf_flags) \
+		__field(unsigned, bli_recur) \
+		__field(int, bli_refcount) \
+		__field(unsigned, bli_flags) \
+		__field(void *, li_desc) \
+		__field(unsigned, li_flags) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = bip->bli_buf->b_target->bt_dev; \
+		__entry->bli_flags = bip->bli_flags; \
+		__entry->bli_recur = bip->bli_recur; \
+		__entry->bli_refcount = atomic_read(&bip->bli_refcount); \
+		__entry->buf_bno = bip->bli_buf->b_bn; \
+		__entry->buf_len = bip->bli_buf->b_buffer_length; \
+		__entry->buf_flags = bip->bli_buf->b_flags; \
+		__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); \
+		__entry->buf_pincount = \
+			atomic_read(&bip->bli_buf->b_pin_count); \
+		__entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf); \
+		__entry->li_desc = bip->bli_item.li_desc; \
+		__entry->li_flags = bip->bli_item.li_flags; \
+	), \
+	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \
+		  "lock %d flags %s recur %d refcount %d bliflags %s " \
+		  "lidesc 0x%p liflags %s", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  (unsigned long long)__entry->buf_bno, \
+		  __entry->buf_len, \
+		  __entry->buf_hold, \
+		  __entry->buf_pincount, \
+		  __entry->buf_lockval, \
+		  __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), \
+		  __entry->bli_recur, \
+		  __entry->bli_refcount, \
+		  __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), \
+		  __entry->li_desc, \
+		  __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) \
+)
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
+
+#define DEFINE_LOCK_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \
+		 unsigned long caller_ip), \
+	TP_ARGS(ip,  lock_flags, caller_ip), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_ino_t, ino) \
+		__field(int, lock_flags) \
+		__field(unsigned long, caller_ip) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
+		__entry->ino = ip->i_ino; \
+		__entry->lock_flags = lock_flags; \
+		__entry->caller_ip = caller_ip; \
+	), \
+	TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->ino, \
+		  __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), \
+		  (void *)__entry->caller_ip) \
+)
+
+DEFINE_LOCK_EVENT(xfs_ilock);
+DEFINE_LOCK_EVENT(xfs_ilock_nowait);
+DEFINE_LOCK_EVENT(xfs_ilock_demote);
+DEFINE_LOCK_EVENT(xfs_iunlock);
+
+#define DEFINE_IGET_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct xfs_inode *ip), \
+	TP_ARGS(ip), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_ino_t, ino) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
+		__entry->ino = ip->i_ino; \
+	), \
+	TP_printk("dev %d:%d ino 0x%llx", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->ino) \
+)
+DEFINE_IGET_EVENT(xfs_iget_skip);
+DEFINE_IGET_EVENT(xfs_iget_reclaim);
+DEFINE_IGET_EVENT(xfs_iget_found);
+DEFINE_IGET_EVENT(xfs_iget_alloc);
+
+#define DEFINE_INODE_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
+	TP_ARGS(ip, caller_ip), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_ino_t, ino) \
+		__field(int, count) \
+		__field(unsigned long, caller_ip) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
+		__entry->ino = ip->i_ino; \
+		__entry->count = atomic_read(&VFS_I(ip)->i_count); \
+		__entry->caller_ip = caller_ip; \
+	), \
+	TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->ino, \
+		  __entry->count, \
+		  (char *)__entry->caller_ip) \
+)
+DEFINE_INODE_EVENT(xfs_ihold);
+DEFINE_INODE_EVENT(xfs_irele);
+/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
+DEFINE_INODE_EVENT(xfs_inode);
+#define xfs_itrace_entry(ip)    \
+	trace_xfs_inode(ip, _THIS_IP_)
+
+#define DEFINE_DQUOT_EVENT(tname) \
+TRACE_EVENT(tname, \
+	TP_PROTO(struct xfs_dquot *dqp), \
+	TP_ARGS(dqp), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(__be32, id) \
+		__field(unsigned, flags) \
+		__field(unsigned, nrefs) \
+		__field(unsigned long long, res_bcount) \
+		__field(unsigned long long, bcount) \
+		__field(unsigned long long, icount) \
+		__field(unsigned long long, blk_hardlimit) \
+		__field(unsigned long long, blk_softlimit) \
+		__field(unsigned long long, ino_hardlimit) \
+		__field(unsigned long long, ino_softlimit) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = dqp->q_mount->m_super->s_dev; \
+		__entry->id = dqp->q_core.d_id; \
+		__entry->flags = dqp->dq_flags; \
+		__entry->nrefs = dqp->q_nrefs; \
+		__entry->res_bcount = dqp->q_res_bcount; \
+		__entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); \
+		__entry->icount = be64_to_cpu(dqp->q_core.d_icount); \
+		__entry->blk_hardlimit = \
+			be64_to_cpu(dqp->q_core.d_blk_hardlimit); \
+		__entry->blk_softlimit = \
+			be64_to_cpu(dqp->q_core.d_blk_softlimit); \
+		__entry->ino_hardlimit = \
+			be64_to_cpu(dqp->q_core.d_ino_hardlimit); \
+		__entry->ino_softlimit = \
+			be64_to_cpu(dqp->q_core.d_ino_softlimit); \
+	), \
+	TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " \
+		  "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " \
+		  "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  be32_to_cpu(__entry->id), \
+		  __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), \
+		  __entry->nrefs, \
+		  __entry->res_bcount, \
+		  __entry->bcount, \
+		  __entry->blk_hardlimit, \
+		  __entry->blk_softlimit, \
+		  __entry->icount, \
+		  __entry->ino_hardlimit, \
+		  __entry->ino_softlimit) \
+)
+DEFINE_DQUOT_EVENT(xfs_dqadjust);
+DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
+DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
+DEFINE_DQUOT_EVENT(xfs_dqattach_found);
+DEFINE_DQUOT_EVENT(xfs_dqattach_get);
+DEFINE_DQUOT_EVENT(xfs_dqinit);
+DEFINE_DQUOT_EVENT(xfs_dqreuse);
+DEFINE_DQUOT_EVENT(xfs_dqalloc);
+DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
+DEFINE_DQUOT_EVENT(xfs_dqread);
+DEFINE_DQUOT_EVENT(xfs_dqread_fail);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
+DEFINE_DQUOT_EVENT(xfs_dqget_hit);
+DEFINE_DQUOT_EVENT(xfs_dqget_miss);
+DEFINE_DQUOT_EVENT(xfs_dqput);
+DEFINE_DQUOT_EVENT(xfs_dqput_wait);
+DEFINE_DQUOT_EVENT(xfs_dqput_free);
+DEFINE_DQUOT_EVENT(xfs_dqrele);
+DEFINE_DQUOT_EVENT(xfs_dqflush);
+DEFINE_DQUOT_EVENT(xfs_dqflush_force);
+DEFINE_DQUOT_EVENT(xfs_dqflush_done);
+/* not really iget events, but we re-use the format */
+DEFINE_IGET_EVENT(xfs_dquot_dqalloc);
+DEFINE_IGET_EVENT(xfs_dquot_dqdetach);
+
+
+#define DEFINE_LOGGRANT_EVENT(tname) \
+TRACE_EVENT(tname, \
+	TP_PROTO(struct log *log, struct xlog_ticket *tic), \
+	TP_ARGS(log, tic), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(unsigned, trans_type) \
+		__field(char, ocnt) \
+		__field(char, cnt) \
+		__field(int, curr_res) \
+		__field(int, unit_res) \
+		__field(unsigned int, flags) \
+		__field(void *, reserve_headq) \
+		__field(void *, write_headq) \
+		__field(int, grant_reserve_cycle) \
+		__field(int, grant_reserve_bytes) \
+		__field(int, grant_write_cycle) \
+		__field(int, grant_write_bytes) \
+		__field(int, curr_cycle) \
+		__field(int, curr_block) \
+		__field(xfs_lsn_t, tail_lsn) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = log->l_mp->m_super->s_dev; \
+		__entry->trans_type = tic->t_trans_type; \
+		__entry->ocnt = tic->t_ocnt; \
+		__entry->cnt = tic->t_cnt; \
+		__entry->curr_res = tic->t_curr_res; \
+		__entry->unit_res = tic->t_unit_res; \
+		__entry->flags = tic->t_flags; \
+		__entry->reserve_headq = log->l_reserve_headq; \
+		__entry->write_headq = log->l_write_headq; \
+		__entry->grant_reserve_cycle = log->l_grant_reserve_cycle; \
+		__entry->grant_reserve_bytes = log->l_grant_reserve_bytes; \
+		__entry->grant_write_cycle = log->l_grant_write_cycle; \
+		__entry->grant_write_bytes = log->l_grant_write_bytes; \
+		__entry->curr_cycle = log->l_curr_cycle; \
+		__entry->curr_block = log->l_curr_block; \
+		__entry->tail_lsn = log->l_tail_lsn; \
+	), \
+	TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " \
+		  "t_unit_res %u t_flags %s reserve_headq 0x%p " \
+		  "write_headq 0x%p grant_reserve_cycle %d " \
+		  "grant_reserve_bytes %d grant_write_cycle %d " \
+		  "grant_write_bytes %d curr_cycle %d curr_block %d " \
+		  "tail_cycle %d tail_block %d", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), \
+		  __entry->ocnt, \
+		  __entry->cnt, \
+		  __entry->curr_res, \
+		  __entry->unit_res, \
+		  __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), \
+		  __entry->reserve_headq, \
+		  __entry->write_headq, \
+		  __entry->grant_reserve_cycle, \
+		  __entry->grant_reserve_bytes, \
+		  __entry->grant_write_cycle, \
+		  __entry->grant_write_bytes, \
+		  __entry->curr_cycle, \
+		  __entry->curr_block, \
+		  CYCLE_LSN(__entry->tail_lsn), \
+		  BLOCK_LSN(__entry->tail_lsn) \
+	) \
+)
+DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
+DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
+DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
+
+#define DEFINE_RW_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
+	TP_ARGS(ip, count, offset, flags), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_ino_t, ino) \
+		__field(xfs_fsize_t, size) \
+		__field(xfs_fsize_t, new_size) \
+		__field(loff_t, offset) \
+		__field(size_t, count) \
+		__field(int, flags) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
+		__entry->ino = ip->i_ino; \
+		__entry->size = ip->i_d.di_size; \
+		__entry->new_size = ip->i_new_size; \
+		__entry->offset = offset; \
+		__entry->count = count; \
+		__entry->flags = flags; \
+	), \
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
+		  "offset 0x%llx count 0x%zx ioflags %s", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->ino, \
+		  __entry->size, \
+		  __entry->new_size, \
+		  __entry->offset, \
+		  __entry->count, \
+		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
+)
+DEFINE_RW_EVENT(xfs_file_read);
+DEFINE_RW_EVENT(xfs_file_buffered_write);
+DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_splice_read);
+DEFINE_RW_EVENT(xfs_file_splice_write);
+
+
+#define DEFINE_PAGE_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
+	TP_ARGS(inode, page, off), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_ino_t, ino) \
+		__field(pgoff_t, pgoff) \
+		__field(loff_t, size) \
+		__field(unsigned long, offset) \
+		__field(int, delalloc) \
+		__field(int, unmapped) \
+		__field(int, unwritten) \
+	), \
+	TP_fast_assign( \
+		int delalloc = -1, unmapped = -1, unwritten = -1; \
+	\
+		if (page_has_buffers(page)) \
+			xfs_count_page_state(page, &delalloc, \
+					     &unmapped, &unwritten); \
+		__entry->dev = inode->i_sb->s_dev; \
+		__entry->ino = XFS_I(inode)->i_ino; \
+		__entry->pgoff = page_offset(page); \
+		__entry->size = i_size_read(inode); \
+		__entry->offset = off; \
+		__entry->delalloc = delalloc; \
+		__entry->unmapped = unmapped; \
+		__entry->unwritten = unwritten; \
+	), \
+	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \
+		  "delalloc %d unmapped %d unwritten %d", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->ino, \
+		  __entry->pgoff, \
+		  __entry->size, \
+		  __entry->offset, \
+		  __entry->delalloc, \
+		  __entry->unmapped, \
+		  __entry->unwritten) \
+)
+DEFINE_PAGE_EVENT(xfs_writepage);
+DEFINE_PAGE_EVENT(xfs_releasepage);
+DEFINE_PAGE_EVENT(xfs_invalidatepage);
+
+#define DEFINE_IOMAP_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
+		 int flags, struct xfs_bmbt_irec *irec), \
+	TP_ARGS(ip, offset, count, flags, irec), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_ino_t, ino) \
+		__field(loff_t, size) \
+		__field(loff_t, new_size) \
+		__field(loff_t, offset) \
+		__field(size_t, count) \
+		__field(int, flags) \
+		__field(xfs_fileoff_t, startoff) \
+		__field(xfs_fsblock_t, startblock) \
+		__field(xfs_filblks_t, blockcount) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
+		__entry->ino = ip->i_ino; \
+		__entry->size = ip->i_d.di_size; \
+		__entry->new_size = ip->i_new_size; \
+		__entry->offset = offset; \
+		__entry->count = count; \
+		__entry->flags = flags; \
+		__entry->startoff = irec ? irec->br_startoff : 0; \
+		__entry->startblock = irec ? irec->br_startblock : 0; \
+		__entry->blockcount = irec ? irec->br_blockcount : 0; \
+	), \
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
+		  "offset 0x%llx count %zd flags %s " \
+		  "startoff 0x%llx startblock 0x%llx blockcount 0x%llx", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->ino, \
+		  __entry->size, \
+		  __entry->new_size, \
+		  __entry->offset, \
+		  __entry->count, \
+		  __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
+		  __entry->startoff, \
+		  __entry->startblock, \
+		  __entry->blockcount) \
+)
+DEFINE_IOMAP_EVENT(xfs_iomap_enter);
+DEFINE_IOMAP_EVENT(xfs_iomap_found);
+DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+
+#define DEFINE_SIMPLE_IO_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
+	TP_ARGS(ip, offset, count), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_ino_t, ino) \
+		__field(loff_t, size) \
+		__field(loff_t, new_size) \
+		__field(loff_t, offset) \
+		__field(size_t, count) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
+		__entry->ino = ip->i_ino; \
+		__entry->size = ip->i_d.di_size; \
+		__entry->new_size = ip->i_new_size; \
+		__entry->offset = offset; \
+		__entry->count = count; \
+	), \
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
+		  "offset 0x%llx count %zd", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->ino, \
+		  __entry->size, \
+		  __entry->new_size, \
+		  __entry->offset, \
+		  __entry->count) \
+);
+DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
+DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
+
+
+TRACE_EVENT(xfs_itruncate_start,
+	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size, int flag,
+		 xfs_off_t toss_start, xfs_off_t toss_finish),
+	TP_ARGS(ip, new_size, flag, toss_start, toss_finish),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(xfs_fsize_t, new_size)
+		__field(xfs_off_t, toss_start)
+		__field(xfs_off_t, toss_finish)
+		__field(int, flag)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = new_size;
+		__entry->toss_start = toss_start;
+		__entry->toss_finish = toss_finish;
+		__entry->flag = flag;
+	),
+	TP_printk("dev %d:%d ino 0x%llx %s size 0x%llx new_size 0x%llx "
+		  "toss start 0x%llx toss finish 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_flags(__entry->flag, "|", XFS_ITRUNC_FLAGS),
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->toss_start,
+		  __entry->toss_finish)
+);
+
+#define DEFINE_ITRUNC_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
+	TP_ARGS(ip, new_size), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_ino_t, ino) \
+		__field(xfs_fsize_t, size) \
+		__field(xfs_fsize_t, new_size) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
+		__entry->ino = ip->i_ino; \
+		__entry->size = ip->i_d.di_size; \
+		__entry->new_size = new_size; \
+	), \
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->ino, \
+		  __entry->size, \
+		  __entry->new_size) \
+)
+DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end);
+
+TRACE_EVENT(xfs_pagecache_inval,
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
+	TP_ARGS(ip, start, finish),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(xfs_off_t, start)
+		__field(xfs_off_t, finish)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->start = start;
+		__entry->finish = finish;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->start,
+		  __entry->finish)
+);
+
+TRACE_EVENT(xfs_bunmap,
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len,
+		 int flags, unsigned long caller_ip),
+	TP_ARGS(ip, bno, len, flags, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(xfs_fileoff_t, bno)
+		__field(xfs_filblks_t, len)
+		__field(unsigned long, caller_ip)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->bno = bno;
+		__entry->len = len;
+		__entry->caller_ip = caller_ip;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx"
+		  "flags %s caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->bno,
+		  __entry->len,
+		  __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS),
+		  (void *)__entry->caller_ip)
+
+);
+
+TRACE_EVENT(xfs_alloc_busy,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+		 xfs_extlen_t len, int slot),
+	TP_ARGS(mp, agno, agbno, len, slot),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(int, slot)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+		__entry->slot = slot;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u slot %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->slot)
+
+);
+
+#define XFS_BUSY_STATES \
+	{ 0,	"found" }, \
+	{ 1,	"missing" }
+
+TRACE_EVENT(xfs_alloc_unbusy,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 int slot, int found),
+	TP_ARGS(mp, agno, slot, found),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, slot)
+		__field(int, found)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->slot = slot;
+		__entry->found = found;
+	),
+	TP_printk("dev %d:%d agno %u slot %d %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->slot,
+		  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+);
+
+TRACE_EVENT(xfs_alloc_busysearch,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+		 xfs_extlen_t len, int found),
+	TP_ARGS(mp, agno, agbno, len, found),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(int, found)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+		__entry->found = found;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+);
+
+TRACE_EVENT(xfs_agf,
+	TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags,
+		 unsigned long caller_ip),
+	TP_ARGS(mp, agf, flags, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, flags)
+		__field(__u32, length)
+		__field(__u32, bno_root)
+		__field(__u32, cnt_root)
+		__field(__u32, bno_level)
+		__field(__u32, cnt_level)
+		__field(__u32, flfirst)
+		__field(__u32, fllast)
+		__field(__u32, flcount)
+		__field(__u32, freeblks)
+		__field(__u32, longest)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = be32_to_cpu(agf->agf_seqno),
+		__entry->flags = flags;
+		__entry->length = be32_to_cpu(agf->agf_length),
+		__entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
+		__entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
+		__entry->bno_level =
+				be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
+		__entry->cnt_level =
+				be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
+		__entry->flfirst = be32_to_cpu(agf->agf_flfirst),
+		__entry->fllast = be32_to_cpu(agf->agf_fllast),
+		__entry->flcount = be32_to_cpu(agf->agf_flcount),
+		__entry->freeblks = be32_to_cpu(agf->agf_freeblks),
+		__entry->longest = be32_to_cpu(agf->agf_longest);
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
+		  "levels b %u c %u flfirst %u fllast %u flcount %u "
+		  "freeblks %u longest %u caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
+		  __entry->length,
+		  __entry->bno_root,
+		  __entry->cnt_root,
+		  __entry->bno_level,
+		  __entry->cnt_level,
+		  __entry->flfirst,
+		  __entry->fllast,
+		  __entry->flcount,
+		  __entry->freeblks,
+		  __entry->longest,
+		  (void *)__entry->caller_ip)
+);
+
+TRACE_EVENT(xfs_free_extent,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+		 xfs_extlen_t len, bool isfl, int haveleft, int haveright),
+	TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(int, isfl)
+		__field(int, haveleft)
+		__field(int, haveright)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+		__entry->isfl = isfl;
+		__entry->haveleft = haveleft;
+		__entry->haveright = haveright;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->isfl,
+		  __entry->haveleft ?
+			(__entry->haveright ? "both" : "left") :
+			(__entry->haveright ? "right" : "none"))
+
+);
+
+#define DEFINE_ALLOC_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct xfs_alloc_arg *args), \
+	TP_ARGS(args), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_agnumber_t, agno) \
+		__field(xfs_agblock_t, agbno) \
+		__field(xfs_extlen_t, minlen) \
+		__field(xfs_extlen_t, maxlen) \
+		__field(xfs_extlen_t, mod) \
+		__field(xfs_extlen_t, prod) \
+		__field(xfs_extlen_t, minleft) \
+		__field(xfs_extlen_t, total) \
+		__field(xfs_extlen_t, alignment) \
+		__field(xfs_extlen_t, minalignslop) \
+		__field(xfs_extlen_t, len) \
+		__field(short, type) \
+		__field(short, otype) \
+		__field(char, wasdel) \
+		__field(char, wasfromfl) \
+		__field(char, isfl) \
+		__field(char, userdata) \
+		__field(xfs_fsblock_t, firstblock) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = args->mp->m_super->s_dev; \
+		__entry->agno = args->agno; \
+		__entry->agbno = args->agbno; \
+		__entry->minlen = args->minlen; \
+		__entry->maxlen = args->maxlen; \
+		__entry->mod = args->mod; \
+		__entry->prod = args->prod; \
+		__entry->minleft = args->minleft; \
+		__entry->total = args->total; \
+		__entry->alignment = args->alignment; \
+		__entry->minalignslop = args->minalignslop; \
+		__entry->len = args->len; \
+		__entry->type = args->type; \
+		__entry->otype = args->otype; \
+		__entry->wasdel = args->wasdel; \
+		__entry->wasfromfl = args->wasfromfl; \
+		__entry->isfl = args->isfl; \
+		__entry->userdata = args->userdata; \
+		__entry->firstblock = args->firstblock; \
+	), \
+	TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " \
+		  "prod %u minleft %u total %u alignment %u minalignslop %u " \
+		  "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " \
+		  "userdata %d firstblock 0x%llx", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->agno, \
+		  __entry->agbno, \
+		  __entry->minlen, \
+		  __entry->maxlen, \
+		  __entry->mod, \
+		  __entry->prod, \
+		  __entry->minleft, \
+		  __entry->total, \
+		  __entry->alignment, \
+		  __entry->minalignslop, \
+		  __entry->len, \
+		  __print_symbolic(__entry->type, XFS_ALLOC_TYPES), \
+		  __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), \
+		  __entry->wasdel, \
+		  __entry->wasfromfl, \
+		  __entry->isfl, \
+		  __entry->userdata, \
+		  __entry->firstblock) \
+)
+
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
+
+#define DEFINE_DIR2_TRACE(tname) \
+TRACE_EVENT(tname, \
+	TP_PROTO(struct xfs_da_args *args), \
+	TP_ARGS(args), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_ino_t, ino) \
+		__dynamic_array(char, name, args->namelen) \
+		__field(int, namelen) \
+		__field(xfs_dahash_t, hashval) \
+		__field(xfs_ino_t, inumber) \
+		__field(int, op_flags) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = VFS_I(args->dp)->i_sb->s_dev; \
+		__entry->ino = args->dp->i_ino; \
+		if (args->namelen) \
+			memcpy(__get_str(name), args->name, args->namelen); \
+		__entry->namelen = args->namelen; \
+		__entry->hashval = args->hashval; \
+		__entry->inumber = args->inumber; \
+		__entry->op_flags = args->op_flags; \
+	), \
+	TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " \
+		  "inumber 0x%llx op_flags %s", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->ino, \
+		  __entry->namelen, \
+		  __entry->namelen ? __get_str(name) : NULL, \
+		  __entry->namelen, \
+		  __entry->hashval, \
+		  __entry->inumber, \
+		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) \
+)
+DEFINE_DIR2_TRACE(xfs_dir2_sf_addname);
+DEFINE_DIR2_TRACE(xfs_dir2_sf_create);
+DEFINE_DIR2_TRACE(xfs_dir2_sf_lookup);
+DEFINE_DIR2_TRACE(xfs_dir2_sf_replace);
+DEFINE_DIR2_TRACE(xfs_dir2_sf_removename);
+DEFINE_DIR2_TRACE(xfs_dir2_sf_toino4);
+DEFINE_DIR2_TRACE(xfs_dir2_sf_toino8);
+DEFINE_DIR2_TRACE(xfs_dir2_sf_to_block);
+DEFINE_DIR2_TRACE(xfs_dir2_block_addname);
+DEFINE_DIR2_TRACE(xfs_dir2_block_lookup);
+DEFINE_DIR2_TRACE(xfs_dir2_block_replace);
+DEFINE_DIR2_TRACE(xfs_dir2_block_removename);
+DEFINE_DIR2_TRACE(xfs_dir2_block_to_sf);
+DEFINE_DIR2_TRACE(xfs_dir2_block_to_leaf);
+DEFINE_DIR2_TRACE(xfs_dir2_leaf_addname);
+DEFINE_DIR2_TRACE(xfs_dir2_leaf_lookup);
+DEFINE_DIR2_TRACE(xfs_dir2_leaf_replace);
+DEFINE_DIR2_TRACE(xfs_dir2_leaf_removename);
+DEFINE_DIR2_TRACE(xfs_dir2_leaf_to_block);
+DEFINE_DIR2_TRACE(xfs_dir2_leaf_to_node);
+DEFINE_DIR2_TRACE(xfs_dir2_node_addname);
+DEFINE_DIR2_TRACE(xfs_dir2_node_lookup);
+DEFINE_DIR2_TRACE(xfs_dir2_node_replace);
+DEFINE_DIR2_TRACE(xfs_dir2_node_removename);
+DEFINE_DIR2_TRACE(xfs_dir2_node_to_leaf);
+
+#define DEFINE_DIR2_SPACE_TRACE(tname) \
+TRACE_EVENT(tname, \
+	TP_PROTO(struct xfs_da_args *args, int idx), \
+	TP_ARGS(args, idx), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_ino_t, ino) \
+		__field(int, op_flags) \
+		__field(int, idx) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = VFS_I(args->dp)->i_sb->s_dev; \
+		__entry->ino = args->dp->i_ino; \
+		__entry->op_flags = args->op_flags; \
+		__entry->idx = idx; \
+	), \
+	TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->ino, \
+		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), \
+		  __entry->idx) \
+)
+DEFINE_DIR2_SPACE_TRACE(xfs_dir2_leafn_add);
+DEFINE_DIR2_SPACE_TRACE(xfs_dir2_leafn_remove);
+DEFINE_DIR2_SPACE_TRACE(xfs_dir2_grow_inode);
+DEFINE_DIR2_SPACE_TRACE(xfs_dir2_shrink_inode);
+
+TRACE_EVENT(xfs_dir2_leafn_moveents,
+	TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count),
+	TP_ARGS(args, src_idx, dst_idx, count),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, op_flags)
+		__field(int, src_idx)
+		__field(int, dst_idx)
+		__field(int, count)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+		__entry->ino = args->dp->i_ino;
+		__entry->op_flags = args->op_flags;
+		__entry->src_idx = src_idx;
+		__entry->dst_idx = dst_idx;
+		__entry->count = count;
+	),
+	TP_printk("dev %d:%d ino 0x%llx op_flags %s "
+		  "src_idx %d dst_idx %d count %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
+		  __entry->src_idx,
+		  __entry->dst_idx,
+		  __entry->count)
+);
+
+#endif /* _TRACE_XFS_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE xfs_trace
+#include <trace/define_trace.h>
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index ad7fbead4c9..7c220b4227b 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -36,10 +36,13 @@ struct attrlist_cursor_kern;
 /*
  * Flags for read/write calls - same values as IRIX
  */
-#define IO_ISAIO	0x00001		/* don't wait for completion */
 #define IO_ISDIRECT	0x00004		/* bypass page cache */
 #define IO_INVIS	0x00020		/* don't update inode timestamps */
 
+#define XFS_IO_FLAGS \
+	{ IO_ISDIRECT,	"DIRECT" }, \
+	{ IO_INVIS,	"INVIS"}
+
 /*
  * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
  */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 2f3f2229eaa..d7c7eea09fc 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -47,6 +47,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_trans_priv.h"
 #include "xfs_qm.h"
+#include "xfs_trace.h"
 
 
 /*
@@ -112,10 +113,7 @@ xfs_qm_dqinit(
 		init_completion(&dqp->q_flush);
 		complete(&dqp->q_flush);
 
-#ifdef XFS_DQUOT_TRACE
-		dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_NOFS);
-		xfs_dqtrace_entry(dqp, "DQINIT");
-#endif
+		trace_xfs_dqinit(dqp);
 	} else {
 		/*
 		 * Only the q_core portion was zeroed in dqreclaim_one().
@@ -136,10 +134,7 @@ xfs_qm_dqinit(
 		 dqp->q_hash = NULL;
 		 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
 
-#ifdef XFS_DQUOT_TRACE
-		 ASSERT(dqp->q_trace);
-		 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
-#endif
+		trace_xfs_dqreuse(dqp);
 	}
 
 	/*
@@ -167,13 +162,8 @@ xfs_qm_dqdestroy(
 
 	mutex_destroy(&dqp->q_qlock);
 	sv_destroy(&dqp->q_pinwait);
-
-#ifdef XFS_DQUOT_TRACE
-	if (dqp->q_trace)
-	     ktrace_free(dqp->q_trace);
-	dqp->q_trace = NULL;
-#endif
 	kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
+
 	atomic_dec(&xfs_Gqm->qm_totaldquots);
 }
 
@@ -195,49 +185,6 @@ xfs_qm_dqinit_core(
 	d->dd_diskdq.d_flags = type;
 }
 
-
-#ifdef XFS_DQUOT_TRACE
-/*
- * Dquot tracing for debugging.
- */
-/* ARGSUSED */
-void
-__xfs_dqtrace_entry(
-	xfs_dquot_t	*dqp,
-	char		*func,
-	void		*retaddr,
-	xfs_inode_t	*ip)
-{
-	xfs_dquot_t	*udqp = NULL;
-	xfs_ino_t	ino = 0;
-
-	ASSERT(dqp->q_trace);
-	if (ip) {
-		ino = ip->i_ino;
-		udqp = ip->i_udquot;
-	}
-	ktrace_enter(dqp->q_trace,
-		     (void *)(__psint_t)DQUOT_KTRACE_ENTRY,
-		     (void *)func,
-		     (void *)(__psint_t)dqp->q_nrefs,
-		     (void *)(__psint_t)dqp->dq_flags,
-		     (void *)(__psint_t)dqp->q_res_bcount,
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_bcount),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_icount),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_hardlimit),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_softlimit),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_hardlimit),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_softlimit),
-		     (void *)(__psint_t)be32_to_cpu(dqp->q_core.d_id),
-		     (void *)(__psint_t)current_pid(),
-		     (void *)(__psint_t)ino,
-		     (void *)(__psint_t)retaddr,
-		     (void *)(__psint_t)udqp);
-	return;
-}
-#endif
-
-
 /*
  * If default limits are in force, push them into the dquot now.
  * We overwrite the dquot limits only if they are zero and this
@@ -425,7 +372,8 @@ xfs_qm_dqalloc(
 	xfs_trans_t	*tp = *tpp;
 
 	ASSERT(tp != NULL);
-	xfs_dqtrace_entry(dqp, "DQALLOC");
+
+	trace_xfs_dqalloc(dqp);
 
 	/*
 	 * Initialize the bmap freelist prior to calling bmapi code.
@@ -612,7 +560,8 @@ xfs_qm_dqtobp(
 	 * (in which case we already have the buf).
 	 */
 	if (! newdquot) {
-		xfs_dqtrace_entry(dqp, "DQTOBP READBUF");
+		trace_xfs_dqtobp_read(dqp);
+
 		if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 					       dqp->q_blkno,
 					       XFS_QI_DQCHUNKLEN(mp),
@@ -670,11 +619,12 @@ xfs_qm_dqread(
 
 	ASSERT(tpp);
 
+	trace_xfs_dqread(dqp);
+
 	/*
 	 * get a pointer to the on-disk dquot and the buffer containing it
 	 * dqp already knows its own type (GROUP/USER).
 	 */
-	xfs_dqtrace_entry(dqp, "DQREAD");
 	if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) {
 		return (error);
 	}
@@ -763,7 +713,7 @@ xfs_qm_idtodq(
 		 * or if the dquot didn't exist on disk and we ask to
 		 * allocate (ENOENT).
 		 */
-		xfs_dqtrace_entry(dqp, "DQREAD FAIL");
+		trace_xfs_dqread_fail(dqp);
 		cancelflags |= XFS_TRANS_ABORT;
 		goto error0;
 	}
@@ -817,7 +767,8 @@ xfs_qm_dqlookup(
 		 * id can't be modified without the hashlock anyway.
 		 */
 		if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) {
-			xfs_dqtrace_entry(dqp, "DQFOUND BY LOOKUP");
+			trace_xfs_dqlookup_found(dqp);
+
 			/*
 			 * All in core dquots must be on the dqlist of mp
 			 */
@@ -827,7 +778,7 @@ xfs_qm_dqlookup(
 			if (dqp->q_nrefs == 0) {
 				ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
 				if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
-					xfs_dqtrace_entry(dqp, "DQLOOKUP: WANT");
+					trace_xfs_dqlookup_want(dqp);
 
 					/*
 					 * We may have raced with dqreclaim_one()
@@ -857,8 +808,7 @@ xfs_qm_dqlookup(
 					/*
 					 * take it off the freelist
 					 */
-					xfs_dqtrace_entry(dqp,
-							"DQLOOKUP: TAKEOFF FL");
+					trace_xfs_dqlookup_freelist(dqp);
 					XQM_FREELIST_REMOVE(dqp);
 					/* xfs_qm_freelist_print(&(xfs_Gqm->
 							qm_dqfreelist),
@@ -878,8 +828,7 @@ xfs_qm_dqlookup(
 			 */
 			ASSERT(mutex_is_locked(&qh->qh_lock));
 			if (dqp->HL_PREVP != &qh->qh_next) {
-				xfs_dqtrace_entry(dqp,
-						  "DQLOOKUP: HASH MOVETOFRONT");
+				trace_xfs_dqlookup_move(dqp);
 				if ((d = dqp->HL_NEXT))
 					d->HL_PREVP = dqp->HL_PREVP;
 				*(dqp->HL_PREVP) = d;
@@ -889,7 +838,7 @@ xfs_qm_dqlookup(
 				dqp->HL_PREVP = &qh->qh_next;
 				qh->qh_next = dqp;
 			}
-			xfs_dqtrace_entry(dqp, "LOOKUP END");
+			trace_xfs_dqlookup_done(dqp);
 			*O_dqpp = dqp;
 			ASSERT(mutex_is_locked(&qh->qh_lock));
 			return (0);
@@ -971,7 +920,7 @@ xfs_qm_dqget(
 		ASSERT(*O_dqpp);
 		ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
 		mutex_unlock(&h->qh_lock);
-		xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
+		trace_xfs_dqget_hit(*O_dqpp);
 		return (0);	/* success */
 	}
 	XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
@@ -1104,7 +1053,7 @@ xfs_qm_dqget(
 	mutex_unlock(&h->qh_lock);
  dqret:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	xfs_dqtrace_entry(dqp, "DQGET DONE");
+	trace_xfs_dqget_miss(dqp);
 	*O_dqpp = dqp;
 	return (0);
 }
@@ -1124,7 +1073,8 @@ xfs_qm_dqput(
 
 	ASSERT(dqp->q_nrefs > 0);
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	xfs_dqtrace_entry(dqp, "DQPUT");
+
+	trace_xfs_dqput(dqp);
 
 	if (dqp->q_nrefs != 1) {
 		dqp->q_nrefs--;
@@ -1137,7 +1087,7 @@ xfs_qm_dqput(
 	 * in the right order; but try to get it out-of-order first
 	 */
 	if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
-		xfs_dqtrace_entry(dqp, "DQPUT: FLLOCK-WAIT");
+		trace_xfs_dqput_wait(dqp);
 		xfs_dqunlock(dqp);
 		xfs_qm_freelist_lock(xfs_Gqm);
 		xfs_dqlock(dqp);
@@ -1148,7 +1098,8 @@ xfs_qm_dqput(
 
 		/* We can't depend on nrefs being == 1 here */
 		if (--dqp->q_nrefs == 0) {
-			xfs_dqtrace_entry(dqp, "DQPUT: ON FREELIST");
+			trace_xfs_dqput_free(dqp);
+
 			/*
 			 * insert at end of the freelist.
 			 */
@@ -1196,7 +1147,7 @@ xfs_qm_dqrele(
 	if (!dqp)
 		return;
 
-	xfs_dqtrace_entry(dqp, "DQRELE");
+	trace_xfs_dqrele(dqp);
 
 	xfs_dqlock(dqp);
 	/*
@@ -1229,7 +1180,7 @@ xfs_qm_dqflush(
 
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 	ASSERT(!completion_done(&dqp->q_flush));
-	xfs_dqtrace_entry(dqp, "DQFLUSH");
+	trace_xfs_dqflush(dqp);
 
 	/*
 	 * If not dirty, or it's pinned and we are not supposed to
@@ -1259,7 +1210,6 @@ xfs_qm_dqflush(
 	 * the ondisk-dquot has already been allocated for.
 	 */
 	if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
-		xfs_dqtrace_entry(dqp, "DQTOBP FAIL");
 		ASSERT(error != ENOENT);
 		/*
 		 * Quotas could have gotten turned off (ESRCH)
@@ -1297,7 +1247,7 @@ xfs_qm_dqflush(
 	 * get stuck waiting in the write for too long.
 	 */
 	if (XFS_BUF_ISPINNED(bp)) {
-		xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE");
+		trace_xfs_dqflush_force(dqp);
 		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
 	}
 
@@ -1308,7 +1258,9 @@ xfs_qm_dqflush(
 	} else {
 		error = xfs_bwrite(mp, bp);
 	}
-	xfs_dqtrace_entry(dqp, "DQFLUSH END");
+
+	trace_xfs_dqflush_done(dqp);
+
 	/*
 	 * dqp is still locked, but caller is free to unlock it now.
 	 */
@@ -1483,7 +1435,7 @@ xfs_qm_dqpurge(
 	 */
 	if (XFS_DQ_IS_DIRTY(dqp)) {
 		int	error;
-		xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY");
+
 		/* dqflush unlocks dqflock */
 		/*
 		 * Given that dqpurge is a very rare occurrence, it is OK
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 6533ead9b88..a0f7da586d1 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -85,9 +85,6 @@ typedef struct xfs_dquot {
 	struct completion q_flush;	/* flush completion queue */
 	atomic_t          q_pincount;	/* dquot pin count */
 	wait_queue_head_t q_pinwait;	/* dquot pinning wait queue */
-#ifdef XFS_DQUOT_TRACE
-	struct ktrace	*q_trace;	/* trace header structure */
-#endif
 } xfs_dquot_t;
 
 
@@ -98,7 +95,7 @@ typedef struct xfs_dquot {
 #define dq_flags	q_lists.dqm_flags
 
 /*
- * Lock hierachy for q_qlock:
+ * Lock hierarchy for q_qlock:
  *	XFS_QLOCK_NORMAL is the implicit default,
  * 	XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
  */
@@ -144,24 +141,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
 				     (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
 				     (XFS_IS_OQUOTA_ON((d)->q_mount))))
 
-#ifdef XFS_DQUOT_TRACE
-/*
- * Dquot Tracing stuff.
- */
-#define DQUOT_TRACE_SIZE	64
-#define DQUOT_KTRACE_ENTRY	1
-
-extern void		__xfs_dqtrace_entry(xfs_dquot_t *dqp, char *func,
-					    void *, xfs_inode_t *);
-#define xfs_dqtrace_entry_ino(a,b,ip) \
-		__xfs_dqtrace_entry((a), (b), (void*)__return_address, (ip))
-#define xfs_dqtrace_entry(a,b) \
-		__xfs_dqtrace_entry((a), (b), (void*)__return_address, NULL)
-#else
-#define xfs_dqtrace_entry(a,b)
-#define xfs_dqtrace_entry_ino(a,b,ip)
-#endif
-
 #ifdef QUOTADEBUG
 extern void		xfs_qm_dqprint(xfs_dquot_t *);
 #else
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 45b1bfef738..9e627a8b5b0 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -47,6 +47,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_utils.h"
 #include "xfs_qm.h"
+#include "xfs_trace.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
@@ -453,7 +454,7 @@ again:
 			xfs_dqunlock(dqp);
 			continue;
 		}
-		xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY");
+
 		/* XXX a sentinel would be better */
 		recl = XFS_QI_MPLRECLAIMS(mp);
 		if (!xfs_dqflock_nowait(dqp)) {
@@ -651,7 +652,7 @@ xfs_qm_dqattach_one(
 	 */
 	dqp = *IO_idqpp;
 	if (dqp) {
-		xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
+		trace_xfs_dqattach_found(dqp);
 		return 0;
 	}
 
@@ -704,7 +705,7 @@ xfs_qm_dqattach_one(
 	if (error)
 		return error;
 
-	xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
+	trace_xfs_dqattach_get(dqp);
 
 	/*
 	 * dqget may have dropped and re-acquired the ilock, but it guarantees
@@ -890,15 +891,15 @@ xfs_qm_dqdetach(
 	if (!(ip->i_udquot || ip->i_gdquot))
 		return;
 
+	trace_xfs_dquot_dqdetach(ip);
+
 	ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
 	ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
 	if (ip->i_udquot) {
-		xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip);
 		xfs_qm_dqrele(ip->i_udquot);
 		ip->i_udquot = NULL;
 	}
 	if (ip->i_gdquot) {
-		xfs_dqtrace_entry_ino(ip->i_gdquot, "DQDETTACH", ip);
 		xfs_qm_dqrele(ip->i_gdquot);
 		ip->i_gdquot = NULL;
 	}
@@ -977,7 +978,6 @@ xfs_qm_sync(
 		 * across a disk write
 		 */
 		xfs_qm_mplist_unlock(mp);
-		xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH");
 		error = xfs_qm_dqflush(dqp, flush_flags);
 		xfs_dqunlock(dqp);
 		if (error && XFS_FORCED_SHUTDOWN(mp))
@@ -1350,7 +1350,8 @@ xfs_qm_reset_dqcounts(
 	xfs_disk_dquot_t	*ddq;
 	int			j;
 
-	xfs_buftrace("RESET DQUOTS", bp);
+	trace_xfs_reset_dqcounts(bp, _RET_IP_);
+
 	/*
 	 * Reset all counters and timers. They'll be
 	 * started afresh by xfs_qm_quotacheck.
@@ -1543,7 +1544,9 @@ xfs_qm_quotacheck_dqadjust(
 	xfs_qcnt_t		rtblks)
 {
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	xfs_dqtrace_entry(dqp, "QCHECK DQADJUST");
+
+	trace_xfs_dqadjust(dqp);
+
 	/*
 	 * Adjust the inode count and the block count to reflect this inode's
 	 * resource usage.
@@ -1994,7 +1997,9 @@ xfs_qm_shake_freelist(
 		 */
 		if (XFS_DQ_IS_DIRTY(dqp)) {
 			int	error;
-			xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY");
+
+			trace_xfs_dqshake_dirty(dqp);
+
 			/*
 			 * We flush it delayed write, so don't bother
 			 * releasing the mplock.
@@ -2038,7 +2043,9 @@ xfs_qm_shake_freelist(
 				return nreclaimed;
 			goto tryagain;
 		}
-		xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING");
+
+		trace_xfs_dqshake_unlink(dqp);
+
 #ifdef QUOTADEBUG
 		cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
 			dqp, be32_to_cpu(dqp->q_core.d_id));
@@ -2125,7 +2132,9 @@ xfs_qm_dqreclaim_one(void)
 		 */
 		if (dqp->dq_flags & XFS_DQ_WANT) {
 			ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
-			xfs_dqtrace_entry(dqp, "DQRECLAIM: DQWANT");
+
+			trace_xfs_dqreclaim_want(dqp);
+
 			xfs_dqunlock(dqp);
 			xfs_qm_freelist_unlock(xfs_Gqm);
 			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
@@ -2171,7 +2180,9 @@ xfs_qm_dqreclaim_one(void)
 		 */
 		if (XFS_DQ_IS_DIRTY(dqp)) {
 			int	error;
-			xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY");
+
+			trace_xfs_dqreclaim_dirty(dqp);
+
 			/*
 			 * We flush it delayed write, so don't bother
 			 * releasing the freelist lock.
@@ -2194,8 +2205,9 @@ xfs_qm_dqreclaim_one(void)
 		if (!mutex_trylock(&dqp->q_hash->qh_lock))
 			goto mplistunlock;
 
+		trace_xfs_dqreclaim_unlink(dqp);
+
 		ASSERT(dqp->q_nrefs == 0);
-		xfs_dqtrace_entry(dqp, "DQRECLAIM: UNLINKING");
 		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
 		XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
 		XQM_FREELIST_REMOVE(dqp);
@@ -2430,7 +2442,7 @@ xfs_qm_vop_dqalloc(
 		}
 	}
 	if (uq)
-		xfs_dqtrace_entry_ino(uq, "DQALLOC", ip);
+		trace_xfs_dquot_dqalloc(ip);
 
 	xfs_iunlock(ip, lockflags);
 	if (O_udqpp)
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 5d1a3b98a6e..71af76fe8a2 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -49,6 +49,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
 #include "xfs_qm.h"
+#include "xfs_trace.h"
 
 #ifdef DEBUG
 # define qdprintk(s, args...)	cmn_err(CE_DEBUG, s, ## args)
@@ -496,7 +497,6 @@ xfs_qm_scall_setqlim(
 		ASSERT(error != ENOENT);
 		return (error);
 	}
-	xfs_dqtrace_entry(dqp, "Q_SETQLIM: AFT DQGET");
 	xfs_trans_dqjoin(tp, dqp);
 	ddq = &dqp->q_core;
 
@@ -602,7 +602,6 @@ xfs_qm_scall_setqlim(
 	dqp->dq_flags |= XFS_DQ_DIRTY;
 	xfs_trans_log_dquot(tp, dqp);
 
-	xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
 	error = xfs_trans_commit(tp, 0);
 	xfs_qm_dqprint(dqp);
 	xfs_qm_dqrele(dqp);
@@ -630,7 +629,6 @@ xfs_qm_scall_getquota(
 		return (error);
 	}
 
-	xfs_dqtrace_entry(dqp, "Q_GETQUOTA SUCCESS");
 	/*
 	 * If everything's NULL, this dquot doesn't quite exist as far as
 	 * our utility programs are concerned.
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index 6f4fd37c67a..d2d20462fd4 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -41,10 +41,6 @@ extern void assfail(char *expr, char *f, int l);
 # define STATIC static noinline
 #endif
 
-#ifndef STATIC_INLINE
-# define STATIC_INLINE static inline
-#endif
-
 #else /* DEBUG */
 
 #define ASSERT(expr)	\
@@ -54,19 +50,5 @@ extern void assfail(char *expr, char *f, int l);
 # define STATIC noinline
 #endif
 
-/*
- * We stop inlining of inline functions in debug mode.
- * Unfortunately, this means static inline in header files
- * get multiple definitions, so they need to remain static.
- * This then gives tonnes of warnings about unused but defined
- * functions, so we need to add the unused attribute to prevent
- * these spurious warnings.
- */
-#ifndef STATIC_INLINE
-# define STATIC_INLINE static __attribute__ ((unused)) noinline
-#endif
-
 #endif /* DEBUG */
-
-
 #endif  /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
deleted file mode 100644
index 2d494c26717..00000000000
--- a/fs/xfs/support/ktrace.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <xfs.h>
-
-static kmem_zone_t *ktrace_hdr_zone;
-static kmem_zone_t *ktrace_ent_zone;
-static int          ktrace_zentries;
-
-void __init
-ktrace_init(int zentries)
-{
-	ktrace_zentries = roundup_pow_of_two(zentries);
-
-	ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
-					"ktrace_hdr");
-	ASSERT(ktrace_hdr_zone);
-
-	ktrace_ent_zone = kmem_zone_init(ktrace_zentries
-					* sizeof(ktrace_entry_t),
-					"ktrace_ent");
-	ASSERT(ktrace_ent_zone);
-}
-
-void __exit
-ktrace_uninit(void)
-{
-	kmem_zone_destroy(ktrace_hdr_zone);
-	kmem_zone_destroy(ktrace_ent_zone);
-}
-
-/*
- * ktrace_alloc()
- *
- * Allocate a ktrace header and enough buffering for the given
- * number of entries. Round the number of entries up to a
- * power of 2 so we can do fast masking to get the index from
- * the atomic index counter.
- */
-ktrace_t *
-ktrace_alloc(int nentries, unsigned int __nocast sleep)
-{
-	ktrace_t        *ktp;
-	ktrace_entry_t  *ktep;
-	int		entries;
-
-	ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
-
-	if (ktp == (ktrace_t*)NULL) {
-		/*
-		 * KM_SLEEP callers don't expect failure.
-		 */
-		if (sleep & KM_SLEEP)
-			panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
-
-		return NULL;
-	}
-
-	/*
-	 * Special treatment for buffers with the ktrace_zentries entries
-	 */
-	entries = roundup_pow_of_two(nentries);
-	if (entries == ktrace_zentries) {
-		ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
-							    sleep);
-	} else {
-		ktep = (ktrace_entry_t*)kmem_zalloc((entries * sizeof(*ktep)),
-							    sleep | KM_LARGE);
-	}
-
-	if (ktep == NULL) {
-		/*
-		 * KM_SLEEP callers don't expect failure.
-		 */
-		if (sleep & KM_SLEEP)
-			panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
-
-		kmem_free(ktp);
-
-		return NULL;
-	}
-
-	ktp->kt_entries  = ktep;
-	ktp->kt_nentries = entries;
-	ASSERT(is_power_of_2(entries));
-	ktp->kt_index_mask = entries - 1;
-	atomic_set(&ktp->kt_index, 0);
-	ktp->kt_rollover = 0;
-	return ktp;
-}
-
-
-/*
- * ktrace_free()
- *
- * Free up the ktrace header and buffer.  It is up to the caller
- * to ensure that no-one is referencing it.
- */
-void
-ktrace_free(ktrace_t *ktp)
-{
-	if (ktp == (ktrace_t *)NULL)
-		return;
-
-	/*
-	 * Special treatment for the Vnode trace buffer.
-	 */
-	if (ktp->kt_nentries == ktrace_zentries)
-		kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
-	else
-		kmem_free(ktp->kt_entries);
-
-	kmem_zone_free(ktrace_hdr_zone, ktp);
-}
-
-
-/*
- * Enter the given values into the "next" entry in the trace buffer.
- * kt_index is always the index of the next entry to be filled.
- */
-void
-ktrace_enter(
-	ktrace_t        *ktp,
-	void            *val0,
-	void            *val1,
-	void            *val2,
-	void            *val3,
-	void            *val4,
-	void            *val5,
-	void            *val6,
-	void            *val7,
-	void            *val8,
-	void            *val9,
-	void            *val10,
-	void            *val11,
-	void            *val12,
-	void            *val13,
-	void            *val14,
-	void            *val15)
-{
-	int             index;
-	ktrace_entry_t  *ktep;
-
-	ASSERT(ktp != NULL);
-
-	/*
-	 * Grab an entry by pushing the index up to the next one.
-	 */
-	index = atomic_add_return(1, &ktp->kt_index);
-	index = (index - 1) & ktp->kt_index_mask;
-	if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
-		ktp->kt_rollover = 1;
-
-	ASSERT((index >= 0) && (index < ktp->kt_nentries));
-
-	ktep = &(ktp->kt_entries[index]);
-
-	ktep->val[0]  = val0;
-	ktep->val[1]  = val1;
-	ktep->val[2]  = val2;
-	ktep->val[3]  = val3;
-	ktep->val[4]  = val4;
-	ktep->val[5]  = val5;
-	ktep->val[6]  = val6;
-	ktep->val[7]  = val7;
-	ktep->val[8]  = val8;
-	ktep->val[9]  = val9;
-	ktep->val[10] = val10;
-	ktep->val[11] = val11;
-	ktep->val[12] = val12;
-	ktep->val[13] = val13;
-	ktep->val[14] = val14;
-	ktep->val[15] = val15;
-}
-
-/*
- * Return the number of entries in the trace buffer.
- */
-int
-ktrace_nentries(
-	ktrace_t        *ktp)
-{
-	int	index;
-	if (ktp == NULL)
-		return 0;
-
-	index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
-	return (ktp->kt_rollover ? ktp->kt_nentries : index);
-}
-
-/*
- * ktrace_first()
- *
- * This is used to find the start of the trace buffer.
- * In conjunction with ktrace_next() it can be used to
- * iterate through the entire trace buffer.  This code does
- * not do any locking because it is assumed that it is called
- * from the debugger.
- *
- * The caller must pass in a pointer to a ktrace_snap
- * structure in which we will keep some state used to
- * iterate through the buffer.  This state must not touched
- * by any code outside of this module.
- */
-ktrace_entry_t *
-ktrace_first(ktrace_t   *ktp, ktrace_snap_t     *ktsp)
-{
-	ktrace_entry_t  *ktep;
-	int             index;
-	int             nentries;
-
-	if (ktp->kt_rollover)
-		index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
-	else
-		index = 0;
-
-	ktsp->ks_start = index;
-	ktep = &(ktp->kt_entries[index]);
-
-	nentries = ktrace_nentries(ktp);
-	index++;
-	if (index < nentries) {
-		ktsp->ks_index = index;
-	} else {
-		ktsp->ks_index = 0;
-		if (index > nentries)
-			ktep = NULL;
-	}
-	return ktep;
-}
-
-/*
- * ktrace_next()
- *
- * This is used to iterate through the entries of the given
- * trace buffer.  The caller must pass in the ktrace_snap_t
- * structure initialized by ktrace_first().  The return value
- * will be either a pointer to the next ktrace_entry or NULL
- * if all of the entries have been traversed.
- */
-ktrace_entry_t *
-ktrace_next(
-	ktrace_t        *ktp,
-	ktrace_snap_t   *ktsp)
-{
-	int             index;
-	ktrace_entry_t  *ktep;
-
-	index = ktsp->ks_index;
-	if (index == ktsp->ks_start) {
-		ktep = NULL;
-	} else {
-		ktep = &ktp->kt_entries[index];
-	}
-
-	index++;
-	if (index == ktrace_nentries(ktp)) {
-		ktsp->ks_index = 0;
-	} else {
-		ktsp->ks_index = index;
-	}
-
-	return ktep;
-}
-
-/*
- * ktrace_skip()
- *
- * Skip the next "count" entries and return the entry after that.
- * Return NULL if this causes us to iterate past the beginning again.
- */
-ktrace_entry_t *
-ktrace_skip(
-	ktrace_t        *ktp,
-	int             count,
-	ktrace_snap_t   *ktsp)
-{
-	int             index;
-	int             new_index;
-	ktrace_entry_t  *ktep;
-	int             nentries = ktrace_nentries(ktp);
-
-	index = ktsp->ks_index;
-	new_index = index + count;
-	while (new_index >= nentries) {
-		new_index -= nentries;
-	}
-	if (index == ktsp->ks_start) {
-		/*
-		 * We've iterated around to the start, so we're done.
-		 */
-		ktep = NULL;
-	} else if ((new_index < index) && (index < ktsp->ks_index)) {
-		/*
-		 * We've skipped past the start again, so we're done.
-		 */
-		ktep = NULL;
-		ktsp->ks_index = ktsp->ks_start;
-	} else {
-		ktep = &(ktp->kt_entries[new_index]);
-		new_index++;
-		if (new_index == nentries) {
-			ktsp->ks_index = 0;
-		} else {
-			ktsp->ks_index = new_index;
-		}
-	}
-	return ktep;
-}
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
deleted file mode 100644
index 741d6947ca6..00000000000
--- a/fs/xfs/support/ktrace.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_KTRACE_H__
-#define __XFS_SUPPORT_KTRACE_H__
-
-/*
- * Trace buffer entry structure.
- */
-typedef struct ktrace_entry {
-	void	*val[16];
-} ktrace_entry_t;
-
-/*
- * Trace buffer header structure.
- */
-typedef struct ktrace {
-	int		kt_nentries;	/* number of entries in trace buf */
-	atomic_t	kt_index;	/* current index in entries */
-	unsigned int	kt_index_mask;
-	int		kt_rollover;
-	ktrace_entry_t	*kt_entries;	/* buffer of entries */
-} ktrace_t;
-
-/*
- * Trace buffer snapshot structure.
- */
-typedef struct ktrace_snap {
-	int		ks_start;	/* kt_index at time of snap */
-	int		ks_index;	/* current index */
-} ktrace_snap_t;
-
-
-#ifdef CONFIG_XFS_TRACE
-
-extern void ktrace_init(int zentries);
-extern void ktrace_uninit(void);
-
-extern ktrace_t *ktrace_alloc(int, unsigned int __nocast);
-extern void ktrace_free(ktrace_t *);
-
-extern void ktrace_enter(
-	ktrace_t	*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*);
-
-extern ktrace_entry_t   *ktrace_first(ktrace_t *, ktrace_snap_t *);
-extern int              ktrace_nentries(ktrace_t *);
-extern ktrace_entry_t   *ktrace_next(ktrace_t *, ktrace_snap_t *);
-extern ktrace_entry_t   *ktrace_skip(ktrace_t *, int, ktrace_snap_t *);
-
-#else
-#define ktrace_init(x)	do { } while (0)
-#define ktrace_uninit()	do { } while (0)
-#endif	/* CONFIG_XFS_TRACE */
-
-#endif	/* __XFS_SUPPORT_KTRACE_H__ */
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 17254b529c5..5ad8ad3a1dc 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -25,21 +25,5 @@
 /* #define QUOTADEBUG 1 */
 #endif
 
-#ifdef CONFIG_XFS_TRACE
-#define XFS_ALLOC_TRACE 1
-#define XFS_ATTR_TRACE 1
-#define XFS_BLI_TRACE 1
-#define XFS_BMAP_TRACE 1
-#define XFS_BTREE_TRACE 1
-#define XFS_DIR2_TRACE 1
-#define XFS_DQUOT_TRACE 1
-#define XFS_ILOCK_TRACE 1
-#define XFS_LOG_TRACE 1
-#define XFS_RW_TRACE 1
-#define XFS_BUF_TRACE 1
-#define XFS_INODE_TRACE 1
-#define XFS_FILESTREAMS_TRACE 1
-#endif
-
 #include <linux-2.6/xfs_linux.h>
 #endif	/* __XFS_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index a5d54bf4931..6702bd86581 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -86,6 +86,20 @@ typedef struct xfs_agf {
 #define	XFS_AGF_NUM_BITS	12
 #define	XFS_AGF_ALL_BITS	((1 << XFS_AGF_NUM_BITS) - 1)
 
+#define XFS_AGF_FLAGS \
+	{ XFS_AGF_MAGICNUM,	"MAGICNUM" }, \
+	{ XFS_AGF_VERSIONNUM,	"VERSIONNUM" }, \
+	{ XFS_AGF_SEQNO,	"SEQNO" }, \
+	{ XFS_AGF_LENGTH,	"LENGTH" }, \
+	{ XFS_AGF_ROOTS,	"ROOTS" }, \
+	{ XFS_AGF_LEVELS,	"LEVELS" }, \
+	{ XFS_AGF_FLFIRST,	"FLFIRST" }, \
+	{ XFS_AGF_FLLAST,	"FLLAST" }, \
+	{ XFS_AGF_FLCOUNT,	"FLCOUNT" }, \
+	{ XFS_AGF_FREEBLKS,	"FREEBLKS" }, \
+	{ XFS_AGF_LONGEST,	"LONGEST" }, \
+	{ XFS_AGF_BTREEBLKS,	"BTREEBLKS" }
+
 /* disk block (xfs_daddr_t) in the AG */
 #define XFS_AGF_DADDR(mp)	((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
 #define	XFS_AGF_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 2cf944eb796..a1c65fc6d9c 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -38,6 +38,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 
 #define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
@@ -51,30 +52,6 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 		    xfs_agblock_t bno,
 		    xfs_extlen_t len);
 
-#if defined(XFS_ALLOC_TRACE)
-ktrace_t *xfs_alloc_trace_buf;
-
-#define	TRACE_ALLOC(s,a)	\
-	xfs_alloc_trace_alloc(__func__, s, a, __LINE__)
-#define	TRACE_FREE(s,a,b,x,f)	\
-	xfs_alloc_trace_free(__func__, s, mp, a, b, x, f, __LINE__)
-#define	TRACE_MODAGF(s,a,f)	\
-	xfs_alloc_trace_modagf(__func__, s, mp, a, f, __LINE__)
-#define	TRACE_BUSY(__func__,s,ag,agb,l,sl,tp)	\
-	xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
-#define	TRACE_UNBUSY(__func__,s,ag,sl,tp)	\
-	xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
-#define	TRACE_BUSYSEARCH(__func__,s,ag,agb,l,tp)	\
-	xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, 0, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
-#else
-#define	TRACE_ALLOC(s,a)
-#define	TRACE_FREE(s,a,b,x,f)
-#define	TRACE_MODAGF(s,a,f)
-#define	TRACE_BUSY(s,a,ag,agb,l,sl,tp)
-#define	TRACE_UNBUSY(fname,s,ag,sl,tp)
-#define	TRACE_BUSYSEARCH(fname,s,ag,agb,l,tp)
-#endif	/* XFS_ALLOC_TRACE */
-
 /*
  * Prototypes for per-ag allocation routines
  */
@@ -498,124 +475,6 @@ xfs_alloc_read_agfl(
 	return 0;
 }
 
-#if defined(XFS_ALLOC_TRACE)
-/*
- * Add an allocation trace entry for an alloc call.
- */
-STATIC void
-xfs_alloc_trace_alloc(
-	const char	*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_alloc_arg_t	*args,		/* allocation argument structure */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)args->mp,
-		(void *)(__psunsigned_t)args->agno,
-		(void *)(__psunsigned_t)args->agbno,
-		(void *)(__psunsigned_t)args->minlen,
-		(void *)(__psunsigned_t)args->maxlen,
-		(void *)(__psunsigned_t)args->mod,
-		(void *)(__psunsigned_t)args->prod,
-		(void *)(__psunsigned_t)args->minleft,
-		(void *)(__psunsigned_t)args->total,
-		(void *)(__psunsigned_t)args->alignment,
-		(void *)(__psunsigned_t)args->len,
-		(void *)((((__psint_t)args->type) << 16) |
-			 (__psint_t)args->otype),
-		(void *)(__psint_t)((args->wasdel << 3) |
-				    (args->wasfromfl << 2) |
-				    (args->isfl << 1) |
-				    (args->userdata << 0)));
-}
-
-/*
- * Add an allocation trace entry for a free call.
- */
-STATIC void
-xfs_alloc_trace_free(
-	const char	*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* a.g. relative block number */
-	xfs_extlen_t	len,		/* length of extent */
-	int		isfl,		/* set if is freelist allocation/free */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)mp,
-		(void *)(__psunsigned_t)agno,
-		(void *)(__psunsigned_t)agbno,
-		(void *)(__psunsigned_t)len,
-		(void *)(__psint_t)isfl,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add an allocation trace entry for modifying an agf.
- */
-STATIC void
-xfs_alloc_trace_modagf(
-	const char	*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_agf_t	*agf,		/* new agf value */
-	int		flags,		/* logging flags for agf */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)mp,
-		(void *)(__psint_t)flags,
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_seqno),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_length),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_flfirst),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_fllast),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_flcount),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_freeblks),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_longest));
-}
-
-STATIC void
-xfs_alloc_trace_busy(
-	const char	*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* a.g. relative block number */
-	xfs_extlen_t	len,		/* length of extent */
-	int		slot,		/* perag Busy slot */
-	xfs_trans_t	*tp,
-	int		trtype,		/* type: add, delete, search */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(trtype | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)mp,
-		(void *)(__psunsigned_t)agno,
-		(void *)(__psunsigned_t)agbno,
-		(void *)(__psunsigned_t)len,
-		(void *)(__psint_t)slot,
-		(void *)tp,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL);
-}
-#endif	/* XFS_ALLOC_TRACE */
-
 /*
  * Allocation group level functions.
  */
@@ -665,9 +524,6 @@ xfs_alloc_ag_vextent(
 	 */
 	if (args->agbno != NULLAGBLOCK) {
 		xfs_agf_t	*agf;	/* allocation group freelist header */
-#ifdef XFS_ALLOC_TRACE
-		xfs_mount_t	*mp = args->mp;
-#endif
 		long		slen = (long)args->len;
 
 		ASSERT(args->len >= args->minlen && args->len <= args->maxlen);
@@ -682,7 +538,6 @@ xfs_alloc_ag_vextent(
 			args->pag->pagf_freeblks -= args->len;
 			ASSERT(be32_to_cpu(agf->agf_freeblks) <=
 				be32_to_cpu(agf->agf_length));
-			TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
 			xfs_alloc_log_agf(args->tp, args->agbp,
 						XFS_AGF_FREEBLKS);
 			/* search the busylist for these blocks */
@@ -792,13 +647,14 @@ xfs_alloc_ag_vextent_exact(
 	}
 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-	TRACE_ALLOC("normal", args);
+
+	trace_xfs_alloc_exact_done(args);
 	args->wasfromfl = 0;
 	return 0;
 
 error0:
 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_exact_error(args);
 	return error;
 }
 
@@ -958,7 +814,7 @@ xfs_alloc_ag_vextent_near(
 		args->len = blen;
 		if (!xfs_alloc_fix_minleft(args)) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-			TRACE_ALLOC("nominleft", args);
+			trace_xfs_alloc_near_nominleft(args);
 			return 0;
 		}
 		blen = args->len;
@@ -981,7 +837,8 @@ xfs_alloc_ag_vextent_near(
 			goto error0;
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-		TRACE_ALLOC("first", args);
+
+		trace_xfs_alloc_near_first(args);
 		return 0;
 	}
 	/*
@@ -1272,7 +1129,7 @@ xfs_alloc_ag_vextent_near(
 	 * If we couldn't get anything, give up.
 	 */
 	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
-		TRACE_ALLOC("neither", args);
+		trace_xfs_alloc_size_neither(args);
 		args->agbno = NULLAGBLOCK;
 		return 0;
 	}
@@ -1299,7 +1156,7 @@ xfs_alloc_ag_vextent_near(
 	args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 	xfs_alloc_fix_len(args);
 	if (!xfs_alloc_fix_minleft(args)) {
-		TRACE_ALLOC("nominleft", args);
+		trace_xfs_alloc_near_nominleft(args);
 		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 		return 0;
@@ -1314,13 +1171,18 @@ xfs_alloc_ag_vextent_near(
 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
 			ltnew, rlen, XFSA_FIXUP_BNO_OK)))
 		goto error0;
-	TRACE_ALLOC(j ? "gt" : "lt", args);
+
+	if (j)
+		trace_xfs_alloc_near_greater(args);
+	else
+		trace_xfs_alloc_near_lesser(args);
+
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 	xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
 	return 0;
 
  error0:
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_near_error(args);
 	if (cnt_cur != NULL)
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
 	if (bno_cur_lt != NULL)
@@ -1371,7 +1233,7 @@ xfs_alloc_ag_vextent_size(
 			goto error0;
 		if (i == 0 || flen == 0) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-			TRACE_ALLOC("noentry", args);
+			trace_xfs_alloc_size_noentry(args);
 			return 0;
 		}
 		ASSERT(i == 1);
@@ -1448,7 +1310,7 @@ xfs_alloc_ag_vextent_size(
 	xfs_alloc_fix_len(args);
 	if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		TRACE_ALLOC("nominleft", args);
+		trace_xfs_alloc_size_nominleft(args);
 		args->agbno = NULLAGBLOCK;
 		return 0;
 	}
@@ -1471,11 +1333,11 @@ xfs_alloc_ag_vextent_size(
 		args->agbno + args->len <=
 			be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
 		error0);
-	TRACE_ALLOC("normal", args);
+	trace_xfs_alloc_size_done(args);
 	return 0;
 
 error0:
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_size_error(args);
 	if (cnt_cur)
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
 	if (bno_cur)
@@ -1534,7 +1396,7 @@ xfs_alloc_ag_vextent_small(
 				be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
 				error0);
 			args->wasfromfl = 1;
-			TRACE_ALLOC("freelist", args);
+			trace_xfs_alloc_small_freelist(args);
 			*stat = 0;
 			return 0;
 		}
@@ -1556,17 +1418,17 @@ xfs_alloc_ag_vextent_small(
 	 */
 	if (flen < args->minlen) {
 		args->agbno = NULLAGBLOCK;
-		TRACE_ALLOC("notenough", args);
+		trace_xfs_alloc_small_notenough(args);
 		flen = 0;
 	}
 	*fbnop = fbno;
 	*flenp = flen;
 	*stat = 1;
-	TRACE_ALLOC("normal", args);
+	trace_xfs_alloc_small_done(args);
 	return 0;
 
 error0:
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_small_error(args);
 	return error;
 }
 
@@ -1809,17 +1671,14 @@ xfs_free_ag_extent(
 			be32_to_cpu(agf->agf_freeblks) <=
 			be32_to_cpu(agf->agf_length),
 			error0);
-		TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
 		xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
 		if (!isfl)
 			xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
 		XFS_STATS_INC(xs_freex);
 		XFS_STATS_ADD(xs_freeb, len);
 	}
-	TRACE_FREE(haveleft ?
-			(haveright ? "both" : "left") :
-			(haveright ? "right" : "none"),
-		agno, bno, len, isfl);
+
+	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
 
 	/*
 	 * Since blocks move to the free list without the coordination
@@ -1836,7 +1695,7 @@ xfs_free_ag_extent(
 	return 0;
 
  error0:
-	TRACE_FREE("error", agno, bno, len, isfl);
+	trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
 	if (bno_cur)
 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
 	if (cnt_cur)
@@ -2122,7 +1981,6 @@ xfs_alloc_get_freelist(
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
 
-	TRACE_MODAGF(NULL, agf, logflags);
 	xfs_alloc_log_agf(tp, agbp, logflags);
 	*bnop = bno;
 
@@ -2165,6 +2023,8 @@ xfs_alloc_log_agf(
 		sizeof(xfs_agf_t)
 	};
 
+	trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+
 	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
 	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
 }
@@ -2230,13 +2090,11 @@ xfs_alloc_put_freelist(
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
 
-	TRACE_MODAGF(NULL, agf, logflags);
 	xfs_alloc_log_agf(tp, agbp, logflags);
 
 	ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
 	blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
 	*blockp = cpu_to_be32(bno);
-	TRACE_MODAGF(NULL, agf, logflags);
 	xfs_alloc_log_agf(tp, agbp, logflags);
 	xfs_trans_log_buf(tp, agflbp,
 		(int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
@@ -2399,7 +2257,7 @@ xfs_alloc_vextent(
 	    args->minlen > args->maxlen || args->minlen > agsize ||
 	    args->mod >= args->prod) {
 		args->fsbno = NULLFSBLOCK;
-		TRACE_ALLOC("badargs", args);
+		trace_xfs_alloc_vextent_badargs(args);
 		return 0;
 	}
 	minleft = args->minleft;
@@ -2418,12 +2276,12 @@ xfs_alloc_vextent(
 		error = xfs_alloc_fix_freelist(args, 0);
 		args->minleft = minleft;
 		if (error) {
-			TRACE_ALLOC("nofix", args);
+			trace_xfs_alloc_vextent_nofix(args);
 			goto error0;
 		}
 		if (!args->agbp) {
 			up_read(&mp->m_peraglock);
-			TRACE_ALLOC("noagbp", args);
+			trace_xfs_alloc_vextent_noagbp(args);
 			break;
 		}
 		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
@@ -2488,7 +2346,7 @@ xfs_alloc_vextent(
 			error = xfs_alloc_fix_freelist(args, flags);
 			args->minleft = minleft;
 			if (error) {
-				TRACE_ALLOC("nofix", args);
+				trace_xfs_alloc_vextent_nofix(args);
 				goto error0;
 			}
 			/*
@@ -2499,7 +2357,9 @@ xfs_alloc_vextent(
 					goto error0;
 				break;
 			}
-			TRACE_ALLOC("loopfailed", args);
+
+			trace_xfs_alloc_vextent_loopfailed(args);
+
 			/*
 			 * Didn't work, figure out the next iteration.
 			 */
@@ -2526,7 +2386,7 @@ xfs_alloc_vextent(
 			if (args->agno == sagno) {
 				if (no_min == 1) {
 					args->agbno = NULLAGBLOCK;
-					TRACE_ALLOC("allfailed", args);
+					trace_xfs_alloc_vextent_allfailed(args);
 					break;
 				}
 				if (flags == 0) {
@@ -2642,16 +2502,16 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 		}
 	}
 
+	trace_xfs_alloc_busy(mp, agno, bno, len, n);
+
 	if (n < XFS_PAGB_NUM_SLOTS) {
 		bsy = &mp->m_perag[agno].pagb_list[n];
 		mp->m_perag[agno].pagb_count++;
-		TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp);
 		bsy->busy_start = bno;
 		bsy->busy_length = len;
 		bsy->busy_tp = tp;
 		xfs_trans_add_busy(tp, agno, n);
 	} else {
-		TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp);
 		/*
 		 * The busy list is full!  Since it is now not possible to
 		 * track the free block, make this a synchronous transaction
@@ -2678,12 +2538,12 @@ xfs_alloc_clear_busy(xfs_trans_t *tp,
 	list = mp->m_perag[agno].pagb_list;
 
 	ASSERT(idx < XFS_PAGB_NUM_SLOTS);
+
+	trace_xfs_alloc_unbusy(mp, agno, idx, list[idx].busy_tp == tp);
+
 	if (list[idx].busy_tp == tp) {
-		TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp);
 		list[idx].busy_tp = NULL;
 		mp->m_perag[agno].pagb_count--;
-	} else {
-		TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp);
 	}
 
 	spin_unlock(&mp->m_perag[agno].pagb_lock);
@@ -2724,24 +2584,22 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 			if ((bno > bend) || (uend < bsy->busy_start)) {
 				cnt--;
 			} else {
-				TRACE_BUSYSEARCH("xfs_alloc_search_busy",
-					 "found1", agno, bno, len, tp);
 				break;
 			}
 		}
 	}
 
+	trace_xfs_alloc_busysearch(mp, agno, bno, len, !!cnt);
+
 	/*
 	 * If a block was found, force the log through the LSN of the
 	 * transaction that freed the block
 	 */
 	if (cnt) {
-		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp);
 		lsn = bsy->busy_tp->t_commit_lsn;
 		spin_unlock(&mp->m_perag[agno].pagb_lock);
 		xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
 	} else {
-		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp);
 		spin_unlock(&mp->m_perag[agno].pagb_lock);
 	}
 }
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index e704caee10d..599bffa3978 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -37,6 +37,15 @@ typedef enum xfs_alloctype
 	XFS_ALLOCTYPE_THIS_BNO		/* at exactly this block */
 } xfs_alloctype_t;
 
+#define XFS_ALLOC_TYPES \
+	{ XFS_ALLOCTYPE_ANY_AG,		"ANY_AG" }, \
+	{ XFS_ALLOCTYPE_FIRST_AG,	"FIRST_AG" }, \
+	{ XFS_ALLOCTYPE_START_AG,	"START_AG" }, \
+	{ XFS_ALLOCTYPE_THIS_AG,	"THIS_AG" }, \
+	{ XFS_ALLOCTYPE_START_BNO,	"START_BNO" }, \
+	{ XFS_ALLOCTYPE_NEAR_BNO,	"NEAR_BNO" }, \
+	{ XFS_ALLOCTYPE_THIS_BNO,	"THIS_BNO" }
+
 /*
  * Flags for xfs_alloc_fix_freelist.
  */
@@ -109,24 +118,6 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
 
 #ifdef __KERNEL__
 
-#if defined(XFS_ALLOC_TRACE)
-/*
- * Allocation tracing buffer size.
- */
-#define	XFS_ALLOC_TRACE_SIZE	4096
-extern ktrace_t *xfs_alloc_trace_buf;
-
-/*
- * Types for alloc tracing.
- */
-#define	XFS_ALLOC_KTRACE_ALLOC	1
-#define	XFS_ALLOC_KTRACE_FREE	2
-#define	XFS_ALLOC_KTRACE_MODAGF	3
-#define	XFS_ALLOC_KTRACE_BUSY	4
-#define	XFS_ALLOC_KTRACE_UNBUSY	5
-#define	XFS_ALLOC_KTRACE_BUSYSEARCH	6
-#endif
-
 void
 xfs_alloc_mark_busy(xfs_trans_t *tp,
 		xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index c10c3a292d3..adbd9141aea 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -39,6 +39,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 
 STATIC struct xfs_btree_cur *
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 4ece1906bd4..e953b6cfb2a 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -47,6 +47,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_rw.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 /*
  * xfs_attr.c
@@ -89,10 +90,6 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
 
 #define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
 
-#if defined(XFS_ATTR_TRACE)
-ktrace_t *xfs_attr_trace_buf;
-#endif
-
 STATIC int
 xfs_attr_name_to_xname(
 	struct xfs_name	*xname,
@@ -123,9 +120,13 @@ xfs_inode_hasattr(
  * Overall external interface routines.
  *========================================================================*/
 
-int
-xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name,
-		char *value, int *valuelenp, int flags)
+STATIC int
+xfs_attr_get_int(
+	struct xfs_inode	*ip,
+	struct xfs_name		*name,
+	char			*value,
+	int			*valuelenp,
+	int			flags)
 {
 	xfs_da_args_t   args;
 	int             error;
@@ -188,7 +189,7 @@ xfs_attr_get(
 		return error;
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	error = xfs_attr_fetch(ip, &xname, value, valuelenp, flags);
+	error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	return(error);
 }
@@ -636,7 +637,6 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
 		return EIO;
 
 	xfs_ilock(dp, XFS_ILOCK_SHARED);
-	xfs_attr_trace_l_c("syscall start", context);
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
@@ -652,7 +652,6 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
 	}
 
 	xfs_iunlock(dp, XFS_ILOCK_SHARED);
-	xfs_attr_trace_l_c("syscall end", context);
 
 	return error;
 }
@@ -698,7 +697,7 @@ xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
 			context->count * sizeof(alist->al_offset[0]);
 	context->firstu -= ATTR_ENTSIZE(namelen);
 	if (context->firstu < arraytop) {
-		xfs_attr_trace_l_c("buffer full", context);
+		trace_xfs_attr_list_full(context);
 		alist->al_more = 1;
 		context->seen_enough = 1;
 		return 1;
@@ -710,7 +709,7 @@ xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
 	aep->a_name[namelen] = 0;
 	alist->al_offset[context->count++] = context->firstu;
 	alist->al_count = context->count;
-	xfs_attr_trace_l_c("add", context);
+	trace_xfs_attr_list_add(context);
 	return 0;
 }
 
@@ -1849,7 +1848,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 			node = bp->data;
 			switch (be16_to_cpu(node->hdr.info.magic)) {
 			case XFS_DA_NODE_MAGIC:
-				xfs_attr_trace_l_cn("wrong blk", context, node);
+				trace_xfs_attr_list_wrong_blk(context);
 				xfs_da_brelse(NULL, bp);
 				bp = NULL;
 				break;
@@ -1857,20 +1856,18 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 				leaf = bp->data;
 				if (cursor->hashval > be32_to_cpu(leaf->entries[
 				    be16_to_cpu(leaf->hdr.count)-1].hashval)) {
-					xfs_attr_trace_l_cl("wrong blk",
-							   context, leaf);
+					trace_xfs_attr_list_wrong_blk(context);
 					xfs_da_brelse(NULL, bp);
 					bp = NULL;
 				} else if (cursor->hashval <=
 					     be32_to_cpu(leaf->entries[0].hashval)) {
-					xfs_attr_trace_l_cl("maybe wrong blk",
-							   context, leaf);
+					trace_xfs_attr_list_wrong_blk(context);
 					xfs_da_brelse(NULL, bp);
 					bp = NULL;
 				}
 				break;
 			default:
-				xfs_attr_trace_l_c("wrong blk - ??", context);
+				trace_xfs_attr_list_wrong_blk(context);
 				xfs_da_brelse(NULL, bp);
 				bp = NULL;
 			}
@@ -1915,8 +1912,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 				if (cursor->hashval
 						<= be32_to_cpu(btree->hashval)) {
 					cursor->blkno = be32_to_cpu(btree->before);
-					xfs_attr_trace_l_cb("descending",
-							    context, btree);
+					trace_xfs_attr_list_node_descend(context,
+									 btree);
 					break;
 				}
 			}
@@ -2143,8 +2140,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
 		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
 
-		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt,
-				       XFS_BUF_LOCK | XBF_DONT_BLOCK);
+		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
+				 XFS_BUF_LOCK | XBF_DONT_BLOCK);
 		ASSERT(bp);
 		ASSERT(!XFS_BUF_GETERROR(bp));
 
@@ -2266,85 +2263,3 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 	}
 	return(0);
 }
-
-#if defined(XFS_ATTR_TRACE)
-/*
- * Add a trace buffer entry for an attr_list context structure.
- */
-void
-xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context)
-{
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, context,
-		(__psunsigned_t)NULL,
-		(__psunsigned_t)NULL,
-		(__psunsigned_t)NULL);
-}
-
-/*
- * Add a trace buffer entry for a context structure and a Btree node.
- */
-void
-xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
-			 struct xfs_da_intnode *node)
-{
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, context,
-		(__psunsigned_t)be16_to_cpu(node->hdr.count),
-		(__psunsigned_t)be32_to_cpu(node->btree[0].hashval),
-		(__psunsigned_t)be32_to_cpu(node->btree[
-				    be16_to_cpu(node->hdr.count)-1].hashval));
-}
-
-/*
- * Add a trace buffer entry for a context structure and a Btree element.
- */
-void
-xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
-			  struct xfs_da_node_entry *btree)
-{
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, context,
-		(__psunsigned_t)be32_to_cpu(btree->hashval),
-		(__psunsigned_t)be32_to_cpu(btree->before),
-		(__psunsigned_t)NULL);
-}
-
-/*
- * Add a trace buffer entry for a context structure and a leaf block.
- */
-void
-xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
-			      struct xfs_attr_leafblock *leaf)
-{
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, context,
-		(__psunsigned_t)be16_to_cpu(leaf->hdr.count),
-		(__psunsigned_t)be32_to_cpu(leaf->entries[0].hashval),
-		(__psunsigned_t)be32_to_cpu(leaf->entries[
-				be16_to_cpu(leaf->hdr.count)-1].hashval));
-}
-
-/*
- * Add a trace buffer entry for the arguments given to the routine,
- * generic form.
- */
-void
-xfs_attr_trace_enter(int type, char *where,
-			 struct xfs_attr_list_context *context,
-			 __psunsigned_t a13, __psunsigned_t a14,
-			 __psunsigned_t a15)
-{
-	ASSERT(xfs_attr_trace_buf);
-	ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type),
-		(void *)((__psunsigned_t)where),
-		(void *)((__psunsigned_t)context->dp),
-		(void *)((__psunsigned_t)context->cursor->hashval),
-		(void *)((__psunsigned_t)context->cursor->blkno),
-		(void *)((__psunsigned_t)context->cursor->offset),
-		(void *)((__psunsigned_t)context->alist),
-		(void *)((__psunsigned_t)context->bufsize),
-		(void *)((__psunsigned_t)context->count),
-		(void *)((__psunsigned_t)context->firstu),
-		NULL,
-		(void *)((__psunsigned_t)context->dupcnt),
-		(void *)((__psunsigned_t)context->flags),
-		(void *)a13, (void *)a14, (void *)a15);
-}
-#endif	/* XFS_ATTR_TRACE */
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index fb3b2a68b9b..59b410ce69a 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -48,6 +48,16 @@ struct xfs_attr_list_context;
 #define ATTR_KERNOTIME	0x1000	/* [kernel] don't update inode timestamps */
 #define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */
 
+#define XFS_ATTR_FLAGS \
+	{ ATTR_DONTFOLLOW, 	"DONTFOLLOW" }, \
+	{ ATTR_ROOT,		"ROOT" }, \
+	{ ATTR_TRUST,		"TRUST" }, \
+	{ ATTR_SECURE,		"SECURE" }, \
+	{ ATTR_CREATE,		"CREATE" }, \
+	{ ATTR_REPLACE,		"REPLACE" }, \
+	{ ATTR_KERNOTIME,	"KERNOTIME" }, \
+	{ ATTR_KERNOVAL,	"KERNOVAL" }
+
 /*
  * The maximum size (into the kernel or returned from the kernel) of an
  * attribute value or the buffer used for an attr_list() call.  Larger
@@ -131,7 +141,6 @@ typedef struct xfs_attr_list_context {
  */
 int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
 int xfs_attr_inactive(struct xfs_inode *dp);
-int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int);
 int xfs_attr_rmtval_get(struct xfs_da_args *args);
 int xfs_attr_list_int(struct xfs_attr_list_context *);
 
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index afdc8911637..baf41b5af75 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -42,6 +42,7 @@
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 /*
  * xfs_attr_leaf.c
@@ -98,7 +99,7 @@ STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
  * If namespace bits don't match return 0.
  * If all match then return 1.
  */
-STATIC_INLINE int
+STATIC int
 xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
 {
 	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
@@ -594,7 +595,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	cursor = context->cursor;
 	ASSERT(cursor != NULL);
 
-	xfs_attr_trace_l_c("sf start", context);
+	trace_xfs_attr_list_sf(context);
 
 	/*
 	 * If the buffer is large enough and the cursor is at the start,
@@ -627,7 +628,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 				return error;
 			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
 		}
-		xfs_attr_trace_l_c("sf big-gulp", context);
+		trace_xfs_attr_list_sf_all(context);
 		return(0);
 	}
 
@@ -653,7 +654,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 			XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
 					     XFS_ERRLEVEL_LOW,
 					     context->dp->i_mount, sfe);
-			xfs_attr_trace_l_c("sf corrupted", context);
 			kmem_free(sbuf);
 			return XFS_ERROR(EFSCORRUPTED);
 		}
@@ -693,7 +693,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	}
 	if (i == nsbuf) {
 		kmem_free(sbuf);
-		xfs_attr_trace_l_c("blk end", context);
 		return(0);
 	}
 
@@ -719,7 +718,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	}
 
 	kmem_free(sbuf);
-	xfs_attr_trace_l_c("sf E-O-F", context);
 	return(0);
 }
 
@@ -2323,7 +2321,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 	cursor = context->cursor;
 	cursor->initted = 1;
 
-	xfs_attr_trace_l_cl("blk start", context, leaf);
+	trace_xfs_attr_list_leaf(context);
 
 	/*
 	 * Re-find our place in the leaf block if this is a new syscall.
@@ -2344,7 +2342,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 			}
 		}
 		if (i == be16_to_cpu(leaf->hdr.count)) {
-			xfs_attr_trace_l_c("not found", context);
+			trace_xfs_attr_list_notfound(context);
 			return(0);
 		}
 	} else {
@@ -2419,7 +2417,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 			break;
 		cursor->offset++;
 	}
-	xfs_attr_trace_l_cl("blk end", context, leaf);
+	trace_xfs_attr_list_leaf_end(context);
 	return(retval);
 }
 
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
index ea22839caed..76ab7b0cbb3 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/xfs_attr_sf.h
@@ -25,8 +25,6 @@
  * to fit into the literal area of the inode.
  */
 
-struct xfs_inode;
-
 /*
  * Entries are packed toward the top as tight as possible.
  */
@@ -69,42 +67,4 @@ typedef struct xfs_attr_sf_sort {
 	(be16_to_cpu(((xfs_attr_shortform_t *)	\
 		((dp)->i_afp->if_u1.if_data))->hdr.totsize))
 
-#if defined(XFS_ATTR_TRACE)
-/*
- * Kernel tracing support for attribute lists
- */
-struct xfs_attr_list_context;
-struct xfs_da_intnode;
-struct xfs_da_node_entry;
-struct xfs_attr_leafblock;
-
-#define	XFS_ATTR_TRACE_SIZE	4096	/* size of global trace buffer */
-extern ktrace_t	*xfs_attr_trace_buf;
-
-/*
- * Trace record types.
- */
-#define	XFS_ATTR_KTRACE_L_C	1	/* context */
-#define	XFS_ATTR_KTRACE_L_CN	2	/* context, node */
-#define	XFS_ATTR_KTRACE_L_CB	3	/* context, btree */
-#define	XFS_ATTR_KTRACE_L_CL	4	/* context, leaf */
-
-void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context);
-void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
-			      struct xfs_da_intnode *node);
-void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
-			      struct xfs_da_node_entry *btree);
-void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
-			      struct xfs_attr_leafblock *leaf);
-void xfs_attr_trace_enter(int type, char *where,
-			     struct xfs_attr_list_context *context,
-			     __psunsigned_t a13, __psunsigned_t a14,
-			     __psunsigned_t a15);
-#else
-#define	xfs_attr_trace_l_c(w,c)
-#define	xfs_attr_trace_l_cn(w,c,n)
-#define	xfs_attr_trace_l_cb(w,c,b)
-#define	xfs_attr_trace_l_cl(w,c,l)
-#endif /* XFS_ATTR_TRACE */
-
 #endif	/* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8971fb09d38..98251cdc52a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -54,6 +54,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 
 #ifdef DEBUG
@@ -272,71 +273,6 @@ xfs_bmap_isaeof(
 	int             whichfork,	/* data or attribute fork */
 	char		*aeof);		/* return value */
 
-#ifdef XFS_BMAP_TRACE
-/*
- * Add bmap trace entry prior to a call to xfs_iext_remove.
- */
-STATIC void
-xfs_bmap_trace_delete(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry(entries) deleted */
-	xfs_extnum_t	cnt,		/* count of entries deleted, 1 or 2 */
-	int		whichfork);	/* data or attr fork */
-
-/*
- * Add bmap trace entry prior to a call to xfs_iext_insert, or
- * reading in the extents list from the disk (in the btree).
- */
-STATIC void
-xfs_bmap_trace_insert(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry(entries) inserted */
-	xfs_extnum_t	cnt,		/* count of entries inserted, 1 or 2 */
-	xfs_bmbt_irec_t	*r1,		/* inserted record 1 */
-	xfs_bmbt_irec_t	*r2,		/* inserted record 2 or null */
-	int		whichfork);	/* data or attr fork */
-
-/*
- * Add bmap trace entry after updating an extent record in place.
- */
-STATIC void
-xfs_bmap_trace_post_update(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry updated */
-	int		whichfork);	/* data or attr fork */
-
-/*
- * Add bmap trace entry prior to updating an extent record in place.
- */
-STATIC void
-xfs_bmap_trace_pre_update(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry to be updated */
-	int		whichfork);	/* data or attr fork */
-
-#define	XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)	\
-	xfs_bmap_trace_delete(__func__,d,ip,i,c,w)
-#define	XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)	\
-	xfs_bmap_trace_insert(__func__,d,ip,i,c,r1,r2,w)
-#define	XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w)	\
-	xfs_bmap_trace_post_update(__func__,d,ip,i,w)
-#define	XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w)	\
-	xfs_bmap_trace_pre_update(__func__,d,ip,i,w)
-#else
-#define	XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)
-#define	XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)
-#define	XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w)
-#define	XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w)
-#endif	/* XFS_BMAP_TRACE */
-
 /*
  * Compute the worst-case number of indirect blocks that will be used
  * for ip's delayed extent of length "len".
@@ -363,18 +299,6 @@ xfs_bmap_validate_ret(
 #define	xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
 #endif /* DEBUG */
 
-#if defined(XFS_RW_TRACE)
-STATIC void
-xfs_bunmap_trace(
-	xfs_inode_t		*ip,
-	xfs_fileoff_t		bno,
-	xfs_filblks_t		len,
-	int			flags,
-	inst_t			*ra);
-#else
-#define	xfs_bunmap_trace(ip, bno, len, flags, ra)
-#endif	/* XFS_RW_TRACE */
-
 STATIC int
 xfs_bmap_count_tree(
 	xfs_mount_t     *mp,
@@ -590,9 +514,9 @@ xfs_bmap_add_extent(
 	 * already extents in the list.
 	 */
 	if (nextents == 0) {
-		XFS_BMAP_TRACE_INSERT("insert empty", ip, 0, 1, new, NULL,
-			whichfork);
-		xfs_iext_insert(ifp, 0, 1, new);
+		xfs_iext_insert(ip, 0, 1, new,
+				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+
 		ASSERT(cur == NULL);
 		ifp->if_lastex = 0;
 		if (!isnullstartblock(new->br_startblock)) {
@@ -759,26 +683,10 @@ xfs_bmap_add_extent_delay_real(
 	xfs_filblks_t		temp=0;	/* value for dnew calculations */
 	xfs_filblks_t		temp2=0;/* value for dnew calculations */
 	int			tmp_rval;	/* partial logging flags */
-	enum {				/* bit number definitions for state */
-		LEFT_CONTIG,	RIGHT_CONTIG,
-		LEFT_FILLING,	RIGHT_FILLING,
-		LEFT_DELAY,	RIGHT_DELAY,
-		LEFT_VALID,	RIGHT_VALID
-	};
 
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
 #define	PREV		r[2]
-#define	MASK(b)		(1 << (b))
-#define	MASK2(a,b)	(MASK(a) | MASK(b))
-#define	MASK3(a,b,c)	(MASK2(a,b) | MASK(c))
-#define	MASK4(a,b,c,d)	(MASK3(a,b,c) | MASK(d))
-#define	STATE_SET(b,v)	((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define	STATE_TEST(b)	(state & MASK(b))
-#define	STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
-				       ((state &= ~MASK(b)), 0))
-#define	SWITCH_STATE		\
-	(state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
 
 	/*
 	 * Set up a bunch of variables to make the tests simpler.
@@ -790,69 +698,80 @@ xfs_bmap_add_extent_delay_real(
 	new_endoff = new->br_startoff + new->br_blockcount;
 	ASSERT(PREV.br_startoff <= new->br_startoff);
 	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
 	/*
 	 * Set flags determining what part of the previous delayed allocation
 	 * extent is being replaced by a real allocation.
 	 */
-	STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
-	STATE_SET(RIGHT_FILLING,
-		PREV.br_startoff + PREV.br_blockcount == new_endoff);
+	if (PREV.br_startoff == new->br_startoff)
+		state |= BMAP_LEFT_FILLING;
+	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+		state |= BMAP_RIGHT_FILLING;
+
 	/*
 	 * Check and set flags if this segment has a left neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 */
-	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+	if (idx > 0) {
+		state |= BMAP_LEFT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
-		STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
+
+		if (isnullstartblock(LEFT.br_startblock))
+			state |= BMAP_LEFT_DELAY;
 	}
-	STATE_SET(LEFT_CONTIG,
-		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
-		LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
-		LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
-		LEFT.br_state == new->br_state &&
-		LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+	    LEFT.br_state == new->br_state &&
+	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
 	/*
 	 * Check and set flags if this segment has a right neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (STATE_SET_TEST(RIGHT_VALID,
-			idx <
-			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
+	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
-		STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
+
+		if (isnullstartblock(RIGHT.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
 	}
-	STATE_SET(RIGHT_CONTIG,
-		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
-		new_endoff == RIGHT.br_startoff &&
-		new->br_startblock + new->br_blockcount ==
-		    RIGHT.br_startblock &&
-		new->br_state == RIGHT.br_state &&
-		new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
-		((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
-		  MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
-		 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
-		     <= MAXEXTLEN));
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new_endoff == RIGHT.br_startoff &&
+	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+	    new->br_state == RIGHT.br_state &&
+	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+		       BMAP_RIGHT_FILLING)) !=
+		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+		       BMAP_RIGHT_FILLING) ||
+	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+			<= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
+
 	error = 0;
 	/*
 	 * Switch out based on the FILLING and CONTIG state bits.
 	 */
-	switch (SWITCH_STATE) {
-
-	case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+	     BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left and right neighbors are both contiguous with new.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + PREV.br_blockcount +
 			RIGHT.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 2);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, idx, 2, state);
 		ip->i_df.if_lastex = idx - 1;
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
@@ -885,20 +804,18 @@ xfs_bmap_add_extent_delay_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + PREV.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx - 1;
-		XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 1);
+		xfs_iext_remove(ip, idx, 1, state);
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -921,19 +838,19 @@ xfs_bmap_add_extent_delay_real(
 			PREV.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
 		 * The right neighbor is contiguous, the left is not.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount + RIGHT.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx;
-		XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx + 1, 1);
+		xfs_iext_remove(ip, idx + 1, 1, state);
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -956,15 +873,16 @@ xfs_bmap_add_extent_delay_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK2(LEFT_FILLING, RIGHT_FILLING):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
 		 * Neither the left nor right neighbors are contiguous with
 		 * the new one.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -987,19 +905,20 @@ xfs_bmap_add_extent_delay_real(
 		temp2 = new->br_blockcount;
 		break;
 
-	case MASK2(LEFT_FILLING, LEFT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
 		/*
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + new->br_blockcount);
 		xfs_bmbt_set_startoff(ep,
 			PREV.br_startoff + new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		temp = PREV.br_blockcount - new->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		ip->i_df.if_lastex = idx - 1;
 		if (cur == NULL)
@@ -1021,7 +940,7 @@ xfs_bmap_add_extent_delay_real(
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock));
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
 		/* DELTA: The boundary between two in-core extents moved. */
 		temp = LEFT.br_startoff;
@@ -1029,18 +948,16 @@ xfs_bmap_add_extent_delay_real(
 			PREV.br_blockcount;
 		break;
 
-	case MASK(LEFT_FILLING):
+	case BMAP_LEFT_FILLING:
 		/*
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is not contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startoff(ep, new_endoff);
 		temp = PREV.br_blockcount - new->br_blockcount;
 		xfs_bmbt_set_blockcount(ep, temp);
-		XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx, 1, new);
+		xfs_iext_insert(ip, idx, 1, new, state);
 		ip->i_df.if_lastex = idx;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1071,27 +988,27 @@ xfs_bmap_add_extent_delay_real(
 			(cur ? cur->bc_private.b.allocated : 0));
 		ep = xfs_iext_get_ext(ifp, idx + 1);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
 		*dnew = temp;
 		/* DELTA: One in-core extent is split in two. */
 		temp = PREV.br_startoff;
 		temp2 = PREV.br_blockcount;
 		break;
 
-	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
+	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in the last part of a previous delayed allocation.
 		 * The right neighbor is contiguous with the new allocation.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
-		XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + RIGHT.br_blockcount,
 			RIGHT.br_state);
-		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
 		ip->i_df.if_lastex = idx + 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
@@ -1112,7 +1029,7 @@ xfs_bmap_add_extent_delay_real(
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock));
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
 		/* DELTA: The boundary between two in-core extents moved. */
 		temp = PREV.br_startoff;
@@ -1120,17 +1037,15 @@ xfs_bmap_add_extent_delay_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK(RIGHT_FILLING):
+	case BMAP_RIGHT_FILLING:
 		/*
 		 * Filling in the last part of a previous delayed allocation.
 		 * The right neighbor is not contiguous.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx + 1, 1, new);
+		xfs_iext_insert(ip, idx + 1, 1, new, state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1161,7 +1076,7 @@ xfs_bmap_add_extent_delay_real(
 			(cur ? cur->bc_private.b.allocated : 0));
 		ep = xfs_iext_get_ext(ifp, idx);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
 		/* DELTA: One in-core extent is split in two. */
 		temp = PREV.br_startoff;
@@ -1175,7 +1090,7 @@ xfs_bmap_add_extent_delay_real(
 		 * This case is avoided almost all the time.
 		 */
 		temp = new->br_startoff - PREV.br_startoff;
-		XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		r[0] = *new;
 		r[1].br_state = PREV.br_state;
@@ -1183,9 +1098,7 @@ xfs_bmap_add_extent_delay_real(
 		r[1].br_startoff = new_endoff;
 		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
 		r[1].br_blockcount = temp2;
-		XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1],
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx + 1, 2, &r[0]);
+		xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1242,24 +1155,24 @@ xfs_bmap_add_extent_delay_real(
 		}
 		ep = xfs_iext_get_ext(ifp, idx);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
-		XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
 			nullstartblock((int)temp2));
-		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
 		*dnew = temp + temp2;
 		/* DELTA: One in-core extent is split in three. */
 		temp = PREV.br_startoff;
 		temp2 = PREV.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK2(LEFT_FILLING, RIGHT_CONTIG):
-	case MASK2(RIGHT_FILLING, LEFT_CONTIG):
-	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK(LEFT_CONTIG):
-	case MASK(RIGHT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_CONTIG:
+	case BMAP_RIGHT_CONTIG:
 		/*
 		 * These cases are all impossible.
 		 */
@@ -1279,14 +1192,6 @@ done:
 #undef	LEFT
 #undef	RIGHT
 #undef	PREV
-#undef	MASK
-#undef	MASK2
-#undef	MASK3
-#undef	MASK4
-#undef	STATE_SET
-#undef	STATE_TEST
-#undef	STATE_SET_TEST
-#undef	SWITCH_STATE
 }
 
 /*
@@ -1316,27 +1221,10 @@ xfs_bmap_add_extent_unwritten_real(
 	int			state = 0;/* state bits, accessed thru macros */
 	xfs_filblks_t		temp=0;
 	xfs_filblks_t		temp2=0;
-	enum {				/* bit number definitions for state */
-		LEFT_CONTIG,	RIGHT_CONTIG,
-		LEFT_FILLING,	RIGHT_FILLING,
-		LEFT_DELAY,	RIGHT_DELAY,
-		LEFT_VALID,	RIGHT_VALID
-	};
 
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
 #define	PREV		r[2]
-#define	MASK(b)		(1 << (b))
-#define	MASK2(a,b)	(MASK(a) | MASK(b))
-#define	MASK3(a,b,c)	(MASK2(a,b) | MASK(c))
-#define	MASK4(a,b,c,d)	(MASK3(a,b,c) | MASK(d))
-#define	STATE_SET(b,v)	((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define	STATE_TEST(b)	(state & MASK(b))
-#define	STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
-				       ((state &= ~MASK(b)), 0))
-#define	SWITCH_STATE		\
-	(state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
-
 	/*
 	 * Set up a bunch of variables to make the tests simpler.
 	 */
@@ -1352,68 +1240,78 @@ xfs_bmap_add_extent_unwritten_real(
 	new_endoff = new->br_startoff + new->br_blockcount;
 	ASSERT(PREV.br_startoff <= new->br_startoff);
 	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
 	/*
 	 * Set flags determining what part of the previous oldext allocation
 	 * extent is being replaced by a newext allocation.
 	 */
-	STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
-	STATE_SET(RIGHT_FILLING,
-		PREV.br_startoff + PREV.br_blockcount == new_endoff);
+	if (PREV.br_startoff == new->br_startoff)
+		state |= BMAP_LEFT_FILLING;
+	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+		state |= BMAP_RIGHT_FILLING;
+
 	/*
 	 * Check and set flags if this segment has a left neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 */
-	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+	if (idx > 0) {
+		state |= BMAP_LEFT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
-		STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
+
+		if (isnullstartblock(LEFT.br_startblock))
+			state |= BMAP_LEFT_DELAY;
 	}
-	STATE_SET(LEFT_CONTIG,
-		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
-		LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
-		LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
-		LEFT.br_state == newext &&
-		LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+	    LEFT.br_state == newext &&
+	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
 	/*
 	 * Check and set flags if this segment has a right neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (STATE_SET_TEST(RIGHT_VALID,
-			idx <
-			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
+	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
-		STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
+		if (isnullstartblock(RIGHT.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
 	}
-	STATE_SET(RIGHT_CONTIG,
-		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
-		new_endoff == RIGHT.br_startoff &&
-		new->br_startblock + new->br_blockcount ==
-		    RIGHT.br_startblock &&
-		newext == RIGHT.br_state &&
-		new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
-		((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
-		  MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
-		 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
-		     <= MAXEXTLEN));
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new_endoff == RIGHT.br_startoff &&
+	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+	    newext == RIGHT.br_state &&
+	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+		       BMAP_RIGHT_FILLING)) !=
+		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+		       BMAP_RIGHT_FILLING) ||
+	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+			<= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
+
 	/*
 	 * Switch out based on the FILLING and CONTIG state bits.
 	 */
-	switch (SWITCH_STATE) {
-
-	case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+	     BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Setting all of a previous oldext extent to newext.
 		 * The left and right neighbors are both contiguous with new.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + PREV.br_blockcount +
 			RIGHT.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 2);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, idx, 2, state);
 		ip->i_df.if_lastex = idx - 1;
 		ip->i_d.di_nextents -= 2;
 		if (cur == NULL)
@@ -1450,20 +1348,18 @@ xfs_bmap_add_extent_unwritten_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
 		/*
 		 * Setting all of a previous oldext extent to newext.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + PREV.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx - 1;
-		XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 1);
+		xfs_iext_remove(ip, idx, 1, state);
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1492,21 +1388,18 @@ xfs_bmap_add_extent_unwritten_real(
 			PREV.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Setting all of a previous oldext extent to newext.
 		 * The right neighbor is contiguous, the left is not.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount + RIGHT.br_blockcount);
 		xfs_bmbt_set_state(ep, newext);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		ip->i_df.if_lastex = idx;
-		XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx + 1, 1);
+		xfs_iext_remove(ip, idx + 1, 1, state);
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1535,17 +1428,16 @@ xfs_bmap_add_extent_unwritten_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK2(LEFT_FILLING, RIGHT_FILLING):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
 		/*
 		 * Setting all of a previous oldext extent to newext.
 		 * Neither the left nor right neighbors are contiguous with
 		 * the new one.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_state(ep, newext);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
@@ -1566,27 +1458,25 @@ xfs_bmap_add_extent_unwritten_real(
 		temp2 = new->br_blockcount;
 		break;
 
-	case MASK2(LEFT_FILLING, LEFT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
 		/*
 		 * Setting the first part of a previous oldext extent to newext.
 		 * The left neighbor is contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + new->br_blockcount);
 		xfs_bmbt_set_startoff(ep,
 			PREV.br_startoff + new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep,
 			new->br_startblock + new->br_blockcount);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx - 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
@@ -1617,22 +1507,21 @@ xfs_bmap_add_extent_unwritten_real(
 			PREV.br_blockcount;
 		break;
 
-	case MASK(LEFT_FILLING):
+	case BMAP_LEFT_FILLING:
 		/*
 		 * Setting the first part of a previous oldext extent to newext.
 		 * The left neighbor is not contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
 		xfs_bmbt_set_startoff(ep, new_endoff);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
 		xfs_bmbt_set_startblock(ep,
 			new->br_startblock + new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx, XFS_DATA_FORK);
-		XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx, 1, new);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
+		xfs_iext_insert(ip, idx, 1, new, state);
 		ip->i_df.if_lastex = idx;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1660,24 +1549,21 @@ xfs_bmap_add_extent_unwritten_real(
 		temp2 = PREV.br_blockcount;
 		break;
 
-	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
+	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Setting the last part of a previous oldext extent to newext.
 		 * The right neighbor is contiguous with the new allocation.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + RIGHT.br_blockcount, newext);
-		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx + 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
@@ -1707,18 +1593,17 @@ xfs_bmap_add_extent_unwritten_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK(RIGHT_FILLING):
+	case BMAP_RIGHT_FILLING:
 		/*
 		 * Setting the last part of a previous oldext extent to newext.
 		 * The right neighbor is not contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
-		XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx + 1, 1, new);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
+		xfs_iext_insert(ip, idx + 1, 1, new, state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1756,19 +1641,18 @@ xfs_bmap_add_extent_unwritten_real(
 		 * newext.  Contiguity is impossible here.
 		 * One extent becomes three extents.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			new->br_startoff - PREV.br_startoff);
-		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		r[0] = *new;
 		r[1].br_startoff = new_endoff;
 		r[1].br_blockcount =
 			PREV.br_startoff + PREV.br_blockcount - new_endoff;
 		r[1].br_startblock = new->br_startblock + new->br_blockcount;
 		r[1].br_state = oldext;
-		XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1],
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx + 1, 2, &r[0]);
+		xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents += 2;
 		if (cur == NULL)
@@ -1813,13 +1697,13 @@ xfs_bmap_add_extent_unwritten_real(
 		temp2 = PREV.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK2(LEFT_FILLING, RIGHT_CONTIG):
-	case MASK2(RIGHT_FILLING, LEFT_CONTIG):
-	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK(LEFT_CONTIG):
-	case MASK(RIGHT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_CONTIG:
+	case BMAP_RIGHT_CONTIG:
 		/*
 		 * These cases are all impossible.
 		 */
@@ -1839,14 +1723,6 @@ done:
 #undef	LEFT
 #undef	RIGHT
 #undef	PREV
-#undef	MASK
-#undef	MASK2
-#undef	MASK3
-#undef	MASK4
-#undef	STATE_SET
-#undef	STATE_TEST
-#undef	STATE_SET_TEST
-#undef	SWITCH_STATE
 }
 
 /*
@@ -1872,62 +1748,57 @@ xfs_bmap_add_extent_hole_delay(
 	int			state;  /* state bits, accessed thru macros */
 	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
 	xfs_filblks_t		temp2=0;
-	enum {				/* bit number definitions for state */
-		LEFT_CONTIG,	RIGHT_CONTIG,
-		LEFT_DELAY,	RIGHT_DELAY,
-		LEFT_VALID,	RIGHT_VALID
-	};
-
-#define	MASK(b)			(1 << (b))
-#define	MASK2(a,b)		(MASK(a) | MASK(b))
-#define	STATE_SET(b,v)		((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define	STATE_TEST(b)		(state & MASK(b))
-#define	STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
-				       ((state &= ~MASK(b)), 0))
-#define	SWITCH_STATE		(state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
 
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	ep = xfs_iext_get_ext(ifp, idx);
 	state = 0;
 	ASSERT(isnullstartblock(new->br_startblock));
+
 	/*
 	 * Check and set flags if this segment has a left neighbor
 	 */
-	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+	if (idx > 0) {
+		state |= BMAP_LEFT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
-		STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
+
+		if (isnullstartblock(left.br_startblock))
+			state |= BMAP_LEFT_DELAY;
 	}
+
 	/*
 	 * Check and set flags if the current (right) segment exists.
 	 * If it doesn't exist, we're converting the hole at end-of-file.
 	 */
-	if (STATE_SET_TEST(RIGHT_VALID,
-			   idx <
-			   ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(ep, &right);
-		STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
+
+		if (isnullstartblock(right.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
 	}
+
 	/*
 	 * Set contiguity flags on the left and right neighbors.
 	 * Don't let extents get too large, even if the pieces are contiguous.
 	 */
-	STATE_SET(LEFT_CONTIG,
-		STATE_TEST(LEFT_VALID) && STATE_TEST(LEFT_DELAY) &&
-		left.br_startoff + left.br_blockcount == new->br_startoff &&
-		left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
-	STATE_SET(RIGHT_CONTIG,
-		STATE_TEST(RIGHT_VALID) && STATE_TEST(RIGHT_DELAY) &&
-		new->br_startoff + new->br_blockcount == right.br_startoff &&
-		new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
-		(!STATE_TEST(LEFT_CONTIG) ||
-		 (left.br_blockcount + new->br_blockcount +
-		     right.br_blockcount <= MAXEXTLEN)));
+	if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+	    left.br_startoff + left.br_blockcount == new->br_startoff &&
+	    left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
+	if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+	    new->br_startoff + new->br_blockcount == right.br_startoff &&
+	    new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+	    (!(state & BMAP_LEFT_CONTIG) ||
+	     (left.br_blockcount + new->br_blockcount +
+	      right.br_blockcount <= MAXEXTLEN)))
+		state |= BMAP_RIGHT_CONTIG;
+
 	/*
 	 * Switch out based on the contiguity flags.
 	 */
-	switch (SWITCH_STATE) {
-
-	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+	switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
 		/*
 		 * New allocation is contiguous with delayed allocations
 		 * on the left and on the right.
@@ -1935,8 +1806,8 @@ xfs_bmap_add_extent_hole_delay(
 		 */
 		temp = left.br_blockcount + new->br_blockcount +
 			right.br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
+
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock) +
@@ -1944,53 +1815,52 @@ xfs_bmap_add_extent_hole_delay(
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
 			nullstartblock((int)newlen));
-		XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 1);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, idx, 1, state);
 		ip->i_df.if_lastex = idx - 1;
 		/* DELTA: Two in-core extents were replaced by one. */
 		temp2 = temp;
 		temp = left.br_startoff;
 		break;
 
-	case MASK(LEFT_CONTIG):
+	case BMAP_LEFT_CONTIG:
 		/*
 		 * New allocation is contiguous with a delayed allocation
 		 * on the left.
 		 * Merge the new allocation with the left neighbor.
 		 */
 		temp = left.br_blockcount + new->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
 			nullstartblock((int)newlen));
-		XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx - 1;
 		/* DELTA: One in-core extent grew into a hole. */
 		temp2 = temp;
 		temp = left.br_startoff;
 		break;
 
-	case MASK(RIGHT_CONTIG):
+	case BMAP_RIGHT_CONTIG:
 		/*
 		 * New allocation is contiguous with a delayed allocation
 		 * on the right.
 		 * Merge the new allocation with the right neighbor.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		temp = new->br_blockcount + right.br_blockcount;
 		oldlen = startblockval(new->br_startblock) +
 			startblockval(right.br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_allf(ep, new->br_startoff,
 			nullstartblock((int)newlen), temp, right.br_state);
-		XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx;
 		/* DELTA: One in-core extent grew into a hole. */
 		temp2 = temp;
@@ -2004,9 +1874,7 @@ xfs_bmap_add_extent_hole_delay(
 		 * Insert a new entry.
 		 */
 		oldlen = newlen = 0;
-		XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx, 1, new);
+		xfs_iext_insert(ip, idx, 1, new, state);
 		ip->i_df.if_lastex = idx;
 		/* DELTA: A new in-core extent was added in a hole. */
 		temp2 = new->br_blockcount;
@@ -2030,12 +1898,6 @@ xfs_bmap_add_extent_hole_delay(
 	}
 	*logflagsp = 0;
 	return 0;
-#undef	MASK
-#undef	MASK2
-#undef	STATE_SET
-#undef	STATE_TEST
-#undef	STATE_SET_TEST
-#undef	SWITCH_STATE
 }
 
 /*
@@ -2062,83 +1924,75 @@ xfs_bmap_add_extent_hole_real(
 	int			state;	/* state bits, accessed thru macros */
 	xfs_filblks_t		temp=0;
 	xfs_filblks_t		temp2=0;
-	enum {				/* bit number definitions for state */
-		LEFT_CONTIG,	RIGHT_CONTIG,
-		LEFT_DELAY,	RIGHT_DELAY,
-		LEFT_VALID,	RIGHT_VALID
-	};
-
-#define	MASK(b)			(1 << (b))
-#define	MASK2(a,b)		(MASK(a) | MASK(b))
-#define	STATE_SET(b,v)		((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define	STATE_TEST(b)		(state & MASK(b))
-#define	STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
-				       ((state &= ~MASK(b)), 0))
-#define	SWITCH_STATE		(state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
 	ep = xfs_iext_get_ext(ifp, idx);
 	state = 0;
+
+	if (whichfork == XFS_ATTR_FORK)
+		state |= BMAP_ATTRFORK;
+
 	/*
 	 * Check and set flags if this segment has a left neighbor.
 	 */
-	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+	if (idx > 0) {
+		state |= BMAP_LEFT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
-		STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
+		if (isnullstartblock(left.br_startblock))
+			state |= BMAP_LEFT_DELAY;
 	}
+
 	/*
 	 * Check and set flags if this segment has a current value.
 	 * Not true if we're inserting into the "hole" at eof.
 	 */
-	if (STATE_SET_TEST(RIGHT_VALID,
-			   idx <
-			   ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+	if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(ep, &right);
-		STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
+		if (isnullstartblock(right.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
 	}
+
 	/*
 	 * We're inserting a real allocation between "left" and "right".
 	 * Set the contiguity flags.  Don't let extents get too large.
 	 */
-	STATE_SET(LEFT_CONTIG,
-		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
-		left.br_startoff + left.br_blockcount == new->br_startoff &&
-		left.br_startblock + left.br_blockcount == new->br_startblock &&
-		left.br_state == new->br_state &&
-		left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
-	STATE_SET(RIGHT_CONTIG,
-		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
-		new->br_startoff + new->br_blockcount == right.br_startoff &&
-		new->br_startblock + new->br_blockcount ==
-		    right.br_startblock &&
-		new->br_state == right.br_state &&
-		new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
-		(!STATE_TEST(LEFT_CONTIG) ||
-		 left.br_blockcount + new->br_blockcount +
-		     right.br_blockcount <= MAXEXTLEN));
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    left.br_startoff + left.br_blockcount == new->br_startoff &&
+	    left.br_startblock + left.br_blockcount == new->br_startblock &&
+	    left.br_state == new->br_state &&
+	    left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new->br_startoff + new->br_blockcount == right.br_startoff &&
+	    new->br_startblock + new->br_blockcount == right.br_startblock &&
+	    new->br_state == right.br_state &&
+	    new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+	    (!(state & BMAP_LEFT_CONTIG) ||
+	     left.br_blockcount + new->br_blockcount +
+	     right.br_blockcount <= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
 
 	error = 0;
 	/*
 	 * Select which case we're in here, and implement it.
 	 */
-	switch (SWITCH_STATE) {
-
-	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+	switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
 		/*
 		 * New allocation is contiguous with real allocations on the
 		 * left and on the right.
 		 * Merge all three into a single extent record.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1,
-			whichfork);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			left.br_blockcount + new->br_blockcount +
 			right.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1,
-			whichfork);
-		XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, whichfork);
-		xfs_iext_remove(ifp, idx, 1);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, idx, 1, state);
 		ifp->if_lastex = idx - 1;
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
@@ -2173,16 +2027,17 @@ xfs_bmap_add_extent_hole_real(
 			right.br_blockcount;
 		break;
 
-	case MASK(LEFT_CONTIG):
+	case BMAP_LEFT_CONTIG:
 		/*
 		 * New allocation is contiguous with a real allocation
 		 * on the left.
 		 * Merge the new allocation with the left neighbor.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, whichfork);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			left.br_blockcount + new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		ifp->if_lastex = idx - 1;
 		if (cur == NULL) {
 			rval = xfs_ilog_fext(whichfork);
@@ -2207,17 +2062,18 @@ xfs_bmap_add_extent_hole_real(
 			new->br_blockcount;
 		break;
 
-	case MASK(RIGHT_CONTIG):
+	case BMAP_RIGHT_CONTIG:
 		/*
 		 * New allocation is contiguous with a real allocation
 		 * on the right.
 		 * Merge the new allocation with the right neighbor.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, whichfork);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
 			new->br_blockcount + right.br_blockcount,
 			right.br_state);
-		XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ifp->if_lastex = idx;
 		if (cur == NULL) {
 			rval = xfs_ilog_fext(whichfork);
@@ -2248,8 +2104,7 @@ xfs_bmap_add_extent_hole_real(
 		 * real allocation.
 		 * Insert a new entry.
 		 */
-		XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL, whichfork);
-		xfs_iext_insert(ifp, idx, 1, new);
+		xfs_iext_insert(ip, idx, 1, new, state);
 		ifp->if_lastex = idx;
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
@@ -2283,12 +2138,6 @@ xfs_bmap_add_extent_hole_real(
 done:
 	*logflagsp = rval;
 	return error;
-#undef	MASK
-#undef	MASK2
-#undef	STATE_SET
-#undef	STATE_TEST
-#undef	STATE_SET_TEST
-#undef	SWITCH_STATE
 }
 
 /*
@@ -3115,8 +2964,13 @@ xfs_bmap_del_extent(
 	uint			qfield;	/* quota field to update */
 	xfs_filblks_t		temp;	/* for indirect length calculations */
 	xfs_filblks_t		temp2;	/* for indirect length calculations */
+	int			state = 0;
 
 	XFS_STATS_INC(xs_del_exlist);
+
+	if (whichfork == XFS_ATTR_FORK)
+		state |= BMAP_ATTRFORK;
+
 	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT((idx >= 0) && (idx < ifp->if_bytes /
@@ -3196,8 +3050,8 @@ xfs_bmap_del_extent(
 		/*
 		 * Matches the whole extent.  Delete the entry.
 		 */
-		XFS_BMAP_TRACE_DELETE("3", ip, idx, 1, whichfork);
-		xfs_iext_remove(ifp, idx, 1);
+		xfs_iext_remove(ip, idx, 1,
+				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
 		ifp->if_lastex = idx;
 		if (delay)
 			break;
@@ -3217,7 +3071,7 @@ xfs_bmap_del_extent(
 		/*
 		 * Deleting the first part of the extent.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("2", ip, idx, whichfork);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startoff(ep, del_endoff);
 		temp = got.br_blockcount - del->br_blockcount;
 		xfs_bmbt_set_blockcount(ep, temp);
@@ -3226,13 +3080,12 @@ xfs_bmap_del_extent(
 			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 				da_old);
 			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-			XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx,
-				whichfork);
+			trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 			da_new = temp;
 			break;
 		}
 		xfs_bmbt_set_startblock(ep, del_endblock);
-		XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		if (!cur) {
 			flags |= xfs_ilog_fext(whichfork);
 			break;
@@ -3248,19 +3101,18 @@ xfs_bmap_del_extent(
 		 * Deleting the last part of the extent.
 		 */
 		temp = got.br_blockcount - del->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("1", ip, idx, whichfork);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		ifp->if_lastex = idx;
 		if (delay) {
 			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 				da_old);
 			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-			XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx,
-				whichfork);
+			trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 			da_new = temp;
 			break;
 		}
-		XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		if (!cur) {
 			flags |= xfs_ilog_fext(whichfork);
 			break;
@@ -3277,7 +3129,7 @@ xfs_bmap_del_extent(
 		 * Deleting the middle of the extent.
 		 */
 		temp = del->br_startoff - got.br_startoff;
-		XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, whichfork);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		new.br_startoff = del_endoff;
 		temp2 = got_endoff - del_endoff;
@@ -3364,10 +3216,8 @@ xfs_bmap_del_extent(
 				}
 			}
 		}
-		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, whichfork);
-		XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 1, &new, NULL,
-			whichfork);
-		xfs_iext_insert(ifp, idx + 1, 1, &new);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		xfs_iext_insert(ip, idx + 1, 1, &new, state);
 		ifp->if_lastex = idx + 1;
 		break;
 	}
@@ -3687,7 +3537,9 @@ xfs_bmap_local_to_extents(
 		xfs_iext_add(ifp, 0, 1);
 		ep = xfs_iext_get_ext(ifp, 0);
 		xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
-		XFS_BMAP_TRACE_POST_UPDATE("new", ip, 0, whichfork);
+		trace_xfs_bmap_post_update(ip, 0,
+				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
+				_THIS_IP_);
 		XFS_IFORK_NEXT_SET(ip, whichfork, 1);
 		ip->i_d.di_nblocks = 1;
 		xfs_trans_mod_dquot_byino(tp, ip,
@@ -3800,158 +3652,6 @@ xfs_bmap_search_extents(
 	return ep;
 }
 
-
-#ifdef XFS_BMAP_TRACE
-ktrace_t	*xfs_bmap_trace_buf;
-
-/*
- * Add a bmap trace buffer entry.  Base routine for the others.
- */
-STATIC void
-xfs_bmap_trace_addentry(
-	int		opcode,		/* operation */
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry(ies) */
-	xfs_extnum_t	cnt,		/* count of entries, 1 or 2 */
-	xfs_bmbt_rec_host_t *r1,	/* first record */
-	xfs_bmbt_rec_host_t *r2,	/* second record or null */
-	int		whichfork)	/* data or attr fork */
-{
-	xfs_bmbt_rec_host_t tr2;
-
-	ASSERT(cnt == 1 || cnt == 2);
-	ASSERT(r1 != NULL);
-	if (cnt == 1) {
-		ASSERT(r2 == NULL);
-		r2 = &tr2;
-		memset(&tr2, 0, sizeof(tr2));
-	} else
-		ASSERT(r2 != NULL);
-	ktrace_enter(xfs_bmap_trace_buf,
-		(void *)(__psint_t)(opcode | (whichfork << 16)),
-		(void *)fname, (void *)desc, (void *)ip,
-		(void *)(__psint_t)idx,
-		(void *)(__psint_t)cnt,
-		(void *)(__psunsigned_t)(ip->i_ino >> 32),
-		(void *)(__psunsigned_t)(unsigned)ip->i_ino,
-		(void *)(__psunsigned_t)(r1->l0 >> 32),
-		(void *)(__psunsigned_t)(unsigned)(r1->l0),
-		(void *)(__psunsigned_t)(r1->l1 >> 32),
-		(void *)(__psunsigned_t)(unsigned)(r1->l1),
-		(void *)(__psunsigned_t)(r2->l0 >> 32),
-		(void *)(__psunsigned_t)(unsigned)(r2->l0),
-		(void *)(__psunsigned_t)(r2->l1 >> 32),
-		(void *)(__psunsigned_t)(unsigned)(r2->l1)
-		);
-	ASSERT(ip->i_xtrace);
-	ktrace_enter(ip->i_xtrace,
-		(void *)(__psint_t)(opcode | (whichfork << 16)),
-		(void *)fname, (void *)desc, (void *)ip,
-		(void *)(__psint_t)idx,
-		(void *)(__psint_t)cnt,
-		(void *)(__psunsigned_t)(ip->i_ino >> 32),
-		(void *)(__psunsigned_t)(unsigned)ip->i_ino,
-		(void *)(__psunsigned_t)(r1->l0 >> 32),
-		(void *)(__psunsigned_t)(unsigned)(r1->l0),
-		(void *)(__psunsigned_t)(r1->l1 >> 32),
-		(void *)(__psunsigned_t)(unsigned)(r1->l1),
-		(void *)(__psunsigned_t)(r2->l0 >> 32),
-		(void *)(__psunsigned_t)(unsigned)(r2->l0),
-		(void *)(__psunsigned_t)(r2->l1 >> 32),
-		(void *)(__psunsigned_t)(unsigned)(r2->l1)
-		);
-}
-
-/*
- * Add bmap trace entry prior to a call to xfs_iext_remove.
- */
-STATIC void
-xfs_bmap_trace_delete(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry(entries) deleted */
-	xfs_extnum_t	cnt,		/* count of entries deleted, 1 or 2 */
-	int		whichfork)	/* data or attr fork */
-{
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_DELETE, fname, desc, ip, idx,
-		cnt, xfs_iext_get_ext(ifp, idx),
-		cnt == 2 ? xfs_iext_get_ext(ifp, idx + 1) : NULL,
-		whichfork);
-}
-
-/*
- * Add bmap trace entry prior to a call to xfs_iext_insert, or
- * reading in the extents list from the disk (in the btree).
- */
-STATIC void
-xfs_bmap_trace_insert(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry(entries) inserted */
-	xfs_extnum_t	cnt,		/* count of entries inserted, 1 or 2 */
-	xfs_bmbt_irec_t	*r1,		/* inserted record 1 */
-	xfs_bmbt_irec_t	*r2,		/* inserted record 2 or null */
-	int		whichfork)	/* data or attr fork */
-{
-	xfs_bmbt_rec_host_t tr1;	/* compressed record 1 */
-	xfs_bmbt_rec_host_t tr2;	/* compressed record 2 if needed */
-
-	xfs_bmbt_set_all(&tr1, r1);
-	if (cnt == 2) {
-		ASSERT(r2 != NULL);
-		xfs_bmbt_set_all(&tr2, r2);
-	} else {
-		ASSERT(cnt == 1);
-		ASSERT(r2 == NULL);
-	}
-	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_INSERT, fname, desc, ip, idx,
-		cnt, &tr1, cnt == 2 ? &tr2 : NULL, whichfork);
-}
-
-/*
- * Add bmap trace entry after updating an extent record in place.
- */
-STATIC void
-xfs_bmap_trace_post_update(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry updated */
-	int		whichfork)	/* data or attr fork */
-{
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_POST_UP, fname, desc, ip, idx,
-		1, xfs_iext_get_ext(ifp, idx), NULL, whichfork);
-}
-
-/*
- * Add bmap trace entry prior to updating an extent record in place.
- */
-STATIC void
-xfs_bmap_trace_pre_update(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry to be updated */
-	int		whichfork)	/* data or attr fork */
-{
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_PRE_UP, fname, desc, ip, idx, 1,
-		xfs_iext_get_ext(ifp, idx), NULL, whichfork);
-}
-#endif	/* XFS_BMAP_TRACE */
-
 /*
  * Compute the worst-case number of indirect blocks that will be used
  * for ip's delayed extent of length "len".
@@ -3983,37 +3683,6 @@ xfs_bmap_worst_indlen(
 	return rval;
 }
 
-#if defined(XFS_RW_TRACE)
-STATIC void
-xfs_bunmap_trace(
-	xfs_inode_t		*ip,
-	xfs_fileoff_t		bno,
-	xfs_filblks_t		len,
-	int			flags,
-	inst_t			*ra)
-{
-	if (ip->i_rwtrace == NULL)
-		return;
-	ktrace_enter(ip->i_rwtrace,
-		(void *)(__psint_t)XFS_BUNMAP,
-		(void *)ip,
-		(void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff),
-		(void *)(__psint_t)(ip->i_d.di_size & 0xffffffff),
-		(void *)(__psint_t)(((xfs_dfiloff_t)bno >> 32) & 0xffffffff),
-		(void *)(__psint_t)((xfs_dfiloff_t)bno & 0xffffffff),
-		(void *)(__psint_t)len,
-		(void *)(__psint_t)flags,
-		(void *)(unsigned long)current_cpu(),
-		(void *)ra,
-		(void *)0,
-		(void *)0,
-		(void *)0,
-		(void *)0,
-		(void *)0,
-		(void *)0);
-}
-#endif
-
 /*
  * Convert inode from non-attributed to attributed.
  * Must not be in a transaction, ip must not be locked.
@@ -4702,34 +4371,30 @@ error0:
 	return XFS_ERROR(EFSCORRUPTED);
 }
 
-#ifdef XFS_BMAP_TRACE
+#ifdef DEBUG
 /*
  * Add bmap trace insert entries for all the contents of the extent records.
  */
 void
 xfs_bmap_trace_exlist(
-	const char	*fname,		/* function name */
 	xfs_inode_t	*ip,		/* incore inode pointer */
 	xfs_extnum_t	cnt,		/* count of entries in the list */
-	int		whichfork)	/* data or attr fork */
+	int		whichfork,	/* data or attr fork */
+	unsigned long	caller_ip)
 {
-	xfs_bmbt_rec_host_t *ep;	/* current extent record */
 	xfs_extnum_t	idx;		/* extent record index */
 	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	xfs_bmbt_irec_t	s;		/* file extent record */
+	int		state = 0;
+
+	if (whichfork == XFS_ATTR_FORK)
+		state |= BMAP_ATTRFORK;
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
-	for (idx = 0; idx < cnt; idx++) {
-		ep = xfs_iext_get_ext(ifp, idx);
-		xfs_bmbt_get_all(ep, &s);
-		XFS_BMAP_TRACE_INSERT("exlist", ip, idx, 1, &s, NULL,
-			whichfork);
-	}
+	for (idx = 0; idx < cnt; idx++)
+		trace_xfs_extlist(ip, idx, whichfork, caller_ip);
 }
-#endif
 
-#ifdef DEBUG
 /*
  * Validate that the bmbt_irecs being returned from bmapi are valid
  * given the callers original parameters.  Specifically check the
@@ -5478,7 +5143,8 @@ xfs_bunmapi(
 	int			rsvd;		/* OK to allocate reserved blocks */
 	xfs_fsblock_t		sum;
 
-	xfs_bunmap_trace(ip, bno, len, flags, (inst_t *)__return_address);
+	trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+
 	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
 		XFS_ATTR_FORK : XFS_DATA_FORK;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 56f62d2edc3..419dafb9d87 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -95,6 +95,21 @@ typedef	struct xfs_bmap_free
 					/* need write cache flushing and no */
 					/* additional allocation alignments */
 
+#define XFS_BMAPI_FLAGS \
+	{ XFS_BMAPI_WRITE,	"WRITE" }, \
+	{ XFS_BMAPI_DELAY,	"DELAY" }, \
+	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
+	{ XFS_BMAPI_METADATA,	"METADATA" }, \
+	{ XFS_BMAPI_EXACT,	"EXACT" }, \
+	{ XFS_BMAPI_ATTRFORK,	"ATTRFORK" }, \
+	{ XFS_BMAPI_ASYNC,	"ASYNC" }, \
+	{ XFS_BMAPI_RSVBLOCKS,	"RSVBLOCKS" }, \
+	{ XFS_BMAPI_PREALLOC,	"PREALLOC" }, \
+	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
+	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
+	{ XFS_BMAPI_CONVERT,	"CONVERT" }
+
+
 static inline int xfs_bmapi_aflag(int w)
 {
 	return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
@@ -135,36 +150,43 @@ typedef struct xfs_bmalloca {
 	char			conv;	/* overwriting unwritten extents */
 } xfs_bmalloca_t;
 
-#if defined(__KERNEL__) && defined(XFS_BMAP_TRACE)
 /*
- * Trace operations for bmap extent tracing
+ * Flags for xfs_bmap_add_extent*.
  */
-#define	XFS_BMAP_KTRACE_DELETE	1
-#define	XFS_BMAP_KTRACE_INSERT	2
-#define	XFS_BMAP_KTRACE_PRE_UP	3
-#define	XFS_BMAP_KTRACE_POST_UP	4
-
-#define	XFS_BMAP_TRACE_SIZE	4096	/* size of global trace buffer */
-#define	XFS_BMAP_KTRACE_SIZE	32	/* size of per-inode trace buffer */
-extern ktrace_t	*xfs_bmap_trace_buf;
+#define BMAP_LEFT_CONTIG	(1 << 0)
+#define BMAP_RIGHT_CONTIG	(1 << 1)
+#define BMAP_LEFT_FILLING	(1 << 2)
+#define BMAP_RIGHT_FILLING	(1 << 3)
+#define BMAP_LEFT_DELAY		(1 << 4)
+#define BMAP_RIGHT_DELAY	(1 << 5)
+#define BMAP_LEFT_VALID		(1 << 6)
+#define BMAP_RIGHT_VALID	(1 << 7)
+#define BMAP_ATTRFORK		(1 << 8)
+
+#define XFS_BMAP_EXT_FLAGS \
+	{ BMAP_LEFT_CONTIG,	"LC" }, \
+	{ BMAP_RIGHT_CONTIG,	"RC" }, \
+	{ BMAP_LEFT_FILLING,	"LF" }, \
+	{ BMAP_RIGHT_FILLING,	"RF" }, \
+	{ BMAP_ATTRFORK,	"ATTR" }
 
 /*
  * Add bmap trace insert entries for all the contents of the extent list.
+ *
+ * Quite excessive tracing.  Only do this for debug builds.
  */
+#if defined(__KERNEL) && defined(DEBUG)
 void
 xfs_bmap_trace_exlist(
-	const char		*fname,		/* function name */
 	struct xfs_inode	*ip,		/* incore inode pointer */
 	xfs_extnum_t		cnt,		/* count of entries in list */
-	int			whichfork);	/* data or attr fork */
+	int			whichfork,
+	unsigned long		caller_ip);	/* data or attr fork */
 #define	XFS_BMAP_TRACE_EXLIST(ip,c,w)	\
-	xfs_bmap_trace_exlist(__func__,ip,c,w)
-
-#else	/* __KERNEL__ && XFS_BMAP_TRACE */
-
+	xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
+#else
 #define	XFS_BMAP_TRACE_EXLIST(ip,c,w)
-
-#endif	/* __KERNEL__ && XFS_BMAP_TRACE */
+#endif
 
 /*
  * Convert inode from non-attributed to attributed.
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index eb7b702d069..38751d5fac6 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -98,8 +98,7 @@ xfs_bmdr_to_bmbt(
  * This code must be in sync with the routines xfs_bmbt_get_startoff,
  * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
  */
-
-STATIC_INLINE void
+STATIC void
 __xfs_bmbt_get_all(
 		__uint64_t l0,
 		__uint64_t l1,
@@ -769,12 +768,6 @@ xfs_bmbt_trace_enter(
 		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
 		(void *)a4, (void *)a5, (void *)a6, (void *)a7,
 		(void *)a8, (void *)a9, (void *)a10);
-	ktrace_enter(ip->i_btrace,
-		(void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
-		(void *)func, (void *)s, (void *)ip, (void *)cur,
-		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
-		(void *)a4, (void *)a5, (void *)a6, (void *)a7,
-		(void *)a8, (void *)a9, (void *)a10);
 }
 
 STATIC void
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 52b5f14d0c3..36a0992dd66 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -39,6 +39,7 @@
 #include "xfs_btree_trace.h"
 #include "xfs_ialloc.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 /*
  * Cursor allocation zone.
@@ -81,7 +82,7 @@ xfs_btree_check_lblock(
 			XFS_ERRTAG_BTREE_CHECK_LBLOCK,
 			XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
 		if (bp)
-			xfs_buftrace("LBTREE ERROR", bp);
+			trace_xfs_btree_corrupt(bp, _RET_IP_);
 		XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW,
 				 mp);
 		return XFS_ERROR(EFSCORRUPTED);
@@ -119,7 +120,7 @@ xfs_btree_check_sblock(
 			XFS_ERRTAG_BTREE_CHECK_SBLOCK,
 			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
 		if (bp)
-			xfs_buftrace("SBTREE ERROR", bp);
+			trace_xfs_btree_corrupt(bp, _RET_IP_);
 		XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
 			XFS_ERRLEVEL_LOW, cur->bc_mp, block);
 		return XFS_ERROR(EFSCORRUPTED);
diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h
index b3f5eb3c3c6..2d8a309873e 100644
--- a/fs/xfs/xfs_btree_trace.h
+++ b/fs/xfs/xfs_btree_trace.h
@@ -58,8 +58,6 @@ void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *,
 		struct xfs_buf *, int, int);
 void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *,
 		struct xfs_buf *, int, int, int);
-void xfs_btree_trace_argfffi(const char *, struct xfs_btree_cur *,
-		xfs_dfiloff_t, xfs_dfsbno_t, xfs_dfilblks_t, int, int);
 void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int);
 void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int,
 		union xfs_btree_ptr, union xfs_btree_key *, int);
@@ -71,24 +69,10 @@ void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *,
 		union xfs_btree_rec *, int);
 void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int);
 
-
-#define XFS_ALLOCBT_TRACE_SIZE	4096	/* size of global trace buffer */
-extern ktrace_t	*xfs_allocbt_trace_buf;
-
-#define XFS_INOBT_TRACE_SIZE	4096	/* size of global trace buffer */
-extern ktrace_t	*xfs_inobt_trace_buf;
-
-#define XFS_BMBT_TRACE_SIZE	4096	/* size of global trace buffer */
-#define XFS_BMBT_KTRACE_SIZE	32	/* size of per-inode trace buffer */
-extern ktrace_t	*xfs_bmbt_trace_buf;
-
-
 #define	XFS_BTREE_TRACE_ARGBI(c, b, i)	\
 	xfs_btree_trace_argbi(__func__, c, b, i, __LINE__)
 #define	XFS_BTREE_TRACE_ARGBII(c, b, i, j)	\
 	xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__)
-#define	XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)	\
-	xfs_btree_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
 #define	XFS_BTREE_TRACE_ARGI(c, i)	\
 	xfs_btree_trace_argi(__func__, c, i, __LINE__)
 #define	XFS_BTREE_TRACE_ARGIPK(c, i, p, k)	\
@@ -104,7 +88,6 @@ extern ktrace_t	*xfs_bmbt_trace_buf;
 #else
 #define	XFS_BTREE_TRACE_ARGBI(c, b, i)
 #define	XFS_BTREE_TRACE_ARGBII(c, b, i, j)
-#define	XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)
 #define	XFS_BTREE_TRACE_ARGI(c, i)
 #define	XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
 #define	XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 92af4098c7e..a30f7e9eb2b 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -29,6 +29,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 
 kmem_zone_t	*xfs_buf_item_zone;
@@ -164,7 +165,7 @@ xfs_buf_item_size(
 		 * is the buf log format structure with the
 		 * cancel flag in it.
 		 */
-		xfs_buf_item_trace("SIZE STALE", bip);
+		trace_xfs_buf_item_size_stale(bip);
 		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
 		return 1;
 	}
@@ -206,7 +207,7 @@ xfs_buf_item_size(
 		}
 	}
 
-	xfs_buf_item_trace("SIZE NORM", bip);
+	trace_xfs_buf_item_size(bip);
 	return nvecs;
 }
 
@@ -259,7 +260,7 @@ xfs_buf_item_format(
 		 * is the buf log format structure with the
 		 * cancel flag in it.
 		 */
-		xfs_buf_item_trace("FORMAT STALE", bip);
+		trace_xfs_buf_item_format_stale(bip);
 		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
 		bip->bli_format.blf_size = nvecs;
 		return;
@@ -335,7 +336,7 @@ xfs_buf_item_format(
 	/*
 	 * Check to make sure everything is consistent.
 	 */
-	xfs_buf_item_trace("FORMAT NORM", bip);
+	trace_xfs_buf_item_format(bip);
 	xfs_buf_item_log_check(bip);
 }
 
@@ -355,8 +356,7 @@ xfs_buf_item_pin(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 	       (bip->bli_flags & XFS_BLI_STALE));
-	xfs_buf_item_trace("PIN", bip);
-	xfs_buftrace("XFS_PIN", bp);
+	trace_xfs_buf_item_pin(bip);
 	xfs_bpin(bp);
 }
 
@@ -383,8 +383,7 @@ xfs_buf_item_unpin(
 	ASSERT(bp != NULL);
 	ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
-	xfs_buf_item_trace("UNPIN", bip);
-	xfs_buftrace("XFS_UNPIN", bp);
+	trace_xfs_buf_item_unpin(bip);
 
 	freed = atomic_dec_and_test(&bip->bli_refcount);
 	ailp = bip->bli_item.li_ailp;
@@ -395,8 +394,8 @@ xfs_buf_item_unpin(
 		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
-		xfs_buf_item_trace("UNPIN STALE", bip);
-		xfs_buftrace("XFS_UNPIN STALE", bp);
+		trace_xfs_buf_item_unpin_stale(bip);
+
 		/*
 		 * If we get called here because of an IO error, we may
 		 * or may not have the item on the AIL. xfs_trans_ail_delete()
@@ -440,8 +439,8 @@ xfs_buf_item_unpin_remove(
 	if ((atomic_read(&bip->bli_refcount) == 1) &&
 	    (bip->bli_flags & XFS_BLI_STALE)) {
 		ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
-		xfs_buf_item_trace("UNPIN REMOVE", bip);
-		xfs_buftrace("XFS_UNPIN_REMOVE", bp);
+		trace_xfs_buf_item_unpin_stale(bip);
+
 		/*
 		 * yes -- clear the xaction descriptor in-use flag
 		 * and free the chunk if required.  We can safely
@@ -495,7 +494,7 @@ xfs_buf_item_trylock(
 	XFS_BUF_HOLD(bp);
 
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	xfs_buf_item_trace("TRYLOCK SUCCESS", bip);
+	trace_xfs_buf_item_trylock(bip);
 	return XFS_ITEM_SUCCESS;
 }
 
@@ -524,7 +523,6 @@ xfs_buf_item_unlock(
 	uint		hold;
 
 	bp = bip->bli_buf;
-	xfs_buftrace("XFS_UNLOCK", bp);
 
 	/*
 	 * Clear the buffer's association with this transaction.
@@ -547,7 +545,7 @@ xfs_buf_item_unlock(
 	 */
 	if (bip->bli_flags & XFS_BLI_STALE) {
 		bip->bli_flags &= ~XFS_BLI_LOGGED;
-		xfs_buf_item_trace("UNLOCK STALE", bip);
+		trace_xfs_buf_item_unlock_stale(bip);
 		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
 		if (!aborted)
 			return;
@@ -574,7 +572,7 @@ xfs_buf_item_unlock(
 	 * release the buffer at the end of this routine.
 	 */
 	hold = bip->bli_flags & XFS_BLI_HOLD;
-	xfs_buf_item_trace("UNLOCK", bip);
+	trace_xfs_buf_item_unlock(bip);
 
 	/*
 	 * If the buf item isn't tracking any data, free it.
@@ -618,7 +616,8 @@ xfs_buf_item_committed(
 	xfs_buf_log_item_t	*bip,
 	xfs_lsn_t		lsn)
 {
-	xfs_buf_item_trace("COMMITTED", bip);
+	trace_xfs_buf_item_committed(bip);
+
 	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
 	    (bip->bli_item.li_lsn != 0)) {
 		return bip->bli_item.li_lsn;
@@ -640,7 +639,7 @@ xfs_buf_item_push(
 	xfs_buf_t	*bp;
 
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	xfs_buf_item_trace("PUSH", bip);
+	trace_xfs_buf_item_push(bip);
 
 	bp = bip->bli_buf;
 
@@ -738,9 +737,6 @@ xfs_buf_item_init(
 	bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
 	bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
 	bip->bli_format.blf_map_size = map_size;
-#ifdef XFS_BLI_TRACE
-	bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS);
-#endif
 
 #ifdef XFS_TRANS_DEBUG
 	/*
@@ -878,9 +874,6 @@ xfs_buf_item_free(
 	kmem_free(bip->bli_logged);
 #endif /* XFS_TRANS_DEBUG */
 
-#ifdef XFS_BLI_TRACE
-	ktrace_free(bip->bli_trace);
-#endif
 	kmem_zone_free(xfs_buf_item_zone, bip);
 }
 
@@ -897,7 +890,8 @@ xfs_buf_item_relse(
 {
 	xfs_buf_log_item_t	*bip;
 
-	xfs_buftrace("XFS_RELSE", bp);
+	trace_xfs_buf_item_relse(bp, _RET_IP_);
+
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
 	XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
 	if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
@@ -994,7 +988,7 @@ xfs_buf_iodone_callbacks(
 		if (XFS_FORCED_SHUTDOWN(mp)) {
 			ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
 			XFS_BUF_SUPER_STALE(bp);
-			xfs_buftrace("BUF_IODONE_CB", bp);
+			trace_xfs_buf_item_iodone(bp, _RET_IP_);
 			xfs_buf_do_callbacks(bp, lip);
 			XFS_BUF_SET_FSPRIVATE(bp, NULL);
 			XFS_BUF_CLR_IODONE_FUNC(bp);
@@ -1030,7 +1024,7 @@ xfs_buf_iodone_callbacks(
 				XFS_BUF_SET_START(bp);
 			}
 			ASSERT(XFS_BUF_IODONE_FUNC(bp));
-			xfs_buftrace("BUF_IODONE ASYNC", bp);
+			trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
 			xfs_buf_relse(bp);
 		} else {
 			/*
@@ -1053,9 +1047,7 @@ xfs_buf_iodone_callbacks(
 		}
 		return;
 	}
-#ifdef XFSERRORDEBUG
-	xfs_buftrace("XFS BUFCB NOERR", bp);
-#endif
+
 	xfs_buf_do_callbacks(bp, lip);
 	XFS_BUF_SET_FSPRIVATE(bp, NULL);
 	XFS_BUF_CLR_IODONE_FUNC(bp);
@@ -1081,7 +1073,9 @@ xfs_buf_error_relse(
 	XFS_BUF_DONE(bp);
 	XFS_BUF_UNDELAYWRITE(bp);
 	XFS_BUF_ERROR(bp,0);
-	xfs_buftrace("BUF_ERROR_RELSE", bp);
+
+	trace_xfs_buf_error_relse(bp, _RET_IP_);
+
 	if (! XFS_FORCED_SHUTDOWN(mp))
 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 	/*
@@ -1128,34 +1122,3 @@ xfs_buf_iodone(
 	xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
 	xfs_buf_item_free(bip);
 }
-
-#if defined(XFS_BLI_TRACE)
-void
-xfs_buf_item_trace(
-	char			*id,
-	xfs_buf_log_item_t	*bip)
-{
-	xfs_buf_t		*bp;
-	ASSERT(bip->bli_trace != NULL);
-
-	bp = bip->bli_buf;
-	ktrace_enter(bip->bli_trace,
-		     (void *)id,
-		     (void *)bip->bli_buf,
-		     (void *)((unsigned long)bip->bli_flags),
-		     (void *)((unsigned long)bip->bli_recur),
-		     (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
-		     (void *)((unsigned long)
-				(0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)),
-		     (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))),
-		     (void *)((unsigned long)XFS_BUF_COUNT(bp)),
-		     (void *)((unsigned long)XFS_BUF_BFLAGS(bp)),
-		     XFS_BUF_FSPRIVATE(bp, void *),
-		     XFS_BUF_FSPRIVATE2(bp, void *),
-		     (void *)(unsigned long)XFS_BUF_ISPINNED(bp),
-		     (void *)XFS_BUF_IODONE_FUNC(bp),
-		     (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
-		     (void *)bip->bli_item.li_desc,
-		     (void *)((unsigned long)bip->bli_item.li_flags));
-}
-#endif /* XFS_BLI_TRACE */
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 5a41c348bb1..217f34af00c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -70,22 +70,21 @@ typedef struct xfs_buf_log_format_t {
 #define	XFS_BLI_INODE_ALLOC_BUF	0x10
 #define XFS_BLI_STALE_INODE	0x20
 
+#define XFS_BLI_FLAGS \
+	{ XFS_BLI_HOLD,		"HOLD" }, \
+	{ XFS_BLI_DIRTY,	"DIRTY" }, \
+	{ XFS_BLI_STALE,	"STALE" }, \
+	{ XFS_BLI_LOGGED,	"LOGGED" }, \
+	{ XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
+	{ XFS_BLI_STALE_INODE,	"STALE_INODE" }
+
 
 #ifdef __KERNEL__
 
 struct xfs_buf;
-struct ktrace;
 struct xfs_mount;
 struct xfs_buf_log_item;
 
-#if defined(XFS_BLI_TRACE)
-#define	XFS_BLI_TRACE_SIZE	32
-
-void	xfs_buf_item_trace(char *, struct xfs_buf_log_item *);
-#else
-#define	xfs_buf_item_trace(id, bip)
-#endif
-
 /*
  * This is the in core log item structure used to track information
  * needed to log buffers.  It tracks how many times the lock has been
@@ -97,9 +96,6 @@ typedef struct xfs_buf_log_item {
 	unsigned int		bli_flags;	/* misc flags */
 	unsigned int		bli_recur;	/* lock recursion count */
 	atomic_t		bli_refcount;	/* cnt of tp refs */
-#ifdef XFS_BLI_TRACE
-	struct ktrace		*bli_trace;	/* event trace buf */
-#endif
 #ifdef XFS_TRANS_DEBUG
 	char			*bli_orig;	/* original buffer copy */
 	char			*bli_logged;	/* bytes logged (bitmap) */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 2847bbc1c53..c0c8869115b 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -46,6 +46,7 @@
 #include "xfs_dir2_block.h"
 #include "xfs_dir2_node.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 /*
  * xfs_da_btree.c
@@ -2107,7 +2108,7 @@ xfs_da_do_buf(
 				   (be32_to_cpu(free->hdr.magic) != XFS_DIR2_FREE_MAGIC),
 				mp, XFS_ERRTAG_DA_READ_BUF,
 				XFS_RANDOM_DA_READ_BUF))) {
-			xfs_buftrace("DA READ ERROR", rbp->bps[0]);
+			trace_xfs_da_btree_corrupt(rbp->bps[0], _RET_IP_);
 			XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
 					     XFS_ERRLEVEL_LOW, mp, info);
 			error = XFS_ERROR(EFSCORRUPTED);
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 8c536167bf7..30cd08f56a3 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -125,6 +125,13 @@ typedef struct xfs_da_args {
 #define XFS_DA_OP_OKNOENT	0x0008	/* lookup/add op, ENOENT ok, else die */
 #define XFS_DA_OP_CILOOKUP	0x0010	/* lookup to return CI name if found */
 
+#define XFS_DA_OP_FLAGS \
+	{ XFS_DA_OP_JUSTCHECK,	"JUSTCHECK" }, \
+	{ XFS_DA_OP_RENAME,	"RENAME" }, \
+	{ XFS_DA_OP_ADDNAME,	"ADDNAME" }, \
+	{ XFS_DA_OP_OKNOENT,	"OKNOENT" }, \
+	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }
+
 /*
  * Structure to describe buffer(s) for a block.
  * This is needed in the directory version 2 format case, when
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index ab89a7e94a0..d1483a4f71b 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -43,6 +43,7 @@
 #include "xfs_error.h"
 #include "xfs_rw.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 /*
  * Syssgi interface for swapext
@@ -168,7 +169,6 @@ xfs_swap_extents(
 	}
 
 	if (VN_CACHED(VFS_I(tip)) != 0) {
-		xfs_inval_cached_trace(tip, 0, -1, 0, -1);
 		error = xfs_flushinval_pages(tip, 0, -1,
 				FI_REMAPF_LOCKED);
 		if (error)
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index bb1d58eb398..93634a7e90e 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -40,9 +40,9 @@
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
 #include "xfs_dir2_node.h"
-#include "xfs_dir2_trace.h"
 #include "xfs_error.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 struct xfs_name xfs_name_dotdot = {"..", 2};
 
@@ -525,7 +525,8 @@ xfs_dir2_grow_inode(
 	xfs_trans_t	*tp;
 	xfs_drfsbno_t	nblks;
 
-	xfs_dir2_trace_args_s("grow_inode", args, space);
+	trace_xfs_dir2_grow_inode(args, space);
+
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
@@ -703,7 +704,8 @@ xfs_dir2_shrink_inode(
 	xfs_mount_t	*mp;
 	xfs_trans_t	*tp;
 
-	xfs_dir2_trace_args_db("shrink_inode", args, db, bp);
+	trace_xfs_dir2_shrink_inode(args, db);
+
 	dp = args->dp;
 	mp = dp->i_mount;
 	tp = args->trans;
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index ab52e9e1c1e..ddc4ecc7807 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -36,8 +36,8 @@
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
-#include "xfs_dir2_trace.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 /*
  * Local function prototypes.
@@ -94,7 +94,8 @@ xfs_dir2_block_addname(
 	__be16			*tagp;		/* pointer to tag value */
 	xfs_trans_t		*tp;		/* transaction structure */
 
-	xfs_dir2_trace_args("block_addname", args);
+	trace_xfs_dir2_block_addname(args);
+
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
@@ -590,7 +591,8 @@ xfs_dir2_block_lookup(
 	int			error;		/* error return value */
 	xfs_mount_t		*mp;		/* filesystem mount point */
 
-	xfs_dir2_trace_args("block_lookup", args);
+	trace_xfs_dir2_block_lookup(args);
+
 	/*
 	 * Get the buffer, look up the entry.
 	 * If not found (ENOENT) then return, have no buffer.
@@ -747,7 +749,8 @@ xfs_dir2_block_removename(
 	int			size;		/* shortform size */
 	xfs_trans_t		*tp;		/* transaction pointer */
 
-	xfs_dir2_trace_args("block_removename", args);
+	trace_xfs_dir2_block_removename(args);
+
 	/*
 	 * Look up the entry in the block.  Gets the buffer and entry index.
 	 * It will always be there, the vnodeops level does a lookup first.
@@ -823,7 +826,8 @@ xfs_dir2_block_replace(
 	int			error;		/* error return value */
 	xfs_mount_t		*mp;		/* filesystem mount point */
 
-	xfs_dir2_trace_args("block_replace", args);
+	trace_xfs_dir2_block_replace(args);
+
 	/*
 	 * Lookup the entry in the directory.  Get buffer and entry index.
 	 * This will always succeed since the caller has already done a lookup.
@@ -897,7 +901,8 @@ xfs_dir2_leaf_to_block(
 	int			to;		/* block/leaf to index */
 	xfs_trans_t		*tp;		/* transaction pointer */
 
-	xfs_dir2_trace_args_bb("leaf_to_block", args, lbp, dbp);
+	trace_xfs_dir2_leaf_to_block(args);
+
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
@@ -1044,7 +1049,8 @@ xfs_dir2_sf_to_block(
 	xfs_trans_t		*tp;		/* transaction pointer */
 	struct xfs_name		name;
 
-	xfs_dir2_trace_args("sf_to_block", args);
+	trace_xfs_dir2_sf_to_block(args);
+
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 41ad537c49e..29f484c11b3 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -38,8 +38,8 @@
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
 #include "xfs_dir2_node.h"
-#include "xfs_dir2_trace.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 /*
  * Local function declarations.
@@ -80,7 +80,8 @@ xfs_dir2_block_to_leaf(
 	int			needscan;	/* need to rescan bestfree */
 	xfs_trans_t		*tp;		/* transaction pointer */
 
-	xfs_dir2_trace_args_b("block_to_leaf", args, dbp);
+	trace_xfs_dir2_block_to_leaf(args);
+
 	dp = args->dp;
 	mp = dp->i_mount;
 	tp = args->trans;
@@ -188,7 +189,8 @@ xfs_dir2_leaf_addname(
 	xfs_trans_t		*tp;		/* transaction pointer */
 	xfs_dir2_db_t		use_block;	/* data block number */
 
-	xfs_dir2_trace_args("leaf_addname", args);
+	trace_xfs_dir2_leaf_addname(args);
+
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
@@ -1266,7 +1268,8 @@ xfs_dir2_leaf_lookup(
 	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
 	xfs_trans_t		*tp;		/* transaction pointer */
 
-	xfs_dir2_trace_args("leaf_lookup", args);
+	trace_xfs_dir2_leaf_lookup(args);
+
 	/*
 	 * Look up name in the leaf block, returning both buffers and index.
 	 */
@@ -1454,7 +1457,8 @@ xfs_dir2_leaf_removename(
 	xfs_dir2_data_off_t	oldbest;	/* old value of best free */
 	xfs_trans_t		*tp;		/* transaction pointer */
 
-	xfs_dir2_trace_args("leaf_removename", args);
+	trace_xfs_dir2_leaf_removename(args);
+
 	/*
 	 * Lookup the leaf entry, get the leaf and data blocks read in.
 	 */
@@ -1586,7 +1590,8 @@ xfs_dir2_leaf_replace(
 	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
 	xfs_trans_t		*tp;		/* transaction pointer */
 
-	xfs_dir2_trace_args("leaf_replace", args);
+	trace_xfs_dir2_leaf_replace(args);
+
 	/*
 	 * Look up the entry.
 	 */
@@ -1766,7 +1771,9 @@ xfs_dir2_node_to_leaf(
 	if (state->path.active > 1)
 		return 0;
 	args = state->args;
-	xfs_dir2_trace_args("node_to_leaf", args);
+
+	trace_xfs_dir2_node_to_leaf(args);
+
 	mp = state->mp;
 	dp = args->dp;
 	tp = args->trans;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 5a81ccd1045..ce6e355199b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -37,8 +37,8 @@
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
 #include "xfs_dir2_node.h"
-#include "xfs_dir2_trace.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 /*
  * Function declarations.
@@ -123,7 +123,8 @@ xfs_dir2_leaf_to_node(
 	__be16			*to;		/* pointer to freespace entry */
 	xfs_trans_t		*tp;		/* transaction pointer */
 
-	xfs_dir2_trace_args_b("leaf_to_node", args, lbp);
+	trace_xfs_dir2_leaf_to_node(args);
+
 	dp = args->dp;
 	mp = dp->i_mount;
 	tp = args->trans;
@@ -196,7 +197,8 @@ xfs_dir2_leafn_add(
 	xfs_mount_t		*mp;		/* filesystem mount point */
 	xfs_trans_t		*tp;		/* transaction pointer */
 
-	xfs_dir2_trace_args_sb("leafn_add", args, index, bp);
+	trace_xfs_dir2_leafn_add(args, index);
+
 	dp = args->dp;
 	mp = dp->i_mount;
 	tp = args->trans;
@@ -711,8 +713,8 @@ xfs_dir2_leafn_moveents(
 	int		stale;			/* count stale leaves copied */
 	xfs_trans_t	*tp;			/* transaction pointer */
 
-	xfs_dir2_trace_args_bibii("leafn_moveents", args, bp_s, start_s, bp_d,
-		start_d, count);
+	trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
+
 	/*
 	 * Silently return if nothing to do.
 	 */
@@ -933,7 +935,8 @@ xfs_dir2_leafn_remove(
 	int			needscan;	/* need to rescan data frees */
 	xfs_trans_t		*tp;		/* transaction pointer */
 
-	xfs_dir2_trace_args_sb("leafn_remove", args, index, bp);
+	trace_xfs_dir2_leafn_remove(args, index);
+
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
@@ -1363,7 +1366,8 @@ xfs_dir2_node_addname(
 	int			rval;		/* sub-return value */
 	xfs_da_state_t		*state;		/* btree cursor */
 
-	xfs_dir2_trace_args("node_addname", args);
+	trace_xfs_dir2_node_addname(args);
+
 	/*
 	 * Allocate and initialize the state (btree cursor).
 	 */
@@ -1822,7 +1826,8 @@ xfs_dir2_node_lookup(
 	int		rval;			/* operation return value */
 	xfs_da_state_t	*state;			/* btree cursor */
 
-	xfs_dir2_trace_args("node_lookup", args);
+	trace_xfs_dir2_node_lookup(args);
+
 	/*
 	 * Allocate and initialize the btree cursor.
 	 */
@@ -1875,7 +1880,8 @@ xfs_dir2_node_removename(
 	int			rval;		/* operation return value */
 	xfs_da_state_t		*state;		/* btree cursor */
 
-	xfs_dir2_trace_args("node_removename", args);
+	trace_xfs_dir2_node_removename(args);
+
 	/*
 	 * Allocate and initialize the btree cursor.
 	 */
@@ -1944,7 +1950,8 @@ xfs_dir2_node_replace(
 	int			rval;		/* internal return value */
 	xfs_da_state_t		*state;		/* btree cursor */
 
-	xfs_dir2_trace_args("node_replace", args);
+	trace_xfs_dir2_node_replace(args);
+
 	/*
 	 * Allocate and initialize the btree cursor.
 	 */
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index e89734e8464..9d4f17a6967 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -37,7 +37,7 @@
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
-#include "xfs_dir2_trace.h"
+#include "xfs_trace.h"
 
 /*
  * Prototypes for internal functions.
@@ -169,7 +169,8 @@ xfs_dir2_block_to_sf(
 	xfs_dir2_sf_t		*sfp;		/* shortform structure */
 	xfs_ino_t               temp;
 
-	xfs_dir2_trace_args_sb("block_to_sf", args, size, bp);
+	trace_xfs_dir2_block_to_sf(args);
+
 	dp = args->dp;
 	mp = dp->i_mount;
 
@@ -281,7 +282,8 @@ xfs_dir2_sf_addname(
 	xfs_dir2_sf_t		*sfp;		/* shortform structure */
 	xfs_dir2_sf_entry_t	*sfep = NULL;	/* shortform entry */
 
-	xfs_dir2_trace_args("sf_addname", args);
+	trace_xfs_dir2_sf_addname(args);
+
 	ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
 	dp = args->dp;
 	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
@@ -654,7 +656,8 @@ xfs_dir2_sf_create(
 	xfs_dir2_sf_t	*sfp;		/* shortform structure */
 	int		size;		/* directory size */
 
-	xfs_dir2_trace_args_i("sf_create", args, pino);
+	trace_xfs_dir2_sf_create(args);
+
 	dp = args->dp;
 
 	ASSERT(dp != NULL);
@@ -808,7 +811,8 @@ xfs_dir2_sf_lookup(
 	enum xfs_dacmp		cmp;		/* comparison result */
 	xfs_dir2_sf_entry_t	*ci_sfep;	/* case-insens. entry */
 
-	xfs_dir2_trace_args("sf_lookup", args);
+	trace_xfs_dir2_sf_lookup(args);
+
 	xfs_dir2_sf_check(args);
 	dp = args->dp;
 
@@ -891,7 +895,8 @@ xfs_dir2_sf_removename(
 	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
 	xfs_dir2_sf_t		*sfp;		/* shortform structure */
 
-	xfs_dir2_trace_args("sf_removename", args);
+	trace_xfs_dir2_sf_removename(args);
+
 	dp = args->dp;
 
 	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
@@ -982,7 +987,8 @@ xfs_dir2_sf_replace(
 	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
 	xfs_dir2_sf_t		*sfp;		/* shortform structure */
 
-	xfs_dir2_trace_args("sf_replace", args);
+	trace_xfs_dir2_sf_replace(args);
+
 	dp = args->dp;
 
 	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
@@ -1125,7 +1131,8 @@ xfs_dir2_sf_toino4(
 	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
 	xfs_dir2_sf_t		*sfp;		/* new sf directory */
 
-	xfs_dir2_trace_args("sf_toino4", args);
+	trace_xfs_dir2_sf_toino4(args);
+
 	dp = args->dp;
 
 	/*
@@ -1202,7 +1209,8 @@ xfs_dir2_sf_toino8(
 	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
 	xfs_dir2_sf_t		*sfp;		/* new sf directory */
 
-	xfs_dir2_trace_args("sf_toino8", args);
+	trace_xfs_dir2_sf_toino8(args);
+
 	dp = args->dp;
 
 	/*
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c
deleted file mode 100644
index 6cc7c0c681a..00000000000
--- a/fs/xfs/xfs_dir2_trace.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_inum.h"
-#include "xfs_dir2.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_dir2_trace.h"
-
-#ifdef XFS_DIR2_TRACE
-ktrace_t	*xfs_dir2_trace_buf;
-
-/*
- * Enter something in the trace buffers.
- */
-static void
-xfs_dir2_trace_enter(
-	xfs_inode_t	*dp,
-	int		type,
-	char		*where,
-	char		*name,
-	int		namelen,
-	void		*a0,
-	void		*a1,
-	void		*a2,
-	void		*a3,
-	void		*a4,
-	void		*a5,
-	void		*a6,
-	void		*a7)
-{
-	void		*n[5];
-
-	ASSERT(xfs_dir2_trace_buf);
-	ASSERT(dp->i_dir_trace);
-	if (name)
-		memcpy(n, name, min((int)sizeof(n), namelen));
-	else
-		memset((char *)n, 0, sizeof(n));
-	ktrace_enter(xfs_dir2_trace_buf,
-		(void *)(long)type, (void *)where,
-		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
-		(void *)a4, (void *)a5, (void *)a6, (void *)a7,
-		(void *)(long)namelen,
-		(void *)n[0], (void *)n[1], (void *)n[2],
-		(void *)n[3], (void *)n[4]);
-	ktrace_enter(dp->i_dir_trace,
-		(void *)(long)type, (void *)where,
-		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
-		(void *)a4, (void *)a5, (void *)a6, (void *)a7,
-		(void *)(long)namelen,
-		(void *)n[0], (void *)n[1], (void *)n[2],
-		(void *)n[3], (void *)n[4]);
-}
-
-void
-xfs_dir2_trace_args(
-	char		*where,
-	xfs_da_args_t	*args)
-{
-	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS, where,
-		(char *)args->name, (int)args->namelen,
-		(void *)(unsigned long)args->hashval,
-		(void *)((unsigned long)(args->inumber >> 32)),
-		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
-		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
-		NULL, NULL);
-}
-
-void
-xfs_dir2_trace_args_b(
-	char		*where,
-	xfs_da_args_t	*args,
-	xfs_dabuf_t	*bp)
-{
-	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_B, where,
-		(char *)args->name, (int)args->namelen,
-		(void *)(unsigned long)args->hashval,
-		(void *)((unsigned long)(args->inumber >> 32)),
-		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
-		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
-		(void *)(bp ? bp->bps[0] : NULL), NULL);
-}
-
-void
-xfs_dir2_trace_args_bb(
-	char		*where,
-	xfs_da_args_t	*args,
-	xfs_dabuf_t	*lbp,
-	xfs_dabuf_t	*dbp)
-{
-	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BB, where,
-		(char *)args->name, (int)args->namelen,
-		(void *)(unsigned long)args->hashval,
-		(void *)((unsigned long)(args->inumber >> 32)),
-		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
-		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
-		(void *)(lbp ? lbp->bps[0] : NULL),
-		(void *)(dbp ? dbp->bps[0] : NULL));
-}
-
-void
-xfs_dir2_trace_args_bibii(
-	char		*where,
-	xfs_da_args_t	*args,
-	xfs_dabuf_t	*bs,
-	int		ss,
-	xfs_dabuf_t	*bd,
-	int		sd,
-	int		c)
-{
-	xfs_buf_t	*bpbs = bs ? bs->bps[0] : NULL;
-	xfs_buf_t	*bpbd = bd ? bd->bps[0] : NULL;
-
-	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BIBII, where,
-		(char *)args->name, (int)args->namelen,
-		(void *)args->dp, (void *)args->trans,
-		(void *)bpbs, (void *)(long)ss, (void *)bpbd, (void *)(long)sd,
-		(void *)(long)c, NULL);
-}
-
-void
-xfs_dir2_trace_args_db(
-	char		*where,
-	xfs_da_args_t	*args,
-	xfs_dir2_db_t	db,
-	xfs_dabuf_t	*bp)
-{
-	xfs_buf_t	*dbp = bp ? bp->bps[0] : NULL;
-
-	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_DB, where,
-		(char *)args->name, (int)args->namelen,
-		(void *)(unsigned long)args->hashval,
-		(void *)((unsigned long)(args->inumber >> 32)),
-		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
-		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
-		(void *)(long)db, (void *)dbp);
-}
-
-void
-xfs_dir2_trace_args_i(
-	char		*where,
-	xfs_da_args_t	*args,
-	xfs_ino_t	i)
-{
-	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_I, where,
-		(char *)args->name, (int)args->namelen,
-		(void *)(unsigned long)args->hashval,
-		(void *)((unsigned long)(args->inumber >> 32)),
-		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
-		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
-		(void *)((unsigned long)(i >> 32)),
-		(void *)((unsigned long)(i & 0xFFFFFFFF)));
-}
-
-void
-xfs_dir2_trace_args_s(
-	char		*where,
-	xfs_da_args_t	*args,
-	int		s)
-{
-	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_S, where,
-		(char *)args->name, (int)args->namelen,
-		(void *)(unsigned long)args->hashval,
-		(void *)((unsigned long)(args->inumber >> 32)),
-		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
-		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
-		(void *)(long)s, NULL);
-}
-
-void
-xfs_dir2_trace_args_sb(
-	char		*where,
-	xfs_da_args_t	*args,
-	int		s,
-	xfs_dabuf_t	*bp)
-{
-	xfs_buf_t	*dbp = bp ? bp->bps[0] : NULL;
-
-	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_SB, where,
-		(char *)args->name, (int)args->namelen,
-		(void *)(unsigned long)args->hashval,
-		(void *)((unsigned long)(args->inumber >> 32)),
-		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
-		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
-		(void *)(long)s, (void *)dbp);
-}
-#endif	/* XFS_DIR2_TRACE */
diff --git a/fs/xfs/xfs_dir2_trace.h b/fs/xfs/xfs_dir2_trace.h
deleted file mode 100644
index ca3c754f482..00000000000
--- a/fs/xfs/xfs_dir2_trace.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR2_TRACE_H__
-#define __XFS_DIR2_TRACE_H__
-
-/*
- * Tracing for xfs v2 directories.
- */
-
-#if defined(XFS_DIR2_TRACE)
-
-struct ktrace;
-struct xfs_dabuf;
-struct xfs_da_args;
-
-#define	XFS_DIR2_GTRACE_SIZE		4096	/* global buffer */
-#define	XFS_DIR2_KTRACE_SIZE		32	/* per-inode buffer */
-extern struct ktrace *xfs_dir2_trace_buf;
-
-#define	XFS_DIR2_KTRACE_ARGS		1	/* args only */
-#define	XFS_DIR2_KTRACE_ARGS_B		2	/* args + buffer */
-#define	XFS_DIR2_KTRACE_ARGS_BB		3	/* args + 2 buffers */
-#define	XFS_DIR2_KTRACE_ARGS_DB		4	/* args, db, buffer */
-#define	XFS_DIR2_KTRACE_ARGS_I		5	/* args, inum */
-#define	XFS_DIR2_KTRACE_ARGS_S		6	/* args, int */
-#define	XFS_DIR2_KTRACE_ARGS_SB		7	/* args, int, buffer */
-#define	XFS_DIR2_KTRACE_ARGS_BIBII	8	/* args, buf/int/buf/int/int */
-
-void xfs_dir2_trace_args(char *where, struct xfs_da_args *args);
-void xfs_dir2_trace_args_b(char *where, struct xfs_da_args *args,
-			   struct xfs_dabuf *bp);
-void xfs_dir2_trace_args_bb(char *where, struct xfs_da_args *args,
-			    struct xfs_dabuf *lbp, struct xfs_dabuf *dbp);
-void xfs_dir2_trace_args_bibii(char *where, struct xfs_da_args *args,
-			       struct xfs_dabuf *bs, int ss,
-			       struct xfs_dabuf *bd, int sd, int c);
-void xfs_dir2_trace_args_db(char *where, struct xfs_da_args *args,
-			    xfs_dir2_db_t db, struct xfs_dabuf *bp);
-void xfs_dir2_trace_args_i(char *where, struct xfs_da_args *args, xfs_ino_t i);
-void xfs_dir2_trace_args_s(char *where, struct xfs_da_args *args, int s);
-void xfs_dir2_trace_args_sb(char *where, struct xfs_da_args *args, int s,
-			    struct xfs_dabuf *bp);
-
-#else	/* XFS_DIR2_TRACE */
-
-#define	xfs_dir2_trace_args(where, args)
-#define	xfs_dir2_trace_args_b(where, args, bp)
-#define	xfs_dir2_trace_args_bb(where, args, lbp, dbp)
-#define	xfs_dir2_trace_args_bibii(where, args, bs, ss, bd, sd, c)
-#define	xfs_dir2_trace_args_db(where, args, db, bp)
-#define	xfs_dir2_trace_args_i(where, args, i)
-#define	xfs_dir2_trace_args_s(where, args, s)
-#define	xfs_dir2_trace_args_sb(where, args, s, bp)
-
-#endif	/* XFS_DIR2_TRACE */
-
-#endif	/* __XFS_DIR2_TRACE_H__ */
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index edf8bdf4141..a631e1451ab 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -34,6 +34,7 @@
 #include "xfs_utils.h"
 #include "xfs_mru_cache.h"
 #include "xfs_filestream.h"
+#include "xfs_trace.h"
 
 #ifdef XFS_FILESTREAMS_TRACE
 
@@ -394,9 +395,7 @@ xfs_filestream_init(void)
 	item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
 	if (!item_zone)
 		return -ENOMEM;
-#ifdef XFS_FILESTREAMS_TRACE
-	xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS);
-#endif
+
 	return 0;
 }
 
@@ -407,9 +406,6 @@ xfs_filestream_init(void)
 void
 xfs_filestream_uninit(void)
 {
-#ifdef XFS_FILESTREAMS_TRACE
-	ktrace_free(xfs_filestreams_trace_buf);
-#endif
 	kmem_zone_destroy(item_zone);
 }
 
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index f655f7dc334..4aba67c5f64 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -79,7 +79,7 @@ extern ktrace_t *xfs_filestreams_trace_buf;
  * the cache that reference per-ag array elements that have since been
  * reallocated.
  */
-STATIC_INLINE int
+static inline int
 xfs_filestream_peek_ag(
 	xfs_mount_t	*mp,
 	xfs_agnumber_t	agno)
@@ -87,7 +87,7 @@ xfs_filestream_peek_ag(
 	return atomic_read(&mp->m_perag[agno].pagf_fstrms);
 }
 
-STATIC_INLINE int
+static inline int
 xfs_filestream_get_ag(
 	xfs_mount_t	*mp,
 	xfs_agnumber_t	agno)
@@ -95,7 +95,7 @@ xfs_filestream_get_ag(
 	return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms);
 }
 
-STATIC_INLINE int
+static inline int
 xfs_filestream_put_ag(
 	xfs_mount_t	*mp,
 	xfs_agnumber_t	agno)
@@ -122,7 +122,7 @@ int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
 
 
 /* filestreams for the inode? */
-STATIC_INLINE int
+static inline int
 xfs_inode_is_filestream(
 	struct xfs_inode	*ip)
 {
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 2d0b3e1da9e..a13919a6a36 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -45,6 +45,7 @@
 #include "xfs_rtalloc.h"
 #include "xfs_rw.h"
 #include "xfs_filestream.h"
+#include "xfs_trace.h"
 
 /*
  * File system operations
@@ -201,8 +202,8 @@ xfs_growfs_data_private(
 		 * AG freelist header block
 		 */
 		bp = xfs_buf_get(mp->m_ddev_targp,
-				  XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-				  XFS_FSS_TO_BB(mp, 1), 0);
+				 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+				 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
 		agf = XFS_BUF_TO_AGF(bp);
 		memset(agf, 0, mp->m_sb.sb_sectsize);
 		agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -233,8 +234,8 @@ xfs_growfs_data_private(
 		 * AG inode header block
 		 */
 		bp = xfs_buf_get(mp->m_ddev_targp,
-				  XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-				  XFS_FSS_TO_BB(mp, 1), 0);
+				 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+				 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
 		agi = XFS_BUF_TO_AGI(bp);
 		memset(agi, 0, mp->m_sb.sb_sectsize);
 		agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -257,8 +258,9 @@ xfs_growfs_data_private(
 		 * BNO btree root block
 		 */
 		bp = xfs_buf_get(mp->m_ddev_targp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0);
+				 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+				 BTOBB(mp->m_sb.sb_blocksize),
+				 XBF_LOCK | XBF_MAPPED);
 		block = XFS_BUF_TO_BLOCK(bp);
 		memset(block, 0, mp->m_sb.sb_blocksize);
 		block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
@@ -278,8 +280,9 @@ xfs_growfs_data_private(
 		 * CNT btree root block
 		 */
 		bp = xfs_buf_get(mp->m_ddev_targp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0);
+				 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+				 BTOBB(mp->m_sb.sb_blocksize),
+				 XBF_LOCK | XBF_MAPPED);
 		block = XFS_BUF_TO_BLOCK(bp);
 		memset(block, 0, mp->m_sb.sb_blocksize);
 		block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
@@ -300,8 +303,9 @@ xfs_growfs_data_private(
 		 * INO btree root block
 		 */
 		bp = xfs_buf_get(mp->m_ddev_targp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0);
+				 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+				 BTOBB(mp->m_sb.sb_blocksize),
+				 XBF_LOCK | XBF_MAPPED);
 		block = XFS_BUF_TO_BLOCK(bp);
 		memset(block, 0, mp->m_sb.sb_blocksize);
 		block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
@@ -344,6 +348,7 @@ xfs_growfs_data_private(
 		be32_add_cpu(&agf->agf_length, new);
 		ASSERT(be32_to_cpu(agf->agf_length) ==
 		       be32_to_cpu(agi->agi_length));
+
 		xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
 		/*
 		 * Free the new space.
@@ -611,7 +616,7 @@ xfs_fs_log_dummy(
 	xfs_inode_t	*ip;
 	int		error;
 
-	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
 	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 0785797db82..cb907ba69c4 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -425,7 +425,7 @@ xfs_ialloc_ag_alloc(
 	return 0;
 }
 
-STATIC_INLINE xfs_agnumber_t
+STATIC xfs_agnumber_t
 xfs_ialloc_next_ag(
 	xfs_mount_t	*mp)
 {
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 80e526489be..f5c904a10c1 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,7 +43,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_btree_trace.h"
-#include "xfs_dir2_trace.h"
+#include "xfs_trace.h"
 
 
 /*
@@ -73,6 +73,9 @@ xfs_inode_alloc(
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
+	ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+
+	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 
 	/* initialise the xfs inode */
 	ip->i_ino = ino;
@@ -87,28 +90,6 @@ xfs_inode_alloc(
 	ip->i_size = 0;
 	ip->i_new_size = 0;
 
-	/*
-	 * Initialize inode's trace buffers.
-	 */
-#ifdef	XFS_INODE_TRACE
-	ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_BMAP_TRACE
-	ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_BTREE_TRACE
-	ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_RW_TRACE
-	ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_ILOCK_TRACE
-	ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_DIR2_TRACE
-	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
-#endif
-
 	/* prevent anyone from using this yet */
 	VFS_I(ip)->i_state = I_NEW|I_LOCK;
 
@@ -130,25 +111,6 @@ xfs_inode_free(
 	if (ip->i_afp)
 		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 
-#ifdef XFS_INODE_TRACE
-	ktrace_free(ip->i_trace);
-#endif
-#ifdef XFS_BMAP_TRACE
-	ktrace_free(ip->i_xtrace);
-#endif
-#ifdef XFS_BTREE_TRACE
-	ktrace_free(ip->i_btrace);
-#endif
-#ifdef XFS_RW_TRACE
-	ktrace_free(ip->i_rwtrace);
-#endif
-#ifdef XFS_ILOCK_TRACE
-	ktrace_free(ip->i_lock_trace);
-#endif
-#ifdef XFS_DIR2_TRACE
-	ktrace_free(ip->i_dir_trace);
-#endif
-
 	if (ip->i_itemp) {
 		/*
 		 * Only if we are shutting down the fs will we see an
@@ -207,6 +169,7 @@ xfs_iget_cache_hit(
 	 *	     instead of polling for it.
 	 */
 	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
+		trace_xfs_iget_skip(ip);
 		XFS_STATS_INC(xs_ig_frecycle);
 		error = EAGAIN;
 		goto out_error;
@@ -225,7 +188,7 @@ xfs_iget_cache_hit(
 	 * Need to carefully get it back into useable state.
 	 */
 	if (ip->i_flags & XFS_IRECLAIMABLE) {
-		xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
+		trace_xfs_iget_reclaim(ip);
 
 		/*
 		 * We need to set XFS_INEW atomically with clearing the
@@ -251,6 +214,7 @@ xfs_iget_cache_hit(
 			ip->i_flags &= ~XFS_INEW;
 			ip->i_flags |= XFS_IRECLAIMABLE;
 			__xfs_inode_set_reclaim_tag(pag, ip);
+			trace_xfs_iget_reclaim(ip);
 			goto out_error;
 		}
 		inode->i_state = I_LOCK|I_NEW;
@@ -270,8 +234,9 @@ xfs_iget_cache_hit(
 		xfs_ilock(ip, lock_flags);
 
 	xfs_iflags_clear(ip, XFS_ISTALE);
-	xfs_itrace_exit_tag(ip, "xfs_iget.found");
 	XFS_STATS_INC(xs_ig_found);
+
+	trace_xfs_iget_found(ip);
 	return 0;
 
 out_error:
@@ -290,7 +255,7 @@ xfs_iget_cache_miss(
 	struct xfs_inode	**ipp,
 	xfs_daddr_t		bno,
 	int			flags,
-	int			lock_flags) __releases(pag->pag_ici_lock)
+	int			lock_flags)
 {
 	struct xfs_inode	*ip;
 	int			error;
@@ -305,7 +270,7 @@ xfs_iget_cache_miss(
 	if (error)
 		goto out_destroy;
 
-	xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
+	xfs_itrace_entry(ip);
 
 	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
 		error = ENOENT;
@@ -350,6 +315,8 @@ xfs_iget_cache_miss(
 
 	write_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
+
+	trace_xfs_iget_alloc(ip);
 	*ipp = ip;
 	return 0;
 
@@ -636,7 +603,7 @@ xfs_ilock(
 	else if (lock_flags & XFS_ILOCK_SHARED)
 		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 
-	xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address);
+	trace_xfs_ilock(ip, lock_flags, _RET_IP_);
 }
 
 /*
@@ -681,7 +648,7 @@ xfs_ilock_nowait(
 		if (!mrtryaccess(&ip->i_lock))
 			goto out_undo_iolock;
 	}
-	xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address);
+	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
 	return 1;
 
  out_undo_iolock:
@@ -743,7 +710,7 @@ xfs_iunlock(
 		xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
 					(xfs_log_item_t*)(ip->i_itemp));
 	}
-	xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
+	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 }
 
 /*
@@ -762,6 +729,8 @@ xfs_ilock_demote(
 		mrdemote(&ip->i_lock);
 	if (lock_flags & XFS_IOLOCK_EXCL)
 		mrdemote(&ip->i_iolock);
+
+	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
 }
 
 #ifdef DEBUG
@@ -792,52 +761,3 @@ xfs_isilocked(
 	return 1;
 }
 #endif
-
-#ifdef	XFS_INODE_TRACE
-
-#define KTRACE_ENTER(ip, vk, s, line, ra)			\
-	ktrace_enter((ip)->i_trace,				\
-/*  0 */		(void *)(__psint_t)(vk),		\
-/*  1 */		(void *)(s),				\
-/*  2 */		(void *)(__psint_t) line,		\
-/*  3 */		(void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \
-/*  4 */		(void *)(ra),				\
-/*  5 */		NULL,					\
-/*  6 */		(void *)(__psint_t)current_cpu(),	\
-/*  7 */		(void *)(__psint_t)current_pid(),	\
-/*  8 */		(void *)__return_address,		\
-/*  9 */		NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-
-/*
- * Vnode tracing code.
- */
-void
-_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
-{
-	KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
-}
-
-void
-_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
-{
-	KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
-}
-
-void
-xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
-{
-	KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
-}
-
-void
-_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
-{
-	KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
-}
-
-void
-xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
-{
-	KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
-}
-#endif	/* XFS_INODE_TRACE */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b92a4fa2a0a..ce278b3ae7f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -47,10 +47,10 @@
 #include "xfs_rw.h"
 #include "xfs_error.h"
 #include "xfs_utils.h"
-#include "xfs_dir2_trace.h"
 #include "xfs_quota.h"
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 kmem_zone_t *xfs_ifork_zone;
 kmem_zone_t *xfs_inode_zone;
@@ -1291,42 +1291,6 @@ xfs_file_last_byte(
 	return last_byte;
 }
 
-#if defined(XFS_RW_TRACE)
-STATIC void
-xfs_itrunc_trace(
-	int		tag,
-	xfs_inode_t	*ip,
-	int		flag,
-	xfs_fsize_t	new_size,
-	xfs_off_t	toss_start,
-	xfs_off_t	toss_finish)
-{
-	if (ip->i_rwtrace == NULL) {
-		return;
-	}
-
-	ktrace_enter(ip->i_rwtrace,
-		     (void*)((long)tag),
-		     (void*)ip,
-		     (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff),
-		     (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff),
-		     (void*)((long)flag),
-		     (void*)(unsigned long)((new_size >> 32) & 0xffffffff),
-		     (void*)(unsigned long)(new_size & 0xffffffff),
-		     (void*)(unsigned long)((toss_start >> 32) & 0xffffffff),
-		     (void*)(unsigned long)(toss_start & 0xffffffff),
-		     (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff),
-		     (void*)(unsigned long)(toss_finish & 0xffffffff),
-		     (void*)(unsigned long)current_cpu(),
-		     (void*)(unsigned long)current_pid(),
-		     (void*)NULL,
-		     (void*)NULL,
-		     (void*)NULL);
-}
-#else
-#define	xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
-#endif
-
 /*
  * Start the truncation of the file to new_size.  The new size
  * must be smaller than the current size.  This routine will
@@ -1409,8 +1373,7 @@ xfs_itruncate_start(
 		return 0;
 	}
 	last_byte = xfs_file_last_byte(ip);
-	xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
-			 last_byte);
+	trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte);
 	if (last_byte > toss_start) {
 		if (flags & XFS_ITRUNC_DEFINITE) {
 			xfs_tosspages(ip, toss_start,
@@ -1514,7 +1477,8 @@ xfs_itruncate_finish(
 		new_size = 0LL;
 	}
 	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
-	xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
+	trace_xfs_itruncate_finish_start(ip, new_size);
+
 	/*
 	 * The first thing we do is set the size to new_size permanently
 	 * on disk.  This way we don't have to worry about anyone ever
@@ -1731,7 +1695,7 @@ xfs_itruncate_finish(
 	ASSERT((new_size != 0) ||
 	       (fork == XFS_ATTR_FORK) ||
 	       (ip->i_d.di_nextents == 0));
-	xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
+	trace_xfs_itruncate_finish_end(ip, new_size);
 	return 0;
 }
 
@@ -3252,23 +3216,6 @@ corrupt_out:
 	return XFS_ERROR(EFSCORRUPTED);
 }
 
-
-
-#ifdef XFS_ILOCK_TRACE
-void
-xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
-{
-	ktrace_enter(ip->i_lock_trace,
-		     (void *)ip,
-		     (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */
-		     (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */
-		     (void *)ra,		/* caller of ilock */
-		     (void *)(unsigned long)current_cpu(),
-		     (void *)(unsigned long)current_pid(),
-		     NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
-}
-#endif
-
 /*
  * Return a pointer to the extent record at file index idx.
  */
@@ -3300,13 +3247,17 @@ xfs_iext_get_ext(
  */
 void
 xfs_iext_insert(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_inode_t	*ip,		/* incore inode pointer */
 	xfs_extnum_t	idx,		/* starting index of new items */
 	xfs_extnum_t	count,		/* number of inserted items */
-	xfs_bmbt_irec_t	*new)		/* items to insert */
+	xfs_bmbt_irec_t	*new,		/* items to insert */
+	int		state)		/* type of extent conversion */
 {
+	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
 	xfs_extnum_t	i;		/* extent record index */
 
+	trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
+
 	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
 	xfs_iext_add(ifp, idx, count);
 	for (i = idx; i < idx + count; i++, new++)
@@ -3549,13 +3500,17 @@ xfs_iext_add_indirect_multi(
  */
 void
 xfs_iext_remove(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_inode_t	*ip,		/* incore inode pointer */
 	xfs_extnum_t	idx,		/* index to begin removing exts */
-	int		ext_diff)	/* number of extents to remove */
+	int		ext_diff,	/* number of extents to remove */
+	int		state)		/* type of extent conversion */
 {
+	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
 	xfs_extnum_t	nextents;	/* number of extents in file */
 	int		new_size;	/* size of extents after removal */
 
+	trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
+
 	ASSERT(ext_diff > 0);
 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
 	new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 41555de1d1d..ec1f28c4fc4 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -213,7 +213,6 @@ typedef struct xfs_icdinode {
 
 struct bhv_desc;
 struct cred;
-struct ktrace;
 struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_bmbt_irec;
@@ -222,13 +221,6 @@ struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot;
 
-#if defined(XFS_ILOCK_TRACE)
-#define XFS_ILOCK_KTRACE_SIZE	32
-extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
-#else
-#define	xfs_ilock_trace(i,n,f,ra)
-#endif
-
 typedef struct dm_attrs_s {
 	__uint32_t	da_dmevmask;	/* DMIG event mask */
 	__uint16_t	da_dmstate;	/* DMIG state info */
@@ -271,26 +263,6 @@ typedef struct xfs_inode {
 
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
-
-	/* Trace buffers per inode. */
-#ifdef XFS_INODE_TRACE
-	struct ktrace		*i_trace;	/* general inode trace */
-#endif
-#ifdef XFS_BMAP_TRACE
-	struct ktrace		*i_xtrace;	/* inode extent list trace */
-#endif
-#ifdef XFS_BTREE_TRACE
-	struct ktrace		*i_btrace;	/* inode bmap btree trace */
-#endif
-#ifdef XFS_RW_TRACE
-	struct ktrace		*i_rwtrace;	/* inode read/write trace */
-#endif
-#ifdef XFS_ILOCK_TRACE
-	struct ktrace		*i_lock_trace;	/* inode lock/unlock trace */
-#endif
-#ifdef XFS_DIR2_TRACE
-	struct ktrace		*i_dir_trace;	/* inode directory trace */
-#endif
 } xfs_inode_t;
 
 #define XFS_ISIZE(ip)	(((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
@@ -406,6 +378,14 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 #define XFS_LOCK_MASK		(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
 				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
 
+#define XFS_LOCK_FLAGS \
+	{ XFS_IOLOCK_EXCL,	"IOLOCK_EXCL" }, \
+	{ XFS_IOLOCK_SHARED,	"IOLOCK_SHARED" }, \
+	{ XFS_ILOCK_EXCL,	"ILOCK_EXCL" }, \
+	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }, \
+	{ XFS_IUNLOCK_NONOTIFY,	"IUNLOCK_NONOTIFY" }
+
+
 /*
  * Flags for lockdep annotations.
  *
@@ -455,6 +435,10 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 #define	XFS_ITRUNC_DEFINITE	0x1
 #define	XFS_ITRUNC_MAYBE	0x2
 
+#define XFS_ITRUNC_FLAGS \
+	{ XFS_ITRUNC_DEFINITE,	"DEFINITE" }, \
+	{ XFS_ITRUNC_MAYBE,	"MAYBE" }
+
 /*
  * For multiple groups support: if S_ISGID bit is set in the parent
  * directory, group of new file is set to that of the parent, and
@@ -507,48 +491,16 @@ void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 void		xfs_synchronize_times(xfs_inode_t *);
 void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
 
-#if defined(XFS_INODE_TRACE)
-
-#define	INODE_TRACE_SIZE	16		/* number of trace entries */
-#define	INODE_KTRACE_ENTRY	1
-#define	INODE_KTRACE_EXIT	2
-#define	INODE_KTRACE_HOLD	3
-#define	INODE_KTRACE_REF	4
-#define	INODE_KTRACE_RELE	5
-
-extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
-extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
-extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
-extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
-extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
-#define xfs_itrace_entry(ip)	\
-	_xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
-#define xfs_itrace_exit(ip)	\
-	_xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
-#define xfs_itrace_exit_tag(ip, tag)	\
-	_xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
-#define xfs_itrace_ref(ip)	\
-	_xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
-
-#else
-#define	xfs_itrace_entry(a)
-#define	xfs_itrace_exit(a)
-#define	xfs_itrace_exit_tag(a, b)
-#define	xfs_itrace_hold(a, b, c, d)
-#define	xfs_itrace_ref(a)
-#define	xfs_itrace_rele(a, b, c, d)
-#endif
-
 #define IHOLD(ip) \
 do { \
 	ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
 	atomic_inc(&(VFS_I(ip)->i_count)); \
-	xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
+	trace_xfs_ihold(ip, _THIS_IP_); \
 } while (0)
 
 #define IRELE(ip) \
 do { \
-	xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
+	trace_xfs_irele(ip, _THIS_IP_); \
 	iput(VFS_I(ip)); \
 } while (0)
 
@@ -577,11 +529,11 @@ int		xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
 int		xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
 
 xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
-void		xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t,
-				xfs_bmbt_irec_t *);
+void		xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t,
+				xfs_bmbt_irec_t *, int);
 void		xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int);
 void		xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int);
-void		xfs_iext_remove(xfs_ifork_t *, xfs_extnum_t, int);
+void		xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int);
 void		xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
 void		xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
 void		xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 9794b876d6f..f38855d21ea 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -41,6 +41,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_rw.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 
 kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
@@ -800,7 +801,9 @@ xfs_inode_item_pushbuf(
 				  !completion_done(&ip->i_flush));
 			iip->ili_pushbuf_flag = 0;
 			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			xfs_buftrace("INODE ITEM PUSH", bp);
+
+			trace_xfs_inode_item_push(bp, _RET_IP_);
+
 			if (XFS_BUF_ISPINNED(bp)) {
 				xfs_log_force(mp, (xfs_lsn_t)0,
 					      XFS_LOG_FORCE);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 67ae5555a30..0b65039951a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,72 +47,8 @@
 #include "xfs_trans_space.h"
 #include "xfs_utils.h"
 #include "xfs_iomap.h"
+#include "xfs_trace.h"
 
-#if defined(XFS_RW_TRACE)
-void
-xfs_iomap_enter_trace(
-	int		tag,
-	xfs_inode_t	*ip,
-	xfs_off_t	offset,
-	ssize_t		count)
-{
-	if (!ip->i_rwtrace)
-		return;
-
-	ktrace_enter(ip->i_rwtrace,
-		(void *)((unsigned long)tag),
-		(void *)ip,
-		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
-		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(offset & 0xffffffff)),
-		(void *)((unsigned long)count),
-		(void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(ip->i_new_size & 0xffffffff)),
-		(void *)((unsigned long)current_pid()),
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL);
-}
-
-void
-xfs_iomap_map_trace(
-	int		tag,
-	xfs_inode_t	*ip,
-	xfs_off_t	offset,
-	ssize_t		count,
-	xfs_iomap_t	*iomapp,
-	xfs_bmbt_irec_t	*imapp,
-	int		flags)
-{
-	if (!ip->i_rwtrace)
-		return;
-
-	ktrace_enter(ip->i_rwtrace,
-		(void *)((unsigned long)tag),
-		(void *)ip,
-		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
-		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(offset & 0xffffffff)),
-		(void *)((unsigned long)count),
-		(void *)((unsigned long)flags),
-		(void *)((unsigned long)((iomapp->iomap_offset >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(iomapp->iomap_offset & 0xffffffff)),
-		(void *)((unsigned long)(iomapp->iomap_delta)),
-		(void *)((unsigned long)(iomapp->iomap_bsize)),
-		(void *)((unsigned long)(iomapp->iomap_bn)),
-		(void *)(__psint_t)(imapp->br_startoff),
-		(void *)((unsigned long)(imapp->br_blockcount)),
-		(void *)(__psint_t)(imapp->br_startblock));
-}
-#else
-#define xfs_iomap_enter_trace(tag, io, offset, count)
-#define xfs_iomap_map_trace(tag, io, offset, count, iomapp, imapp, flags)
-#endif
 
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
 						<< mp->m_writeio_log)
@@ -187,21 +123,20 @@ xfs_iomap(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
+	trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
+
 	switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
 	case BMAPI_READ:
-		xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, ip, offset, count);
 		lockmode = xfs_ilock_map_shared(ip);
 		bmapi_flags = XFS_BMAPI_ENTIRE;
 		break;
 	case BMAPI_WRITE:
-		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, ip, offset, count);
 		lockmode = XFS_ILOCK_EXCL;
 		if (flags & BMAPI_IGNSTATE)
 			bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
 		xfs_ilock(ip, lockmode);
 		break;
 	case BMAPI_ALLOCATE:
-		xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, ip, offset, count);
 		lockmode = XFS_ILOCK_SHARED;
 		bmapi_flags = XFS_BMAPI_ENTIRE;
 
@@ -237,8 +172,7 @@ xfs_iomap(
 		if (nimaps &&
 		    (imap.br_startblock != HOLESTARTBLOCK) &&
 		    (imap.br_startblock != DELAYSTARTBLOCK)) {
-			xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip,
-					offset, count, iomapp, &imap, flags);
+			trace_xfs_iomap_found(ip, offset, count, flags, &imap);
 			break;
 		}
 
@@ -250,8 +184,7 @@ xfs_iomap(
 						      &imap, &nimaps);
 		}
 		if (!error) {
-			xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, ip,
-					offset, count, iomapp, &imap, flags);
+			trace_xfs_iomap_alloc(ip, offset, count, flags, &imap);
 		}
 		iomap_flags = IOMAP_NEW;
 		break;
@@ -261,8 +194,7 @@ xfs_iomap(
 		lockmode = 0;
 
 		if (nimaps && !isnullstartblock(imap.br_startblock)) {
-			xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip,
-					offset, count, iomapp, &imap, flags);
+			trace_xfs_iomap_found(ip, offset, count, flags, &imap);
 			break;
 		}
 
@@ -623,8 +555,7 @@ retry:
 	 * delalloc blocks and retry without EOF preallocation.
 	 */
 	if (nimaps == 0) {
-		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
-					ip, offset, count);
+		trace_xfs_delalloc_enospc(ip, offset, count);
 		if (flushed)
 			return XFS_ERROR(ENOSPC);
 
@@ -837,7 +768,7 @@ xfs_iomap_write_unwritten(
 	int		committed;
 	int		error;
 
-	xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN, ip, offset, count);
+	trace_xfs_unwritten_convert(ip, offset, count);
 
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
@@ -860,8 +791,15 @@ xfs_iomap_write_unwritten(
 		 * set up a transaction to convert the range of extents
 		 * from unwritten to real. Do allocations in a loop until
 		 * we have covered the range passed in.
+		 *
+		 * Note that we open code the transaction allocation here
+		 * to pass KM_NOFS--we can't risk to recursing back into
+		 * the filesystem here as we might be asked to write out
+		 * the same inode that we complete here and might deadlock
+		 * on the iolock.
 		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+		xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+		tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
 		tp->t_flags |= XFS_TRANS_RESERVE;
 		error = xfs_trans_reserve(tp, resblks,
 				XFS_WRITE_LOG_RES(mp), 0,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index fdcf7b82747..174f2999099 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -43,6 +43,14 @@ typedef enum {
 	BMAPI_TRYLOCK = (1 << 7),	/* non-blocking request */
 } bmapi_flags_t;
 
+#define BMAPI_FLAGS \
+	{ BMAPI_READ,		"READ" }, \
+	{ BMAPI_WRITE,		"WRITE" }, \
+	{ BMAPI_ALLOCATE,	"ALLOCATE" }, \
+	{ BMAPI_IGNSTATE,	"IGNSTATE" }, \
+	{ BMAPI_DIRECT,		"DIRECT" }, \
+	{ BMAPI_MMAP,		"MMAP" }, \
+	{ BMAPI_TRYLOCK,	"TRYLOCK" }
 
 /*
  * xfs_iomap_t:  File system I/O map
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 9dbdff3ea48..4cb1792040e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -40,6 +40,7 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_rw.h"
+#include "xfs_trace.h"
 
 kmem_zone_t	*xfs_log_ticket_zone;
 
@@ -122,85 +123,6 @@ STATIC void	xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
 
 STATIC int	xlog_iclogs_empty(xlog_t *log);
 
-#if defined(XFS_LOG_TRACE)
-
-#define XLOG_TRACE_LOGGRANT_SIZE	2048
-#define XLOG_TRACE_ICLOG_SIZE		256
-
-void
-xlog_trace_loggrant_alloc(xlog_t *log)
-{
-	log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS);
-}
-
-void
-xlog_trace_loggrant_dealloc(xlog_t *log)
-{
-	ktrace_free(log->l_grant_trace);
-}
-
-void
-xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
-{
-	unsigned long cnts;
-
-	/* ticket counts are 1 byte each */
-	cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
-
-	ktrace_enter(log->l_grant_trace,
-		     (void *)tic,
-		     (void *)log->l_reserve_headq,
-		     (void *)log->l_write_headq,
-		     (void *)((unsigned long)log->l_grant_reserve_cycle),
-		     (void *)((unsigned long)log->l_grant_reserve_bytes),
-		     (void *)((unsigned long)log->l_grant_write_cycle),
-		     (void *)((unsigned long)log->l_grant_write_bytes),
-		     (void *)((unsigned long)log->l_curr_cycle),
-		     (void *)((unsigned long)log->l_curr_block),
-		     (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)),
-		     (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)),
-		     (void *)string,
-		     (void *)((unsigned long)tic->t_trans_type),
-		     (void *)cnts,
-		     (void *)((unsigned long)tic->t_curr_res),
-		     (void *)((unsigned long)tic->t_unit_res));
-}
-
-void
-xlog_trace_iclog_alloc(xlog_in_core_t *iclog)
-{
-	iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS);
-}
-
-void
-xlog_trace_iclog_dealloc(xlog_in_core_t *iclog)
-{
-	ktrace_free(iclog->ic_trace);
-}
-
-void
-xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
-{
-	ktrace_enter(iclog->ic_trace,
-		     (void *)((unsigned long)state),
-		     (void *)((unsigned long)current_pid()),
-		     (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL,
-		     (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL,
-		     (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL,
-		     (void *)NULL, (void *)NULL);
-}
-#else
-
-#define	xlog_trace_loggrant_alloc(log)
-#define	xlog_trace_loggrant_dealloc(log)
-#define	xlog_trace_loggrant(log,tic,string)
-
-#define	xlog_trace_iclog_alloc(iclog)
-#define	xlog_trace_iclog_dealloc(iclog)
-#define	xlog_trace_iclog(iclog,state)
-
-#endif /* XFS_LOG_TRACE */
-
 
 static void
 xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
@@ -353,15 +275,17 @@ xfs_log_done(xfs_mount_t	*mp,
 
 	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
 	    (flags & XFS_LOG_REL_PERM_RESERV)) {
+		trace_xfs_log_done_nonperm(log, ticket);
+
 		/*
 		 * Release ticket if not permanent reservation or a specific
 		 * request has been made to release a permanent reservation.
 		 */
-		xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
 		xlog_ungrant_log_space(log, ticket);
 		xfs_log_ticket_put(ticket);
 	} else {
-		xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
+		trace_xfs_log_done_perm(log, ticket);
+
 		xlog_regrant_reserve_log_space(log, ticket);
 		/* If this ticket was a permanent reservation and we aren't
 		 * trying to release it, reset the inited flags; so next time
@@ -505,10 +429,13 @@ xfs_log_reserve(xfs_mount_t	 *mp,
 
 	XFS_STATS_INC(xs_try_logspace);
 
+
 	if (*ticket != NULL) {
 		ASSERT(flags & XFS_LOG_PERM_RESERV);
 		internal_ticket = (xlog_ticket_t *)*ticket;
-		xlog_trace_loggrant(log, internal_ticket, "xfs_log_reserve: existing ticket (permanent trans)");
+
+		trace_xfs_log_reserve(log, internal_ticket);
+
 		xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
 		retval = xlog_regrant_write_log_space(log, internal_ticket);
 	} else {
@@ -519,10 +446,9 @@ xfs_log_reserve(xfs_mount_t	 *mp,
 			return XFS_ERROR(ENOMEM);
 		internal_ticket->t_trans_type = t_type;
 		*ticket = internal_ticket;
-		xlog_trace_loggrant(log, internal_ticket, 
-			(internal_ticket->t_flags & XLOG_TIC_PERM_RESERV) ?
-			"xfs_log_reserve: create new ticket (permanent trans)" :
-			"xfs_log_reserve: create new ticket");
+
+		trace_xfs_log_reserve(log, internal_ticket);
+
 		xlog_grant_push_ail(mp,
 				    (internal_ticket->t_unit_res *
 				     internal_ticket->t_cnt));
@@ -734,7 +660,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 			spin_unlock(&log->l_icloglock);
 		}
 		if (tic) {
-			xlog_trace_loggrant(log, tic, "unmount rec");
+			trace_xfs_log_umount_write(log, tic);
 			xlog_ungrant_log_space(log, tic);
 			xfs_log_ticket_put(tic);
 		}
@@ -1030,7 +956,6 @@ xlog_iodone(xfs_buf_t *bp)
 		xfs_fs_cmn_err(CE_WARN, l->l_mp,
 				"xlog_iodone: Barriers are no longer supported"
 				" by device. Disabling barriers\n");
-		xfs_buftrace("XLOG_IODONE BARRIERS OFF", bp);
 	}
 
 	/*
@@ -1085,13 +1010,10 @@ xlog_bdstrat_cb(struct xfs_buf *bp)
 		return 0;
 	}
 
-	xfs_buftrace("XLOG__BDSTRAT IOERROR", bp);
 	XFS_BUF_ERROR(bp, EIO);
 	XFS_BUF_STALE(bp);
 	xfs_biodone(bp);
 	return XFS_ERROR(EIO);
-
-
 }
 
 /*
@@ -1246,7 +1168,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	spin_lock_init(&log->l_grant_lock);
 	sv_init(&log->l_flush_wait, 0, "flush_wait");
 
-	xlog_trace_loggrant_alloc(log);
 	/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
 	ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
 
@@ -1305,8 +1226,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
 		sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
 
-		xlog_trace_iclog_alloc(iclog);
-
 		iclogp = &iclog->ic_next;
 	}
 	*iclogp = log->l_iclog;			/* complete ring */
@@ -1321,13 +1240,11 @@ out_free_iclog:
 			sv_destroy(&iclog->ic_force_wait);
 			sv_destroy(&iclog->ic_write_wait);
 			xfs_buf_free(iclog->ic_bp);
-			xlog_trace_iclog_dealloc(iclog);
 		}
 		kmem_free(iclog);
 	}
 	spinlock_destroy(&log->l_icloglock);
 	spinlock_destroy(&log->l_grant_lock);
-	xlog_trace_loggrant_dealloc(log);
 	xfs_buf_free(log->l_xbuf);
 out_free_log:
 	kmem_free(log);
@@ -1607,7 +1524,6 @@ xlog_dealloc_log(xlog_t *log)
 		sv_destroy(&iclog->ic_force_wait);
 		sv_destroy(&iclog->ic_write_wait);
 		xfs_buf_free(iclog->ic_bp);
-		xlog_trace_iclog_dealloc(iclog);
 		next_iclog = iclog->ic_next;
 		kmem_free(iclog);
 		iclog = next_iclog;
@@ -1616,7 +1532,6 @@ xlog_dealloc_log(xlog_t *log)
 	spinlock_destroy(&log->l_grant_lock);
 
 	xfs_buf_free(log->l_xbuf);
-	xlog_trace_loggrant_dealloc(log);
 	log->l_mp->m_log = NULL;
 	kmem_free(log);
 }	/* xlog_dealloc_log */
@@ -2414,7 +2329,6 @@ restart:
 
 	iclog = log->l_iclog;
 	if (iclog->ic_state != XLOG_STATE_ACTIVE) {
-		xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH);
 		XFS_STATS_INC(xs_log_noiclogs);
 
 		/* Wait for log writes to have flushed */
@@ -2520,13 +2434,15 @@ xlog_grant_log_space(xlog_t	   *log,
 
 	/* Is there space or do we need to sleep? */
 	spin_lock(&log->l_grant_lock);
-	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter");
+
+	trace_xfs_log_grant_enter(log, tic);
 
 	/* something is already sleeping; insert new transaction at end */
 	if (log->l_reserve_headq) {
 		xlog_ins_ticketq(&log->l_reserve_headq, tic);
-		xlog_trace_loggrant(log, tic,
-				    "xlog_grant_log_space: sleep 1");
+
+		trace_xfs_log_grant_sleep1(log, tic);
+
 		/*
 		 * Gotta check this before going to sleep, while we're
 		 * holding the grant lock.
@@ -2540,8 +2456,7 @@ xlog_grant_log_space(xlog_t	   *log,
 		 * If we got an error, and the filesystem is shutting down,
 		 * we'll catch it down below. So just continue...
 		 */
-		xlog_trace_loggrant(log, tic,
-				    "xlog_grant_log_space: wake 1");
+		trace_xfs_log_grant_wake1(log, tic);
 		spin_lock(&log->l_grant_lock);
 	}
 	if (tic->t_flags & XFS_LOG_PERM_RESERV)
@@ -2558,8 +2473,9 @@ redo:
 	if (free_bytes < need_bytes) {
 		if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
 			xlog_ins_ticketq(&log->l_reserve_headq, tic);
-		xlog_trace_loggrant(log, tic,
-				    "xlog_grant_log_space: sleep 2");
+
+		trace_xfs_log_grant_sleep2(log, tic);
+
 		spin_unlock(&log->l_grant_lock);
 		xlog_grant_push_ail(log->l_mp, need_bytes);
 		spin_lock(&log->l_grant_lock);
@@ -2571,8 +2487,8 @@ redo:
 		if (XLOG_FORCED_SHUTDOWN(log))
 			goto error_return;
 
-		xlog_trace_loggrant(log, tic,
-				    "xlog_grant_log_space: wake 2");
+		trace_xfs_log_grant_wake2(log, tic);
+
 		goto redo;
 	} else if (tic->t_flags & XLOG_TIC_IN_Q)
 		xlog_del_ticketq(&log->l_reserve_headq, tic);
@@ -2592,7 +2508,7 @@ redo:
 		ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
 	}
 #endif
-	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit");
+	trace_xfs_log_grant_exit(log, tic);
 	xlog_verify_grant_head(log, 1);
 	spin_unlock(&log->l_grant_lock);
 	return 0;
@@ -2600,7 +2516,9 @@ redo:
  error_return:
 	if (tic->t_flags & XLOG_TIC_IN_Q)
 		xlog_del_ticketq(&log->l_reserve_headq, tic);
-	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret");
+
+	trace_xfs_log_grant_error(log, tic);
+
 	/*
 	 * If we are failing, make sure the ticket doesn't have any
 	 * current reservations. We don't want to add this back when
@@ -2640,7 +2558,8 @@ xlog_regrant_write_log_space(xlog_t	   *log,
 #endif
 
 	spin_lock(&log->l_grant_lock);
-	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter");
+
+	trace_xfs_log_regrant_write_enter(log, tic);
 
 	if (XLOG_FORCED_SHUTDOWN(log))
 		goto error_return;
@@ -2669,8 +2588,8 @@ xlog_regrant_write_log_space(xlog_t	   *log,
 			if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
 				xlog_ins_ticketq(&log->l_write_headq, tic);
 
-			xlog_trace_loggrant(log, tic,
-				    "xlog_regrant_write_log_space: sleep 1");
+			trace_xfs_log_regrant_write_sleep1(log, tic);
+
 			spin_unlock(&log->l_grant_lock);
 			xlog_grant_push_ail(log->l_mp, need_bytes);
 			spin_lock(&log->l_grant_lock);
@@ -2685,8 +2604,7 @@ xlog_regrant_write_log_space(xlog_t	   *log,
 			if (XLOG_FORCED_SHUTDOWN(log))
 				goto error_return;
 
-			xlog_trace_loggrant(log, tic,
-				    "xlog_regrant_write_log_space: wake 1");
+			trace_xfs_log_regrant_write_wake1(log, tic);
 		}
 	}
 
@@ -2704,6 +2622,8 @@ redo:
 		spin_lock(&log->l_grant_lock);
 
 		XFS_STATS_INC(xs_sleep_logspace);
+		trace_xfs_log_regrant_write_sleep2(log, tic);
+
 		sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
 
 		/* If we're shutting down, this tic is already off the queue */
@@ -2711,8 +2631,7 @@ redo:
 		if (XLOG_FORCED_SHUTDOWN(log))
 			goto error_return;
 
-		xlog_trace_loggrant(log, tic,
-				    "xlog_regrant_write_log_space: wake 2");
+		trace_xfs_log_regrant_write_wake2(log, tic);
 		goto redo;
 	} else if (tic->t_flags & XLOG_TIC_IN_Q)
 		xlog_del_ticketq(&log->l_write_headq, tic);
@@ -2727,7 +2646,8 @@ redo:
 	}
 #endif
 
-	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit");
+	trace_xfs_log_regrant_write_exit(log, tic);
+
 	xlog_verify_grant_head(log, 1);
 	spin_unlock(&log->l_grant_lock);
 	return 0;
@@ -2736,7 +2656,9 @@ redo:
  error_return:
 	if (tic->t_flags & XLOG_TIC_IN_Q)
 		xlog_del_ticketq(&log->l_reserve_headq, tic);
-	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret");
+
+	trace_xfs_log_regrant_write_error(log, tic);
+
 	/*
 	 * If we are failing, make sure the ticket doesn't have any
 	 * current reservations. We don't want to add this back when
@@ -2760,8 +2682,8 @@ STATIC void
 xlog_regrant_reserve_log_space(xlog_t	     *log,
 			       xlog_ticket_t *ticket)
 {
-	xlog_trace_loggrant(log, ticket,
-			    "xlog_regrant_reserve_log_space: enter");
+	trace_xfs_log_regrant_reserve_enter(log, ticket);
+
 	if (ticket->t_cnt > 0)
 		ticket->t_cnt--;
 
@@ -2769,8 +2691,9 @@ xlog_regrant_reserve_log_space(xlog_t	     *log,
 	xlog_grant_sub_space(log, ticket->t_curr_res);
 	ticket->t_curr_res = ticket->t_unit_res;
 	xlog_tic_reset_res(ticket);
-	xlog_trace_loggrant(log, ticket,
-			    "xlog_regrant_reserve_log_space: sub current res");
+
+	trace_xfs_log_regrant_reserve_sub(log, ticket);
+
 	xlog_verify_grant_head(log, 1);
 
 	/* just return if we still have some of the pre-reserved space */
@@ -2780,8 +2703,9 @@ xlog_regrant_reserve_log_space(xlog_t	     *log,
 	}
 
 	xlog_grant_add_space_reserve(log, ticket->t_unit_res);
-	xlog_trace_loggrant(log, ticket,
-			    "xlog_regrant_reserve_log_space: exit");
+
+	trace_xfs_log_regrant_reserve_exit(log, ticket);
+
 	xlog_verify_grant_head(log, 0);
 	spin_unlock(&log->l_grant_lock);
 	ticket->t_curr_res = ticket->t_unit_res;
@@ -2811,11 +2735,11 @@ xlog_ungrant_log_space(xlog_t	     *log,
 		ticket->t_cnt--;
 
 	spin_lock(&log->l_grant_lock);
-	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter");
+	trace_xfs_log_ungrant_enter(log, ticket);
 
 	xlog_grant_sub_space(log, ticket->t_curr_res);
 
-	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current");
+	trace_xfs_log_ungrant_sub(log, ticket);
 
 	/* If this is a permanent reservation ticket, we may be able to free
 	 * up more space based on the remaining count.
@@ -2825,7 +2749,8 @@ xlog_ungrant_log_space(xlog_t	     *log,
 		xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt);
 	}
 
-	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit");
+	trace_xfs_log_ungrant_exit(log, ticket);
+
 	xlog_verify_grant_head(log, 1);
 	spin_unlock(&log->l_grant_lock);
 	xfs_log_move_tail(log->l_mp, 1);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 679c7c4926a..d55662db707 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -19,7 +19,6 @@
 #define __XFS_LOG_PRIV_H__
 
 struct xfs_buf;
-struct ktrace;
 struct log;
 struct xlog_ticket;
 struct xfs_buf_cancel;
@@ -135,6 +134,12 @@ static inline uint xlog_get_client_id(__be32 i)
 #define XLOG_TIC_INITED		0x1	/* has been initialized */
 #define XLOG_TIC_PERM_RESERV	0x2	/* permanent reservation */
 #define XLOG_TIC_IN_Q		0x4
+
+#define XLOG_TIC_FLAGS \
+	{ XLOG_TIC_INITED,	"XLOG_TIC_INITED" }, \
+	{ XLOG_TIC_PERM_RESERV,	"XLOG_TIC_PERM_RESERV" }, \
+	{ XLOG_TIC_IN_Q,	"XLOG_TIC_IN_Q" }
+
 #endif	/* __KERNEL__ */
 
 #define XLOG_UNMOUNT_TYPE	0x556e	/* Un for Unmount */
@@ -361,9 +366,6 @@ typedef struct xlog_in_core {
 	int			ic_bwritecnt;
 	unsigned short		ic_state;
 	char			*ic_datap;	/* pointer to iclog data */
-#ifdef XFS_LOG_TRACE
-	struct ktrace		*ic_trace;
-#endif
 
 	/* Callback structures need their own cacheline */
 	spinlock_t		ic_callback_lock ____cacheline_aligned_in_smp;
@@ -429,10 +431,6 @@ typedef struct log {
 	int			l_grant_write_cycle;
 	int			l_grant_write_bytes;
 
-#ifdef XFS_LOG_TRACE
-	struct ktrace		*l_grant_trace;
-#endif
-
 	/* The following field are used for debugging; need to hold icloglock */
 #ifdef DEBUG
 	char			*l_iclog_bak[XLOG_MAX_ICLOGS];
@@ -456,12 +454,6 @@ extern void	 xlog_put_bp(struct xfs_buf *);
 
 extern kmem_zone_t	*xfs_log_ticket_zone;
 
-/* iclog tracing */
-#define XLOG_TRACE_GRAB_FLUSH  1
-#define XLOG_TRACE_REL_FLUSH   2
-#define XLOG_TRACE_SLEEP_FLUSH 3
-#define XLOG_TRACE_WAKE_FLUSH  4
-
 /*
  * Unmount record type is used as a pseudo transaction type for the ticket.
  * It's value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index fb17f8226b0..69ac2e5ef20 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -46,6 +46,7 @@
 #include "xfs_quota.h"
 #include "xfs_rw.h"
 #include "xfs_utils.h"
+#include "xfs_trace.h"
 
 STATIC int	xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
 STATIC int	xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
@@ -225,16 +226,10 @@ xlog_header_check_dump(
 	xfs_mount_t		*mp,
 	xlog_rec_header_t	*head)
 {
-	int			b;
-
-	cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __func__);
-	for (b = 0; b < 16; b++)
-		cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]);
-	cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
-	cmn_err(CE_DEBUG, "    log : uuid = ");
-	for (b = 0; b < 16; b++)
-		cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]);
-	cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
+	cmn_err(CE_DEBUG, "%s:  SB : uuid = %pU, fmt = %d\n",
+		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
+	cmn_err(CE_DEBUG, "    log : uuid = %pU, fmt = %d\n",
+		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
 }
 #else
 #define xlog_header_check_dump(mp, head)
@@ -2206,6 +2201,7 @@ xlog_recover_do_buffer_trans(
 	xfs_daddr_t		blkno;
 	int			len;
 	ushort			flags;
+	uint			buf_flags;
 
 	buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
 
@@ -2246,12 +2242,11 @@ xlog_recover_do_buffer_trans(
 	}
 
 	mp = log->l_mp;
-	if (flags & XFS_BLI_INODE_BUF) {
-		bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len,
-								XFS_BUF_LOCK);
-	} else {
-		bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0);
-	}
+	buf_flags = XFS_BUF_LOCK;
+	if (!(flags & XFS_BLI_INODE_BUF))
+		buf_flags |= XFS_BUF_MAPPED;
+
+	bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
 	if (XFS_BUF_ISERROR(bp)) {
 		xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
 				  bp, blkno);
@@ -2350,8 +2345,8 @@ xlog_recover_do_inode_trans(
 		goto error;
 	}
 
-	bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno,
-				in_f->ilf_len, XFS_BUF_LOCK);
+	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
+			  XFS_BUF_LOCK);
 	if (XFS_BUF_ISERROR(bp)) {
 		xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
 				  bp, in_f->ilf_blkno);
@@ -3517,7 +3512,7 @@ xlog_do_recovery_pass(
 {
 	xlog_rec_header_t	*rhead;
 	xfs_daddr_t		blk_no;
-	xfs_caddr_t		bufaddr, offset;
+	xfs_caddr_t		offset;
 	xfs_buf_t		*hbp, *dbp;
 	int			error = 0, h_size;
 	int			bblks, split_bblks;
@@ -3610,7 +3605,7 @@ xlog_do_recovery_pass(
 			/*
 			 * Check for header wrapping around physical end-of-log
 			 */
-			offset = NULL;
+			offset = XFS_BUF_PTR(hbp);
 			split_hblks = 0;
 			wrapped_hblks = 0;
 			if (blk_no + hblks <= log->l_logBBsize) {
@@ -3646,9 +3641,8 @@ xlog_do_recovery_pass(
 				 *   - order is important.
 				 */
 				wrapped_hblks = hblks - split_hblks;
-				bufaddr = XFS_BUF_PTR(hbp);
 				error = XFS_BUF_SET_PTR(hbp,
-						bufaddr + BBTOB(split_hblks),
+						offset + BBTOB(split_hblks),
 						BBTOB(hblks - split_hblks));
 				if (error)
 					goto bread_err2;
@@ -3658,14 +3652,10 @@ xlog_do_recovery_pass(
 				if (error)
 					goto bread_err2;
 
-				error = XFS_BUF_SET_PTR(hbp, bufaddr,
+				error = XFS_BUF_SET_PTR(hbp, offset,
 							BBTOB(hblks));
 				if (error)
 					goto bread_err2;
-
-				if (!offset)
-					offset = xlog_align(log, 0,
-							wrapped_hblks, hbp);
 			}
 			rhead = (xlog_rec_header_t *)offset;
 			error = xlog_valid_rec_header(log, rhead,
@@ -3685,7 +3675,7 @@ xlog_do_recovery_pass(
 			} else {
 				/* This log record is split across the
 				 * physical end of log */
-				offset = NULL;
+				offset = XFS_BUF_PTR(dbp);
 				split_bblks = 0;
 				if (blk_no != log->l_logBBsize) {
 					/* some data is before the physical
@@ -3714,9 +3704,8 @@ xlog_do_recovery_pass(
 				 *   _first_, then the log start (LR header end)
 				 *   - order is important.
 				 */
-				bufaddr = XFS_BUF_PTR(dbp);
 				error = XFS_BUF_SET_PTR(dbp,
-						bufaddr + BBTOB(split_bblks),
+						offset + BBTOB(split_bblks),
 						BBTOB(bblks - split_bblks));
 				if (error)
 					goto bread_err2;
@@ -3727,13 +3716,9 @@ xlog_do_recovery_pass(
 				if (error)
 					goto bread_err2;
 
-				error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
+				error = XFS_BUF_SET_PTR(dbp, offset, h_size);
 				if (error)
 					goto bread_err2;
-
-				if (!offset)
-					offset = xlog_align(log, wrapped_hblks,
-						bblks - split_bblks, dbp);
 			}
 			xlog_unpack_data(rhead, offset, log);
 			if ((error = xlog_recover_process_data(log, rhash,
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 8b6c9e807ef..eb403b40e12 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -44,6 +44,8 @@
 #include "xfs_quota.h"
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
+#include "xfs_trace.h"
+
 
 STATIC void	xfs_unmountfs_wait(xfs_mount_t *);
 
@@ -583,8 +585,8 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
 	extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
 
-	bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
-				BTOBB(sector_size), extra_flags);
+	bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
+			  extra_flags);
 	if (!bp || XFS_BUF_ISERROR(bp)) {
 		xfs_fs_mount_cmn_err(flags, "SB read failed");
 		error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
@@ -624,8 +626,8 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 		XFS_BUF_UNMANAGE(bp);
 		xfs_buf_relse(bp);
 		sector_size = mp->m_sb.sb_sectsize;
-		bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
-					BTOBB(sector_size), extra_flags);
+		bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR,
+				  BTOBB(sector_size), extra_flags);
 		if (!bp || XFS_BUF_ISERROR(bp)) {
 			xfs_fs_mount_cmn_err(flags, "SB re-read failed");
 			error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
@@ -1471,7 +1473,7 @@ xfs_log_sbcount(
 	if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
 		return 0;
 
-	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
+	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
 	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
 					XFS_DEFAULT_LOG_COUNT);
 	if (error) {
@@ -2123,7 +2125,7 @@ xfs_icsb_destroy_counters(
 	mutex_destroy(&mp->m_icsb_mutex);
 }
 
-STATIC_INLINE void
+STATIC void
 xfs_icsb_lock_cntr(
 	xfs_icsb_cnts_t	*icsbp)
 {
@@ -2132,7 +2134,7 @@ xfs_icsb_lock_cntr(
 	}
 }
 
-STATIC_INLINE void
+STATIC void
 xfs_icsb_unlock_cntr(
 	xfs_icsb_cnts_t	*icsbp)
 {
@@ -2140,7 +2142,7 @@ xfs_icsb_unlock_cntr(
 }
 
 
-STATIC_INLINE void
+STATIC void
 xfs_icsb_lock_all_counters(
 	xfs_mount_t	*mp)
 {
@@ -2153,7 +2155,7 @@ xfs_icsb_lock_all_counters(
 	}
 }
 
-STATIC_INLINE void
+STATIC void
 xfs_icsb_unlock_all_counters(
 	xfs_mount_t	*mp)
 {
@@ -2389,12 +2391,12 @@ xfs_icsb_modify_counters(
 {
 	xfs_icsb_cnts_t	*icsbp;
 	long long	lcounter;	/* long counter for 64 bit fields */
-	int		cpu, ret = 0;
+	int		ret = 0;
 
 	might_sleep();
 again:
-	cpu = get_cpu();
-	icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);
+	preempt_disable();
+	icsbp = this_cpu_ptr(mp->m_sb_cnts);
 
 	/*
 	 * if the counter is disabled, go to slow path
@@ -2438,11 +2440,11 @@ again:
 		break;
 	}
 	xfs_icsb_unlock_cntr(icsbp);
-	put_cpu();
+	preempt_enable();
 	return 0;
 
 slow_path:
-	put_cpu();
+	preempt_enable();
 
 	/*
 	 * serialise with a mutex so we don't burn lots of cpu on
@@ -2490,7 +2492,7 @@ slow_path:
 
 balance_counter:
 	xfs_icsb_unlock_cntr(icsbp);
-	put_cpu();
+	preempt_enable();
 
 	/*
 	 * We may have multiple threads here if multiple per-cpu
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a6c023bc0fb..1df7e450296 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -93,6 +93,9 @@ typedef struct xfs_dmops {
 	xfs_send_unmount_t	xfs_send_unmount;
 } xfs_dmops_t;
 
+#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \
+	(((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 0 : DM_FLAGS_UNWANTED)
+
 #define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
 	(*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
 #define XFS_SEND_MMAP(mp, vma,fl) \
@@ -101,12 +104,24 @@ typedef struct xfs_dmops {
 	(*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
 #define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
 	(*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
-#define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
-	(*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl)
 #define XFS_SEND_MOUNT(mp,right,path,name) \
 	(*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
-#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \
-	(*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
+#define XFS_SEND_PREUNMOUNT(mp) \
+do { \
+	if (mp->m_flags & XFS_MOUNT_DMAPI) { \
+		(*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \
+			(mp)->m_rootip, DM_RIGHT_NULL, \
+			(mp)->m_rootip, DM_RIGHT_NULL, \
+			NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
+	} \
+} while (0)
+#define XFS_SEND_UNMOUNT(mp) \
+do { \
+	if (mp->m_flags & XFS_MOUNT_DMAPI) { \
+		(*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \
+			DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
+	} \
+} while (0)
 
 
 #ifdef HAVE_PERCPU_SB
@@ -387,13 +402,13 @@ xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
  * Per-cpu superblock locking functions
  */
 #ifdef HAVE_PERCPU_SB
-STATIC_INLINE void
+static inline void
 xfs_icsb_lock(xfs_mount_t *mp)
 {
 	mutex_lock(&mp->m_icsb_mutex);
 }
 
-STATIC_INLINE void
+static inline void
 xfs_icsb_unlock(xfs_mount_t *mp)
 {
 	mutex_unlock(&mp->m_icsb_mutex);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 3ec91ac74c2..91bfd60f4c7 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -92,6 +92,14 @@ typedef struct xfs_dqblk {
 
 #define XFS_DQ_ALLTYPES		(XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
 
+#define XFS_DQ_FLAGS \
+	{ XFS_DQ_USER,		"USER" }, \
+	{ XFS_DQ_PROJ,		"PROJ" }, \
+	{ XFS_DQ_GROUP,		"GROUP" }, \
+	{ XFS_DQ_DIRTY,		"DIRTY" }, \
+	{ XFS_DQ_WANT,		"WANT" }, \
+	{ XFS_DQ_INACTIVE,	"INACTIVE" }
+
 /*
  * In the worst case, when both user and group quotas are on,
  * we can have a max of three dquots changing in a single transaction.
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index b81deea0ce1..fc1cda23b81 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -39,6 +39,7 @@
 #include "xfs_utils.h"
 #include "xfs_trans_space.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 
 /*
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 385f6dceba5..9e15a118536 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -45,6 +45,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_trans_space.h"
 #include "xfs_utils.h"
+#include "xfs_trace.h"
 
 
 /*
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 3f816ad7ff1..5aa07caea5f 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -44,6 +44,7 @@
 #include "xfs_error.h"
 #include "xfs_buf_item.h"
 #include "xfs_rw.h"
+#include "xfs_trace.h"
 
 /*
  * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
@@ -171,7 +172,6 @@ xfs_bioerror(
 	 * No need to wait until the buffer is unpinned.
 	 * We aren't flushing it.
 	 */
-	xfs_buftrace("XFS IOERROR", bp);
 	XFS_BUF_ERROR(bp, EIO);
 	/*
 	 * We're calling biodone, so delete B_DONE flag. Either way
@@ -205,7 +205,6 @@ xfs_bioerror_relse(
 	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
 	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
 
-	xfs_buftrace("XFS IOERRELSE", bp);
 	fl = XFS_BUF_BFLAGS(bp);
 	/*
 	 * No need to wait until the buffer is unpinned.
@@ -277,10 +276,10 @@ xfs_read_buf(
 	xfs_buf_t	 *bp;
 	int		 error;
 
-	if (flags)
-		bp = xfs_buf_read_flags(target, blkno, len, flags);
-	else
-		bp = xfs_buf_read(target, blkno, len, flags);
+	if (!flags)
+		flags = XBF_LOCK | XBF_MAPPED;
+
+	bp = xfs_buf_read(target, blkno, len, flags);
 	if (!bp)
 		return XFS_ERROR(EIO);
 	error = XFS_BUF_GETERROR(bp);
@@ -336,3 +335,25 @@ xfs_bwrite(
 	}
 	return (error);
 }
+
+/*
+ * helper function to extract extent size hint from inode
+ */
+xfs_extlen_t
+xfs_get_extsz_hint(
+	struct xfs_inode	*ip)
+{
+	xfs_extlen_t		extsz;
+
+	if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
+		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
+				? ip->i_d.di_extsize
+				: ip->i_mount->m_sb.sb_rextsize;
+		ASSERT(extsz);
+	} else {
+		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
+				? ip->i_d.di_extsize : 0;
+	}
+
+	return extsz;
+}
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f5e4874c37d..571f2174435 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -37,34 +37,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 }
 
 /*
- * Flags for xfs_free_eofblocks
- */
-#define XFS_FREE_EOF_LOCK	(1<<0)
-#define XFS_FREE_EOF_NOLOCK	(1<<1)
-
-
-/*
- * helper function to extract extent size hint from inode
- */
-STATIC_INLINE xfs_extlen_t
-xfs_get_extsz_hint(
-	xfs_inode_t	*ip)
-{
-	xfs_extlen_t	extsz;
-
-	if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
-		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
-				? ip->i_d.di_extsize
-				: ip->i_mount->m_sb.sb_rextsize;
-		ASSERT(extsz);
-	} else {
-		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
-				? ip->i_d.di_extsize : 0;
-	}
-	return extsz;
-}
-
-/*
  * Prototypes for functions in xfs_rw.c.
  */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
@@ -76,5 +48,6 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 			struct xfs_buf **bpp);
 extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
 				xfs_buf_t *bp, xfs_daddr_t blkno);
+extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
 
 #endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 66b849358e6..237badcbac3 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -236,19 +236,20 @@ xfs_trans_alloc(
 	uint		type)
 {
 	xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
-	return _xfs_trans_alloc(mp, type);
+	return _xfs_trans_alloc(mp, type, KM_SLEEP);
 }
 
 xfs_trans_t *
 _xfs_trans_alloc(
 	xfs_mount_t	*mp,
-	uint		type)
+	uint		type,
+	uint		memflags)
 {
 	xfs_trans_t	*tp;
 
 	atomic_inc(&mp->m_active_trans);
 
-	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+	tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
 	tp->t_magic = XFS_TRANS_MAGIC;
 	tp->t_type = type;
 	tp->t_mountp = mp;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ed47fc77759..ca64f33c63a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -100,6 +100,49 @@ typedef struct xfs_trans_header {
 #define	XFS_TRANS_TYPE_MAX		41
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
+#define XFS_TRANS_TYPES \
+	{ XFS_TRANS_SETATTR_NOT_SIZE,	"SETATTR_NOT_SIZE" }, \
+	{ XFS_TRANS_SETATTR_SIZE,	"SETATTR_SIZE" }, \
+	{ XFS_TRANS_INACTIVE,		"INACTIVE" }, \
+	{ XFS_TRANS_CREATE,		"CREATE" }, \
+	{ XFS_TRANS_CREATE_TRUNC,	"CREATE_TRUNC" }, \
+	{ XFS_TRANS_TRUNCATE_FILE,	"TRUNCATE_FILE" }, \
+	{ XFS_TRANS_REMOVE,		"REMOVE" }, \
+	{ XFS_TRANS_LINK,		"LINK" }, \
+	{ XFS_TRANS_RENAME,		"RENAME" }, \
+	{ XFS_TRANS_MKDIR,		"MKDIR" }, \
+	{ XFS_TRANS_RMDIR,		"RMDIR" }, \
+	{ XFS_TRANS_SYMLINK,		"SYMLINK" }, \
+	{ XFS_TRANS_SET_DMATTRS,	"SET_DMATTRS" }, \
+	{ XFS_TRANS_GROWFS,		"GROWFS" }, \
+	{ XFS_TRANS_STRAT_WRITE,	"STRAT_WRITE" }, \
+	{ XFS_TRANS_DIOSTRAT,		"DIOSTRAT" }, \
+	{ XFS_TRANS_WRITEID,		"WRITEID" }, \
+	{ XFS_TRANS_ADDAFORK,		"ADDAFORK" }, \
+	{ XFS_TRANS_ATTRINVAL,		"ATTRINVAL" }, \
+	{ XFS_TRANS_ATRUNCATE,		"ATRUNCATE" }, \
+	{ XFS_TRANS_ATTR_SET,		"ATTR_SET" }, \
+	{ XFS_TRANS_ATTR_RM,		"ATTR_RM" }, \
+	{ XFS_TRANS_ATTR_FLAG,		"ATTR_FLAG" }, \
+	{ XFS_TRANS_CLEAR_AGI_BUCKET,	"CLEAR_AGI_BUCKET" }, \
+	{ XFS_TRANS_QM_SBCHANGE,	"QM_SBCHANGE" }, \
+	{ XFS_TRANS_QM_QUOTAOFF,	"QM_QUOTAOFF" }, \
+	{ XFS_TRANS_QM_DQALLOC,		"QM_DQALLOC" }, \
+	{ XFS_TRANS_QM_SETQLIM,		"QM_SETQLIM" }, \
+	{ XFS_TRANS_QM_DQCLUSTER,	"QM_DQCLUSTER" }, \
+	{ XFS_TRANS_QM_QINOCREATE,	"QM_QINOCREATE" }, \
+	{ XFS_TRANS_QM_QUOTAOFF_END,	"QM_QOFF_END" }, \
+	{ XFS_TRANS_SB_UNIT,		"SB_UNIT" }, \
+	{ XFS_TRANS_FSYNC_TS,		"FSYNC_TS" }, \
+	{ XFS_TRANS_GROWFSRT_ALLOC,	"GROWFSRT_ALLOC" }, \
+	{ XFS_TRANS_GROWFSRT_ZERO,	"GROWFSRT_ZERO" }, \
+	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
+	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
+	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
+	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
+	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
+	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
+
 /*
  * This structure is used to track log items associated with
  * a transaction.  It points to the log item and keeps some
@@ -782,6 +825,10 @@ typedef struct xfs_log_item {
 #define	XFS_LI_IN_AIL	0x1
 #define XFS_LI_ABORTED	0x2
 
+#define XFS_LI_FLAGS \
+	{ XFS_LI_IN_AIL,	"IN_AIL" }, \
+	{ XFS_LI_ABORTED,	"ABORTED" }
+
 typedef struct xfs_item_ops {
 	uint (*iop_size)(xfs_log_item_t *);
 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
@@ -924,7 +971,7 @@ typedef struct xfs_trans {
  * XFS transaction mechanism exported interfaces.
  */
 xfs_trans_t	*xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint, uint);
 xfs_trans_t	*xfs_trans_dup(xfs_trans_t *);
 int		xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
 				  uint, uint);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 218829e6a15..49130628d5e 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -38,6 +38,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
+#include "xfs_trace.h"
 
 
 STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
@@ -79,11 +80,8 @@ xfs_trans_get_buf(xfs_trans_t	*tp,
 	/*
 	 * Default to a normal get_buf() call if the tp is NULL.
 	 */
-	if (tp == NULL) {
-		bp = xfs_buf_get_flags(target_dev, blkno, len,
-							flags | BUF_BUSY);
-		return(bp);
-	}
+	if (tp == NULL)
+		return xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY);
 
 	/*
 	 * If we find the buffer in the cache with this transaction
@@ -98,26 +96,23 @@ xfs_trans_get_buf(xfs_trans_t	*tp,
 	}
 	if (bp != NULL) {
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
-		if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
-			xfs_buftrace("TRANS GET RECUR SHUT", bp);
+		if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
 			XFS_BUF_SUPER_STALE(bp);
-		}
+
 		/*
 		 * If the buffer is stale then it was binval'ed
 		 * since last read.  This doesn't matter since the
 		 * caller isn't allowed to use the data anyway.
 		 */
-		else if (XFS_BUF_ISSTALE(bp)) {
-			xfs_buftrace("TRANS GET RECUR STALE", bp);
+		else if (XFS_BUF_ISSTALE(bp))
 			ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
-		}
+
 		ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
 		bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 		ASSERT(bip != NULL);
 		ASSERT(atomic_read(&bip->bli_refcount) > 0);
 		bip->bli_recur++;
-		xfs_buftrace("TRANS GET RECUR", bp);
-		xfs_buf_item_trace("GET RECUR", bip);
+		trace_xfs_trans_get_buf_recur(bip);
 		return (bp);
 	}
 
@@ -129,7 +124,7 @@ xfs_trans_get_buf(xfs_trans_t	*tp,
 	 * easily deadlock with our current transaction as well as cause
 	 * us to run out of stack space.
 	 */
-	bp = xfs_buf_get_flags(target_dev, blkno, len, flags | BUF_BUSY);
+	bp = xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY);
 	if (bp == NULL) {
 		return NULL;
 	}
@@ -169,8 +164,7 @@ xfs_trans_get_buf(xfs_trans_t	*tp,
 	 */
 	XFS_BUF_SET_FSPRIVATE2(bp, tp);
 
-	xfs_buftrace("TRANS GET", bp);
-	xfs_buf_item_trace("GET", bip);
+	trace_xfs_trans_get_buf(bip);
 	return (bp);
 }
 
@@ -210,7 +204,7 @@ xfs_trans_getsb(xfs_trans_t	*tp,
 		ASSERT(bip != NULL);
 		ASSERT(atomic_read(&bip->bli_refcount) > 0);
 		bip->bli_recur++;
-		xfs_buf_item_trace("GETSB RECUR", bip);
+		trace_xfs_trans_getsb_recur(bip);
 		return (bp);
 	}
 
@@ -252,7 +246,7 @@ xfs_trans_getsb(xfs_trans_t	*tp,
 	 */
 	XFS_BUF_SET_FSPRIVATE2(bp, tp);
 
-	xfs_buf_item_trace("GETSB", bip);
+	trace_xfs_trans_getsb(bip);
 	return (bp);
 }
 
@@ -302,7 +296,7 @@ xfs_trans_read_buf(
 	 * Default to a normal get_buf() call if the tp is NULL.
 	 */
 	if (tp == NULL) {
-		bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
+		bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY);
 		if (!bp)
 			return (flags & XFS_BUF_TRYLOCK) ?
 					EAGAIN : XFS_ERROR(ENOMEM);
@@ -350,7 +344,7 @@ xfs_trans_read_buf(
 		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
 		ASSERT((XFS_BUF_ISERROR(bp)) == 0);
 		if (!(XFS_BUF_ISDONE(bp))) {
-			xfs_buftrace("READ_BUF_INCORE !DONE", bp);
+			trace_xfs_trans_read_buf_io(bp, _RET_IP_);
 			ASSERT(!XFS_BUF_ISASYNC(bp));
 			XFS_BUF_READ(bp);
 			xfsbdstrat(tp->t_mountp, bp);
@@ -375,7 +369,7 @@ xfs_trans_read_buf(
 		 * brelse it either. Just get out.
 		 */
 		if (XFS_FORCED_SHUTDOWN(mp)) {
-			xfs_buftrace("READ_BUF_INCORE XFSSHUTDN", bp);
+			trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
 			*bpp = NULL;
 			return XFS_ERROR(EIO);
 		}
@@ -385,7 +379,7 @@ xfs_trans_read_buf(
 		bip->bli_recur++;
 
 		ASSERT(atomic_read(&bip->bli_refcount) > 0);
-		xfs_buf_item_trace("READ RECUR", bip);
+		trace_xfs_trans_read_buf_recur(bip);
 		*bpp = bp;
 		return 0;
 	}
@@ -398,14 +392,13 @@ xfs_trans_read_buf(
 	 * easily deadlock with our current transaction as well as cause
 	 * us to run out of stack space.
 	 */
-	bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
+	bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY);
 	if (bp == NULL) {
 		*bpp = NULL;
 		return 0;
 	}
 	if (XFS_BUF_GETERROR(bp) != 0) {
 	    XFS_BUF_SUPER_STALE(bp);
-		xfs_buftrace("READ ERROR", bp);
 		error = XFS_BUF_GETERROR(bp);
 
 		xfs_ioerror_alert("xfs_trans_read_buf", mp,
@@ -464,8 +457,7 @@ xfs_trans_read_buf(
 	 */
 	XFS_BUF_SET_FSPRIVATE2(bp, tp);
 
-	xfs_buftrace("TRANS READ", bp);
-	xfs_buf_item_trace("READ", bip);
+	trace_xfs_trans_read_buf(bip);
 	*bpp = bp;
 	return 0;
 
@@ -483,7 +475,7 @@ shutdown_abort:
 	ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) !=
 						(XFS_B_STALE|XFS_B_DELWRI));
 
-	xfs_buftrace("READ_BUF XFSSHUTDN", bp);
+	trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
 	xfs_buf_relse(bp);
 	*bpp = NULL;
 	return XFS_ERROR(EIO);
@@ -549,13 +541,14 @@ xfs_trans_brelse(xfs_trans_t	*tp,
 	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
 	ASSERT(lidp != NULL);
 
+	trace_xfs_trans_brelse(bip);
+
 	/*
 	 * If the release is just for a recursive lock,
 	 * then decrement the count and return.
 	 */
 	if (bip->bli_recur > 0) {
 		bip->bli_recur--;
-		xfs_buf_item_trace("RELSE RECUR", bip);
 		return;
 	}
 
@@ -563,10 +556,8 @@ xfs_trans_brelse(xfs_trans_t	*tp,
 	 * If the buffer is dirty within this transaction, we can't
 	 * release it until we commit.
 	 */
-	if (lidp->lid_flags & XFS_LID_DIRTY) {
-		xfs_buf_item_trace("RELSE DIRTY", bip);
+	if (lidp->lid_flags & XFS_LID_DIRTY)
 		return;
-	}
 
 	/*
 	 * If the buffer has been invalidated, then we can't release
@@ -574,13 +565,10 @@ xfs_trans_brelse(xfs_trans_t	*tp,
 	 * as part of this transaction.  This prevents us from pulling
 	 * the item from the AIL before we should.
 	 */
-	if (bip->bli_flags & XFS_BLI_STALE) {
-		xfs_buf_item_trace("RELSE STALE", bip);
+	if (bip->bli_flags & XFS_BLI_STALE)
 		return;
-	}
 
 	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
-	xfs_buf_item_trace("RELSE", bip);
 
 	/*
 	 * Free up the log item descriptor tracking the released item.
@@ -677,7 +665,7 @@ xfs_trans_bjoin(xfs_trans_t	*tp,
 	 */
 	XFS_BUF_SET_FSPRIVATE2(bp, tp);
 
-	xfs_buf_item_trace("BJOIN", bip);
+	trace_xfs_trans_bjoin(bip);
 }
 
 /*
@@ -701,7 +689,7 @@ xfs_trans_bhold(xfs_trans_t	*tp,
 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	bip->bli_flags |= XFS_BLI_HOLD;
-	xfs_buf_item_trace("BHOLD", bip);
+	trace_xfs_trans_bhold(bip);
 }
 
 /*
@@ -724,7 +712,8 @@ xfs_trans_bhold_release(xfs_trans_t	*tp,
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT(bip->bli_flags & XFS_BLI_HOLD);
 	bip->bli_flags &= ~XFS_BLI_HOLD;
-	xfs_buf_item_trace("BHOLD RELEASE", bip);
+
+	trace_xfs_trans_bhold_release(bip);
 }
 
 /*
@@ -770,6 +759,8 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 	XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
 	bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone;
 
+	trace_xfs_trans_log_buf(bip);
+
 	/*
 	 * If we invalidated the buffer within this transaction, then
 	 * cancel the invalidation now that we're dirtying the buffer
@@ -777,7 +768,6 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 	 * because we have a reference to the buffer this entire time.
 	 */
 	if (bip->bli_flags & XFS_BLI_STALE) {
-		xfs_buf_item_trace("BLOG UNSTALE", bip);
 		bip->bli_flags &= ~XFS_BLI_STALE;
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		XFS_BUF_UNSTALE(bp);
@@ -792,7 +782,6 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 	lidp->lid_flags &= ~XFS_LID_BUF_STALE;
 	bip->bli_flags |= XFS_BLI_LOGGED;
 	xfs_buf_item_log(bip, first, last);
-	xfs_buf_item_trace("BLOG", bip);
 }
 
 
@@ -831,6 +820,8 @@ xfs_trans_binval(
 	ASSERT(lidp != NULL);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
+	trace_xfs_trans_binval(bip);
+
 	if (bip->bli_flags & XFS_BLI_STALE) {
 		/*
 		 * If the buffer is already invalidated, then
@@ -843,8 +834,6 @@ xfs_trans_binval(
 		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
 		ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
 		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
-		xfs_buftrace("XFS_BINVAL RECUR", bp);
-		xfs_buf_item_trace("BINVAL RECUR", bip);
 		return;
 	}
 
@@ -878,8 +867,6 @@ xfs_trans_binval(
 	      (bip->bli_format.blf_map_size * sizeof(uint)));
 	lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	xfs_buftrace("XFS_BINVAL", bp);
-	xfs_buf_item_trace("BINVAL", bip);
 }
 
 /*
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index b572f7e840e..6558ffd8d14 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -53,6 +53,7 @@
 #include "xfs_log_priv.h"
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 int
 xfs_setattr(
@@ -538,9 +539,8 @@ xfs_readlink_bmap(
 		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
 		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 
-		bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt),
-					XBF_LOCK | XBF_MAPPED |
-					XBF_DONT_BLOCK);
+		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt),
+				  XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
 		error = XFS_BUF_GETERROR(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_readlink",
@@ -709,6 +709,11 @@ xfs_fsync(
 }
 
 /*
+ * Flags for xfs_free_eofblocks
+ */
+#define XFS_FREE_EOF_TRYLOCK	(1<<0)
+
+/*
  * This is called by xfs_inactive to free any blocks beyond eof
  * when the link count isn't zero and by xfs_dm_punch_hole() when
  * punching a hole to EOF.
@@ -726,7 +731,6 @@ xfs_free_eofblocks(
 	xfs_filblks_t	map_len;
 	int		nimaps;
 	xfs_bmbt_irec_t	imap;
-	int		use_iolock = (flags & XFS_FREE_EOF_LOCK);
 
 	/*
 	 * Figure out if there are any blocks beyond the end
@@ -768,14 +772,19 @@ xfs_free_eofblocks(
 		 * cache and we can't
 		 * do that within a transaction.
 		 */
-		if (use_iolock)
+		if (flags & XFS_FREE_EOF_TRYLOCK) {
+			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+				xfs_trans_cancel(tp, 0);
+				return 0;
+			}
+		} else {
 			xfs_ilock(ip, XFS_IOLOCK_EXCL);
+		}
 		error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
 				    ip->i_size);
 		if (error) {
 			xfs_trans_cancel(tp, 0);
-			if (use_iolock)
-				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 			return error;
 		}
 
@@ -812,8 +821,7 @@ xfs_free_eofblocks(
 			error = xfs_trans_commit(tp,
 						XFS_TRANS_RELEASE_LOG_RES);
 		}
-		xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
-					    : XFS_ILOCK_EXCL));
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
 	}
 	return error;
 }
@@ -1113,7 +1121,17 @@ xfs_release(
 		     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
 		    (!(ip->i_d.di_flags &
 				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
-			error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
+
+			/*
+			 * If we can't get the iolock just skip truncating
+			 * the blocks past EOF because we could deadlock
+			 * with the mmap_sem otherwise.  We'll get another
+			 * chance to drop them once the last reference to
+			 * the inode is dropped, so we'll never leak blocks
+			 * permanently.
+			 */
+			error = xfs_free_eofblocks(mp, ip,
+						   XFS_FREE_EOF_TRYLOCK);
 			if (error)
 				return error;
 		}
@@ -1184,7 +1202,7 @@ xfs_inactive(
 		     (!(ip->i_d.di_flags &
 				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
 		      (ip->i_delayed_blks != 0)))) {
-			error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
+			error = xfs_free_eofblocks(mp, ip, 0);
 			if (error)
 				return VN_INACTIVE_CACHE;
 		}
@@ -1380,7 +1398,6 @@ xfs_lookup(
 	if (error)
 		goto out_free_name;
 
-	xfs_itrace_ref(*ipp);
 	return 0;
 
 out_free_name:
@@ -1526,7 +1543,6 @@ xfs_create(
 	 * At this point, we've gotten a newly allocated inode.
 	 * It is locked (and joined to the transaction).
 	 */
-	xfs_itrace_ref(ip);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	/*
@@ -1986,9 +2002,6 @@ xfs_remove(
 	if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
 		xfs_filestream_deassociate(ip);
 
-	xfs_itrace_exit(ip);
-	xfs_itrace_exit(dp);
-
  std_return:
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
 		XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
@@ -2285,7 +2298,6 @@ xfs_symlink(
 			goto error_return;
 		goto error1;
 	}
-	xfs_itrace_ref(ip);
 
 	/*
 	 * An error after we've joined dp to the transaction will result in the
@@ -2456,46 +2468,6 @@ xfs_set_dmattrs(
 	return error;
 }
 
-int
-xfs_reclaim(
-	xfs_inode_t	*ip)
-{
-
-	xfs_itrace_entry(ip);
-
-	ASSERT(!VN_MAPPED(VFS_I(ip)));
-
-	/* bad inode, get out here ASAP */
-	if (is_bad_inode(VFS_I(ip))) {
-		xfs_ireclaim(ip);
-		return 0;
-	}
-
-	xfs_ioend_wait(ip);
-
-	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
-
-	/*
-	 * If we have nothing to flush with this inode then complete the
-	 * teardown now, otherwise break the link between the xfs inode and the
-	 * linux inode and clean up the xfs inode later. This avoids flushing
-	 * the inode to disk during the delete operation itself.
-	 *
-	 * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
-	 * first to ensure that xfs_iunpin() will never see an xfs inode
-	 * that has a linux inode being reclaimed. Synchronisation is provided
-	 * by the i_flags_lock.
-	 */
-	if (!ip->i_update_core && (ip->i_itemp == NULL)) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_iflock(ip);
-		xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-		return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
-	}
-	xfs_inode_set_reclaim_tag(ip);
-	return 0;
-}
-
 /*
  * xfs_alloc_file_space()
  *      This routine allocates disk space for the given file.
@@ -2868,7 +2840,6 @@ xfs_free_file_space(
 	ioffset = offset & ~(rounding - 1);
 
 	if (VN_CACHED(VFS_I(ip)) != 0) {
-		xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
 		error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
 		if (error)
 			goto out_unlock_iolock;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index a9e102de71a..167a467403a 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -38,7 +38,6 @@ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
 		const char *target_path, mode_t mode, struct xfs_inode **ipp,
 		cred_t *credp);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
-int xfs_reclaim(struct xfs_inode *ip);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 		xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
 int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,