From 18496e80f729be5f536d0315751b3bbb95ca913e Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Thu, 7 Aug 2008 00:11:12 +0300
Subject: [PATCH] ocfs2/cluster/tcp.c: make some functions static

Commit 0f475b2abed6cbccee1da20a0bef2895eb2a0edd (ocfs2/net: Silence build
warnings) made sense insofar as it fixed compile warnings, but it did not
need to make the functions global.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/cluster/tcp.c          | 44 ++++++++++++++++++++++++++++++++++-------
 fs/ocfs2/cluster/tcp_internal.h | 32 ------------------------------
 2 files changed, 37 insertions(+), 39 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index a27d61581bd..2bcf706d9dd 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -143,8 +143,8 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
 
 #ifdef CONFIG_DEBUG_FS
-void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
-		    u32 msgkey, struct task_struct *task, u8 node)
+static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+			   u32 msgkey, struct task_struct *task, u8 node)
 {
 	INIT_LIST_HEAD(&nst->st_net_debug_item);
 	nst->st_task = task;
@@ -153,31 +153,61 @@ void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
 	nst->st_node = node;
 }
 
-void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
 {
 	do_gettimeofday(&nst->st_sock_time);
 }
 
-void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
 {
 	do_gettimeofday(&nst->st_send_time);
 }
 
-void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
 {
 	do_gettimeofday(&nst->st_status_time);
 }
 
-void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
 					 struct o2net_sock_container *sc)
 {
 	nst->st_sc = sc;
 }
 
-void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
 {
 	nst->st_id = msg_id;
 }
+
+#else  /* CONFIG_DEBUG_FS */
+
+static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+				  u32 msgkey, struct task_struct *task, u8 node)
+{
+}
+
+static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+{
+}
+
+static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+{
+}
+
+static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+{
+}
+
+static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+						struct o2net_sock_container *sc)
+{
+}
+
+static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+					u32 msg_id)
+{
+}
+
 #endif /* CONFIG_DEBUG_FS */
 
 static inline int o2net_reconnect_delay(void)
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 18307ff81b7..8d58cfe410b 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -224,42 +224,10 @@ struct o2net_send_tracking {
 	struct timeval			st_send_time;
 	struct timeval			st_status_time;
 };
-
-void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
-		    u32 msgkey, struct task_struct *task, u8 node);
-void o2net_set_nst_sock_time(struct o2net_send_tracking *nst);
-void o2net_set_nst_send_time(struct o2net_send_tracking *nst);
-void o2net_set_nst_status_time(struct o2net_send_tracking *nst);
-void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
-				  struct o2net_sock_container *sc);
-void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id);
-
 #else
 struct o2net_send_tracking {
 	u32	dummy;
 };
-
-static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
-				  u32 msgkey, struct task_struct *task, u8 node)
-{
-}
-static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
-{
-}
-static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
-{
-}
-static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
-{
-}
-static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
-						struct o2net_sock_container *sc)
-{
-}
-static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
-					u32 msg_id)
-{
-}
 #endif	/* CONFIG_DEBUG_FS */
 
 #endif /* O2CLUSTER_TCP_INTERNAL_H */
-- 
cgit v1.2.3


From a57a874b04e27cb530a0e18c244387452e73ccce Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Wed, 6 Aug 2008 00:50:41 +0400
Subject: [PATCH] ocfs2/cluster/netdebug.c: fix warning

ocfs2/cluster/netdebug.c: fix warning

fs/ocfs2/cluster/netdebug.c:154: warning: format '%lu' expects
     type 'long unsigned int', but argument 17 has type 'suseconds_t'

Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/cluster/netdebug.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index d8bfa0eb41b..52276c02f71 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -138,20 +138,20 @@ static int nst_seq_show(struct seq_file *seq, void *v)
 			   "  message id:   %d\n"
 			   "  message type: %u\n"
 			   "  message key:  0x%08x\n"
-			   "  sock acquiry: %lu.%lu\n"
-			   "  send start:   %lu.%lu\n"
-			   "  wait start:   %lu.%lu\n",
+			   "  sock acquiry: %lu.%ld\n"
+			   "  send start:   %lu.%ld\n"
+			   "  wait start:   %lu.%ld\n",
 			   nst, (unsigned long)nst->st_task->pid,
 			   (unsigned long)nst->st_task->tgid,
 			   nst->st_task->comm, nst->st_node,
 			   nst->st_sc, nst->st_id, nst->st_msg_type,
 			   nst->st_msg_key,
 			   nst->st_sock_time.tv_sec,
-			   (unsigned long)nst->st_sock_time.tv_usec,
+			   (long)nst->st_sock_time.tv_usec,
 			   nst->st_send_time.tv_sec,
-			   (unsigned long)nst->st_send_time.tv_usec,
+			   (long)nst->st_send_time.tv_usec,
 			   nst->st_status_time.tv_sec,
-			   nst->st_status_time.tv_usec);
+			   (long)nst->st_status_time.tv_usec);
 	}
 
 	spin_unlock(&o2net_debug_lock);
@@ -276,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	return sc; /* unused, just needs to be null when done */
 }
 
-#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec
+#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec
 
 static int sc_seq_show(struct seq_file *seq, void *v)
 {
@@ -309,12 +309,12 @@ static int sc_seq_show(struct seq_file *seq, void *v)
 			   "  remote node:     %s\n"
 			   "  page off:        %zu\n"
 			   "  handshake ok:    %u\n"
-			   "  timer:           %lu.%lu\n"
-			   "  data ready:      %lu.%lu\n"
-			   "  advance start:   %lu.%lu\n"
-			   "  advance stop:    %lu.%lu\n"
-			   "  func start:      %lu.%lu\n"
-			   "  func stop:       %lu.%lu\n"
+			   "  timer:           %lu.%ld\n"
+			   "  data ready:      %lu.%ld\n"
+			   "  advance start:   %lu.%ld\n"
+			   "  advance stop:    %lu.%ld\n"
+			   "  func start:      %lu.%ld\n"
+			   "  func stop:       %lu.%ld\n"
 			   "  func key:        %u\n"
 			   "  func type:       %u\n",
 			   sc,
-- 
cgit v1.2.3


From a1af7d15a18d1e375b0a6fee93789a0bbfe088b4 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 19 Aug 2008 17:20:28 -0700
Subject: ocfs2: Fix sleep-with-spinlock recovery regression

This fixes a bug introduced with 539d8264093560b917ee3afe4c7f74e5da09d6a5:
    [PATCH 2/2] ocfs2: Fix race between mount and recovery

ocfs2_mark_dead_nodes() was reading journal inodes while holding the
spinlock protecting our in-memory recovery state. The fix is very simple -
the disk state is protected by a cluster lock that's already held, so we
just move the spinlock down past the read.

Reviewed-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/journal.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 7a37240f7a3..c47bc2a809c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1418,13 +1418,13 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 {
 	unsigned int node_num;
 	int status, i;
+	u32 gen;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_dinode *di;
 
 	/* This is called with the super block cluster lock, so we
 	 * know that the slot map can't change underneath us. */
 
-	spin_lock(&osb->osb_lock);
 	for (i = 0; i < osb->max_slots; i++) {
 		/* Read journal inode to get the recovery generation */
 		status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
@@ -1433,23 +1433,31 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 			goto bail;
 		}
 		di = (struct ocfs2_dinode *)bh->b_data;
-		osb->slot_recovery_generations[i] =
-					ocfs2_get_recovery_generation(di);
+		gen = ocfs2_get_recovery_generation(di);
 		brelse(bh);
 		bh = NULL;
 
+		spin_lock(&osb->osb_lock);
+		osb->slot_recovery_generations[i] = gen;
+
 		mlog(0, "Slot %u recovery generation is %u\n", i,
 		     osb->slot_recovery_generations[i]);
 
-		if (i == osb->slot_num)
+		if (i == osb->slot_num) {
+			spin_unlock(&osb->osb_lock);
 			continue;
+		}
 
 		status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
-		if (status == -ENOENT)
+		if (status == -ENOENT) {
+			spin_unlock(&osb->osb_lock);
 			continue;
+		}
 
-		if (__ocfs2_recovery_map_test(osb, node_num))
+		if (__ocfs2_recovery_map_test(osb, node_num)) {
+			spin_unlock(&osb->osb_lock);
 			continue;
+		}
 		spin_unlock(&osb->osb_lock);
 
 		/* Ok, we have a slot occupied by another node which
@@ -1465,10 +1473,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 			mlog_errno(status);
 			goto bail;
 		}
-
-		spin_lock(&osb->osb_lock);
 	}
-	spin_unlock(&osb->osb_lock);
 
 	status = 0;
 bail:
-- 
cgit v1.2.3


From 83cab5338fa8c74f979223698c8d4cc88f2ab68e Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 21 Aug 2008 14:14:27 +0800
Subject: ocfs2: Jump to correct label in ocfs2_expand_inline_dir()

When we fail to insert an extent in ocfs2_expand_inline_dir(), we should go
to out_commit, not out.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8a187584808..8e9c4a47d81 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1310,7 +1310,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 				  NULL);
 	if (ret) {
 		mlog_errno(ret);
-		goto out;
+		goto out_commit;
 	}
 
 	ret = ocfs2_journal_dirty(handle, di_bh);
@@ -1336,7 +1336,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 					  len, 0, NULL);
 		if (ret) {
 			mlog_errno(ret);
-			goto out;
+			goto out_commit;
 		}
 	}
 
-- 
cgit v1.2.3


From 9780eb6cfaf7d2d5ccc061eaf94e7aec6a17791e Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 5 Aug 2008 11:32:46 -0700
Subject: ocfs2: correctly set i_blocks after inline dir gets expanded

We were setting i_blocks based on allocation before the extent insert, which
is wrong as the value is a calculation based on ip_clusters which gets
updated as a result of the insert. This patch moves the line in question
to just after the call to ocfs2_insert_extent().

Without this fix, inline directories were temporarily having an i_blocks
value of zero immediately after expansion to extents.

Reported-and-tested-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dir.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8e9c4a47d81..9cce563fd62 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1300,7 +1300,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	di->i_size = cpu_to_le64(sb->s_blocksize);
 	di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
-	dir->i_blocks = ocfs2_inode_sector_count(dir);
 
 	/*
 	 * This should never fail as our extent list is empty and all
@@ -1313,6 +1312,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out_commit;
 	}
 
+	/*
+	 * Set i_blocks after the extent insert for the most up to
+	 * date ip_clusters value.
+	 */
+	dir->i_blocks = ocfs2_inode_sector_count(dir);
+
 	ret = ocfs2_journal_dirty(handle, di_bh);
 	if (ret) {
 		mlog_errno(ret);
-- 
cgit v1.2.3


From d6817cdbd143f87f9d7c59a4c3194091190eeb84 Mon Sep 17 00:00:00 2001
From: Joel Becker <Joel.Becker@oracle.com>
Date: Fri, 22 Aug 2008 14:30:10 -0700
Subject: ocfs2: Increment the reference count of an already-active stack.

The ocfs2_stack_driver_request() function failed to increment the
refcount of an already-active stack.  It only did the increment on the
first reference.  Whoops.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Tested-by: Marcos Matsunaga <marcos.matsunaga@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/stackglue.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 10e149ae5e3..07f348b8d72 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -97,13 +97,14 @@ static int ocfs2_stack_driver_request(const char *stack_name,
 		goto out;
 	}
 
-	/* Ok, the stack is pinned */
-	p->sp_count++;
 	active_stack = p;
-
 	rc = 0;
 
 out:
+	/* If we found it, pin it */
+	if (!rc)
+		active_stack->sp_count++;
+
 	spin_unlock(&ocfs2_stack_lock);
 	return rc;
 }
-- 
cgit v1.2.3


From 0e116227a01580acf47437adba3afadf21b6bd5f Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 3 Sep 2008 01:57:14 +0800
Subject: ocfs2: Fix a bug in direct IO read.

ocfs2 will become read-only if we try to read bytes past the end of
i_size. This can be easily reproduced with the following steps:
1. mkfs an ocfs2 volume with bs=4k cs=4k and nosparse.
2. create a small file (say less than 100 bytes); it will be allocated
   1 cluster.
3. read 8196 bytes from the file using O_DIRECT, which exceeds the limit.
4. The ocfs2 volume becomes read-only and dmesg shows:
OCFS2: ERROR (device sda13): ocfs2_direct_IO_get_blocks:
Inode 66010 has a hole at block 1
File system is now read-only due to the potential of on-disk corruption.
Please run fsck.ocfs2 once the file system is unmounted.

So suppress the ERROR message unless 'create' is set.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/aops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 506c24fb507..a53da146627 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -594,7 +594,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
 		ocfs2_error(inode->i_sb,
 			    "Inode %llu has a hole at block %llu\n",
 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-- 
cgit v1.2.3


From 00dc417fa3e763345b34ccb6034d72de76eea0a1 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Fri, 3 Oct 2008 17:32:11 -0400
Subject: ocfs2: fiemap support

Plug ocfs2 into ->fiemap. Some portions of ocfs2_get_clusters() had to be
refactored so that the extent cache can be skipped in favor of going
directly to the on-disk records. This makes it easier for us to determine
which extent is the last one in the btree. Also, I'm not sure we want to be
caching fiemap lookups anyway as they're not directly related to data
read/write.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: ocfs2-devel@oss.oracle.com
Cc: linux-fsdevel@vger.kernel.org
---
 fs/ocfs2/alloc.c      |   9 --
 fs/ocfs2/alloc.h      |   9 ++
 fs/ocfs2/extent_map.c | 346 ++++++++++++++++++++++++++++++++++++++++++--------
 fs/ocfs2/extent_map.h |   3 +
 fs/ocfs2/file.c       |   1 +
 5 files changed, 306 insertions(+), 62 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 10bfb466e06..29ff57ec5d1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -989,15 +989,6 @@ out:
 	return ret;
 }
 
-/*
- * This is only valid for leaf nodes, which are the only ones that can
- * have empty extents anyway.
- */
-static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
-{
-	return !rec->e_leaf_clusters;
-}
-
 /*
  * This function will discard the rightmost extent record.
  */
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 42ff94bd801..60cd3d59230 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -146,4 +146,13 @@ static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
 		return le16_to_cpu(rec->e_leaf_clusters);
 }
 
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
+static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
+{
+	return !rec->e_leaf_clusters;
+}
+
 #endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c58668a326f..aed268e80b4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/fiemap.h>
 
 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
 #include <cluster/masklog.h>
@@ -32,6 +33,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
 #include "super.h"
@@ -282,6 +284,51 @@ out:
 		kfree(new_emi);
 }
 
+static int ocfs2_last_eb_is_empty(struct inode *inode,
+				  struct ocfs2_dinode *di)
+{
+	int ret, next_free;
+	u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), last_eb_blk,
+			       &eb_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+	el = &eb->h_list;
+
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		ret = -EROFS;
+		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+		goto out;
+	}
+
+	if (el->l_tree_depth) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %lu has non zero tree depth in "
+			    "leaf block %llu\n", inode->i_ino,
+			    (unsigned long long)eb_bh->b_blocknr);
+		ret = -EROFS;
+		goto out;
+	}
+
+	next_free = le16_to_cpu(el->l_next_free_rec);
+
+	if (next_free == 0 ||
+	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
+		ret = 1;
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
 /*
  * Return the 1st index within el which contains an extent start
  * larger than v_cluster.
@@ -373,42 +420,28 @@ out:
 	return ret;
 }
 
-int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
-		       u32 *p_cluster, u32 *num_clusters,
-		       unsigned int *extent_flags)
+static int ocfs2_get_clusters_nocache(struct inode *inode,
+				      struct buffer_head *di_bh,
+				      u32 v_cluster, unsigned int *hole_len,
+				      struct ocfs2_extent_rec *ret_rec,
+				      unsigned int *is_last)
 {
-	int ret, i;
-	unsigned int flags = 0;
-	struct buffer_head *di_bh = NULL;
-	struct buffer_head *eb_bh = NULL;
+	int i, ret, tree_height, len;
 	struct ocfs2_dinode *di;
-	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_block *uninitialized_var(eb);
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_rec *rec;
-	u32 coff;
-
-	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-		ret = -ERANGE;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
-				      num_clusters, extent_flags);
-	if (ret == 0)
-		goto out;
+	struct buffer_head *eb_bh = NULL;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, inode);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	memset(ret_rec, 0, sizeof(*ret_rec));
+	if (is_last)
+		*is_last = 0;
 
 	di = (struct ocfs2_dinode *) di_bh->b_data;
 	el = &di->id2.i_list;
+	tree_height = le16_to_cpu(el->l_tree_depth);
 
-	if (el->l_tree_depth) {
+	if (tree_height > 0) {
 		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
 		if (ret) {
 			mlog_errno(ret);
@@ -431,46 +464,143 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	i = ocfs2_search_extent_list(el, v_cluster);
 	if (i == -1) {
 		/*
-		 * A hole was found. Return some canned values that
-		 * callers can key on. If asked for, num_clusters will
-		 * be populated with the size of the hole.
+		 * Holes can be larger than the maximum size of an
+		 * extent, so we return their lengths in a separate
+		 * field.
 		 */
-		*p_cluster = 0;
-		if (num_clusters) {
+		if (hole_len) {
 			ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
-							 v_cluster,
-							 num_clusters);
+							 v_cluster, &len);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
+
+			*hole_len = len;
 		}
-	} else {
-		rec = &el->l_recs[i];
+		goto out_hole;
+	}
 
-		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+	rec = &el->l_recs[i];
 
-		if (!rec->e_blkno) {
-			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
-				    "record (%u, %u, 0)", inode->i_ino,
-				    le32_to_cpu(rec->e_cpos),
-				    ocfs2_rec_clusters(el, rec));
-			ret = -EROFS;
-			goto out;
+	BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+	if (!rec->e_blkno) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0)", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	*ret_rec = *rec;
+
+	/*
+	 * Checking for last extent is potentially expensive - we
+	 * might have to look at the next leaf over to see if it's
+	 * empty.
+	 *
+	 * The first two checks are to see whether the caller even
+	 * cares for this information, and if the extent is at least
+	 * the last in its list.
+	 *
+	 * If those hold true, then the extent is last if any of the
+	 * additional conditions hold true:
+	 *  - Extent list is in-inode
+	 *  - Extent list is right-most
+	 *  - Extent list is 2nd to rightmost, with empty right-most
+	 */
+	if (is_last) {
+		if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
+			if (tree_height == 0)
+				*is_last = 1;
+			else if (eb->h_blkno == di->i_last_eb_blk)
+				*is_last = 1;
+			else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
+				ret = ocfs2_last_eb_is_empty(inode, di);
+				if (ret < 0) {
+					mlog_errno(ret);
+					goto out;
+				}
+				if (ret == 1)
+					*is_last = 1;
+			}
 		}
+	}
+
+out_hole:
+	ret = 0;
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+static void ocfs2_relative_extent_offsets(struct super_block *sb,
+					  u32 v_cluster,
+					  struct ocfs2_extent_rec *rec,
+					  u32 *p_cluster, u32 *num_clusters)
+
+{
+	u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
+
+	*p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
+	*p_cluster = *p_cluster + coff;
+
+	if (num_clusters)
+		*num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
+}
+
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+		       u32 *p_cluster, u32 *num_clusters,
+		       unsigned int *extent_flags)
+{
+	int ret;
+	unsigned int uninitialized_var(hole_len), flags = 0;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
 
-		coff = v_cluster - le32_to_cpu(rec->e_cpos);
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = -ERANGE;
+		mlog_errno(ret);
+		goto out;
+	}
 
-		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
-						    le64_to_cpu(rec->e_blkno));
-		*p_cluster = *p_cluster + coff;
+	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+				      num_clusters, extent_flags);
+	if (ret == 0)
+		goto out;
 
-		if (num_clusters)
-			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		flags = rec->e_flags;
+	ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
+					 &rec, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		ocfs2_extent_map_insert_rec(inode, rec);
+	if (rec.e_blkno == 0ULL) {
+		/*
+		 * A hole was found. Return some canned values that
+		 * callers can key on. If asked for, num_clusters will
+		 * be populated with the size of the hole.
+		 */
+		*p_cluster = 0;
+		if (num_clusters) {
+			*num_clusters = hole_len;
+		}
+	} else {
+		ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
+					      p_cluster, num_clusters);
+		flags = rec.e_flags;
+
+		ocfs2_extent_map_insert_rec(inode, &rec);
 	}
 
 	if (extent_flags)
@@ -478,7 +608,6 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 
 out:
 	brelse(di_bh);
-	brelse(eb_bh);
 	return ret;
 }
 
@@ -521,3 +650,114 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 out:
 	return ret;
 }
+
+static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
+			       struct fiemap_extent_info *fieinfo,
+			       u64 map_start)
+{
+	int ret;
+	unsigned int id_count;
+	struct ocfs2_dinode *di;
+	u64 phys;
+	u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	id_count = le16_to_cpu(di->id2.i_data.id_count);
+
+	if (map_start < id_count) {
+		phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
+		phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+
+		ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
+					      flags);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+#define OCFS2_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
+
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 u64 map_start, u64 map_len)
+{
+	int ret, is_last;
+	u32 mapping_end, cpos;
+	unsigned int hole_size;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u64 len_bytes, phys_bytes, virt_bytes;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
+
+	ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
+	if (ret)
+		return ret;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	/*
+	 * Handle inline-data separately.
+	 */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
+		goto out_unlock;
+	}
+
+	cpos = map_start >> osb->s_clustersize_bits;
+	mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
+					       map_start + map_len);
+	mapping_end -= cpos;
+	is_last = 0;
+	while (cpos < mapping_end && !is_last) {
+		u32 fe_flags;
+
+		ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
+						 &hole_size, &rec, &is_last);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (rec.e_blkno == 0ULL) {
+			cpos += hole_size;
+			continue;
+		}
+
+		fe_flags = 0;
+		if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
+			fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
+		if (is_last)
+			fe_flags |= FIEMAP_EXTENT_LAST;
+		len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
+		phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
+		virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
+
+		ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
+					      len_bytes, fe_flags);
+		if (ret)
+			break;
+
+		cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
+	}
+
+	if (ret > 0)
+		ret = 0;
+
+out_unlock:
+	brelse(di_bh);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_inode_unlock(inode, 0);
+out:
+
+	return ret;
+}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index de91e3e41a2..1b97490e1ea 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -50,4 +50,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 				u64 *ret_count, unsigned int *extent_flags);
 
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 u64 map_start, u64 map_len);
+
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ec2ed15c3da..ed38796052d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2228,6 +2228,7 @@ const struct inode_operations ocfs2_file_iops = {
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
 	.fallocate	= ocfs2_fallocate,
+	.fiemap		= ocfs2_fiemap,
 };
 
 const struct inode_operations ocfs2_special_file_iops = {
-- 
cgit v1.2.3


From a447c0932445f92ce6f4c1bd020f62c5097a7842 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 13 Oct 2008 10:46:57 +0100
Subject: vfs: Use const for kernel parser table

This is a much better version of a previous patch to make the parser
tables constant. Rather than changing the typedef, we put the "const" in
all the various places where it's required, allowing the __initconst
exception for nfsroot which was the cause of the previous trouble.

This was posted for review some time ago and I believe it's been in -mm
since then.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Alexander Viro <aviro@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 88255d3f52b..70334d85aff 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -157,7 +157,7 @@ enum {
 	Opt_err,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_barrier, "barrier=%u"},
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_ro, "errors=remount-ro"},
-- 
cgit v1.2.3


From 53da4939f349d4edd283b043219221ca5b78e4d4 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Mon, 21 Jul 2008 14:29:16 -0700
Subject: ocfs2: POSIX file locks support

This is actually pretty easy since fs/dlm already handles the bulk of the
work. The Ocfs2 userspace cluster stack module already uses fs/dlm as the
underlying lock manager, so I only had to add the right calls.

Cluster-aware POSIX locks ("plocks") can be turned off by the same means as
UNIX locks - mount with 'localflocks', or create a local-only Ocfs2 volume.
Internally, the file system uses two sets of file_operations, depending on
whether cluster-aware plocks are required. This turns out to be easier than
implementing local-only versions of ->lock.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/file.c       | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/file.h       |  2 ++
 fs/ocfs2/inode.c      | 15 +++++++++++++--
 fs/ocfs2/locks.c      | 15 +++++++++++++++
 fs/ocfs2/locks.h      |  1 +
 fs/ocfs2/stack_user.c | 33 +++++++++++++++++++++++++++++++++
 fs/ocfs2/stackglue.c  | 20 ++++++++++++++++++++
 fs/ocfs2/stackglue.h  | 19 +++++++++++++++++++
 8 files changed, 154 insertions(+), 2 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ed38796052d..1015ef16a8b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2237,6 +2237,10 @@ const struct inode_operations ocfs2_special_file_iops = {
 	.permission	= ocfs2_permission,
 };
 
+/*
+ * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
+ * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
+ */
 const struct file_operations ocfs2_fops = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -2251,6 +2255,7 @@ const struct file_operations ocfs2_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = ocfs2_compat_ioctl,
 #endif
+	.lock		= ocfs2_lock,
 	.flock		= ocfs2_flock,
 	.splice_read	= ocfs2_file_splice_read,
 	.splice_write	= ocfs2_file_splice_write,
@@ -2266,6 +2271,52 @@ const struct file_operations ocfs2_dops = {
 	.unlocked_ioctl	= ocfs2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
+	.lock		= ocfs2_lock,
+	.flock		= ocfs2_flock,
+};
+
+/*
+ * POSIX-lockless variants of our file_operations.
+ *
+ * These will be used if the underlying cluster stack does not support
+ * posix file locking, if the user passes the "localflocks" mount
+ * option, or if we have a local-only fs.
+ *
+ * ocfs2_flock is in here because all stacks handle UNIX file locks,
+ * so we still want it in the case of no stack support for
+ * plocks. Internally, it will do the right thing when asked to ignore
+ * the cluster.
+ */
+const struct file_operations ocfs2_fops_no_plocks = {
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.mmap		= ocfs2_mmap,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_file_release,
+	.open		= ocfs2_file_open,
+	.aio_read	= ocfs2_file_aio_read,
+	.aio_write	= ocfs2_file_aio_write,
+	.unlocked_ioctl	= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
+	.flock		= ocfs2_flock,
+	.splice_read	= ocfs2_file_splice_read,
+	.splice_write	= ocfs2_file_splice_write,
+};
+
+const struct file_operations ocfs2_dops_no_plocks = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= ocfs2_readdir,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_dir_release,
+	.open		= ocfs2_dir_open,
+	.unlocked_ioctl	= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
 #endif
 	.flock		= ocfs2_flock,
 };
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 1e27b4d017e..5a6d3e48e4b 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -28,6 +28,8 @@
 
 extern const struct file_operations ocfs2_fops;
 extern const struct file_operations ocfs2_dops;
+extern const struct file_operations ocfs2_fops_no_plocks;
+extern const struct file_operations ocfs2_dops_no_plocks;
 extern const struct inode_operations ocfs2_file_iops;
 extern const struct inode_operations ocfs2_special_file_iops;
 struct ocfs2_alloc_context;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7e9e4c79aec..99f012a0f20 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -219,6 +219,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	struct super_block *sb;
 	struct ocfs2_super *osb;
 	int status = -EINVAL;
+	int use_plocks = 1;
 
 	mlog_entry("(0x%p, size:%llu)\n", inode,
 		   (unsigned long long)le64_to_cpu(fe->i_size));
@@ -226,6 +227,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	sb = inode->i_sb;
 	osb = OCFS2_SB(sb);
 
+	if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+	    ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
+		use_plocks = 0;
+
 	/* this means that read_inode cannot create a superblock inode
 	 * today.  change if needed. */
 	if (!OCFS2_IS_VALID_DINODE(fe) ||
@@ -295,13 +300,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	switch (inode->i_mode & S_IFMT) {
 	    case S_IFREG:
-		    inode->i_fop = &ocfs2_fops;
+		    if (use_plocks)
+			    inode->i_fop = &ocfs2_fops;
+		    else
+			    inode->i_fop = &ocfs2_fops_no_plocks;
 		    inode->i_op = &ocfs2_file_iops;
 		    i_size_write(inode, le64_to_cpu(fe->i_size));
 		    break;
 	    case S_IFDIR:
 		    inode->i_op = &ocfs2_dir_iops;
-		    inode->i_fop = &ocfs2_dops;
+		    if (use_plocks)
+			    inode->i_fop = &ocfs2_dops;
+		    else
+			    inode->i_fop = &ocfs2_dops_no_plocks;
 		    i_size_write(inode, le64_to_cpu(fe->i_size));
 		    break;
 	    case S_IFLNK:
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 203f8714387..544ac624517 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -24,6 +24,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/fcntl.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -32,6 +33,7 @@
 
 #include "dlmglue.h"
 #include "file.h"
+#include "inode.h"
 #include "locks.h"
 
 static int ocfs2_do_flock(struct file *file, struct inode *inode,
@@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 	else
 		return ocfs2_do_flock(file, inode, cmd, fl);
 }
+
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (!(fl->fl_flags & FL_POSIX))
+		return -ENOLCK;
+	if (__mandatory_lock(inode))
+		return -ENOLCK;
+
+	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
+}
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
index 9743ef2324e..496d488b271 100644
--- a/fs/ocfs2/locks.h
+++ b/fs/ocfs2/locks.h
@@ -27,5 +27,6 @@
 #define OCFS2_LOCKS_H
 
 int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl);
 
 #endif /* OCFS2_LOCKS_H */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 353fc35c674..faec2d87935 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -28,6 +28,7 @@
 #include "ocfs2.h"  /* For struct ocfs2_lock_res */
 #include "stackglue.h"
 
+#include <linux/dlm_plock.h>
 
 /*
  * The control protocol starts with a handshake.  Until the handshake
@@ -746,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
 {
 }
 
+static int user_plock(struct ocfs2_cluster_connection *conn,
+		      u64 ino,
+		      struct file *file,
+		      int cmd,
+		      struct file_lock *fl)
+{
+	/*
+	 * This more or less just demuxes the plock request into any
+	 * one of three dlm calls.
+	 *
+	 * Internally, fs/dlm will pass these to a misc device, which
+	 * a userspace daemon will read and write to.
+	 *
+	 * For now, cancel requests (which happen internally only),
+	 * are turned into unlocks. Most of this function taken from
+	 * gfs2_lock.
+	 */
+
+	if (cmd == F_CANCELLK) {
+		cmd = F_SETLK;
+		fl->fl_type = F_UNLCK;
+	}
+
+	if (IS_GETLK(cmd))
+		return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
+	else if (fl->fl_type == F_UNLCK)
+		return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
+	else
+		return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
+}
+
 /*
  * Compare a requested locking protocol version against the current one.
  *
@@ -839,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
 	.dlm_unlock	= user_dlm_unlock,
 	.lock_status	= user_dlm_lock_status,
 	.lock_lvb	= user_dlm_lvb,
+	.plock		= user_plock,
 	.dump_lksb	= user_dlm_dump_lksb,
 };
 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 07f348b8d72..7150f5dce95 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -288,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
 
+int ocfs2_stack_supports_plocks(void)
+{
+	return !!(active_stack && active_stack->sp_ops->plock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
+
+/*
+ * ocfs2_plock() can only be safely called if
+ * ocfs2_stack_supports_plocks() returned true
+ */
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+		struct file *file, int cmd, struct file_lock *fl)
+{
+	WARN_ON_ONCE(active_stack->sp_ops->plock == NULL);
+	if (active_stack->sp_ops->plock)
+		return active_stack->sp_ops->plock(conn, ino, file, cmd, fl);
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(ocfs2_plock);
+
 int ocfs2_cluster_connect(const char *stack_name,
 			  const char *group,
 			  int grouplen,
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index db56281dd1b..c571af375ef 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -28,6 +28,10 @@
 #include "dlm/dlmapi.h"
 #include <linux/dlm.h>
 
+/* Needed for plock-related prototypes */
+struct file;
+struct file_lock;
+
 /*
  * dlmconstants.h does not have a LOCAL flag.  We hope to remove it
  * some day, but right now we need it.  Let's fake it.  This value is larger
@@ -186,6 +190,17 @@ struct ocfs2_stack_operations {
 	 */
 	void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
 
+	/*
+	 * Cluster-aware posix locks
+	 *
+	 * This is NULL for stacks which do not support posix locks.
+	 */
+	int (*plock)(struct ocfs2_cluster_connection *conn,
+		     u64 ino,
+		     struct file *file,
+		     int cmd,
+		     struct file_lock *fl);
+
 	/*
 	 * This is an optoinal debugging hook.  If provided, the
 	 * stack can dump debugging information about this lock.
@@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
 void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
 void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
 
+int ocfs2_stack_supports_plocks(void);
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+		struct file *file, int cmd, struct file_lock *fl);
+
 void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
 
 
-- 
cgit v1.2.3


From ebcee4b5c9136096f64ee6f691a013d7c0a4bc34 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Mon, 28 Jul 2008 14:55:20 -0700
Subject: ocfs2: Track local alloc bits internally

Do this instead of tracking absolute local alloc size. This avoids
needless re-calculation of bits from bytes in localalloc.c. Additionally,
the value is now in a more natural unit for internal file system bitmap
work.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/localalloc.c | 34 ++++++++++++----------------------
 fs/ocfs2/ocfs2.h      | 10 +++++++++-
 fs/ocfs2/super.c      |  8 +++++---
 3 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 28e492e4ec8..b05ce664291 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -47,8 +47,6 @@
 
 #define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
 
-static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
-
 static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
@@ -75,21 +73,13 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
-static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
-{
-	BUG_ON(osb->s_clustersize_bits > 20);
-
-	/* Size local alloc windows by the megabyte */
-	return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
-}
-
 /*
  * Tell us whether a given allocation should use the local alloc
  * file. Otherwise, it has to go to the main bitmap.
  */
 int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
 {
-	int la_bits = ocfs2_local_alloc_window_bits(osb);
+	int la_bits = osb->local_alloc_bits;
 	int ret = 0;
 
 	if (osb->local_alloc_state != OCFS2_LA_ENABLED)
@@ -120,14 +110,16 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	if (osb->local_alloc_size == 0)
+	if (osb->local_alloc_bits == 0)
 		goto bail;
 
-	if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
+	if (osb->local_alloc_bits >= osb->bitmap_cpg) {
 		mlog(ML_NOTICE, "Requested local alloc window %d is larger "
 		     "than max possible %u. Using defaults.\n",
-		     ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
-		osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+		     osb->local_alloc_bits, (osb->bitmap_cpg - 1));
+		osb->local_alloc_bits =
+			ocfs2_megabytes_to_clusters(osb->sb,
+						    OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
 	}
 
 	/* read the alloc off disk */
@@ -190,8 +182,7 @@ bail:
 	if (inode)
 		iput(inode);
 
-	mlog(0, "Local alloc window bits = %d\n",
-	     ocfs2_local_alloc_window_bits(osb));
+	mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
 
 	mlog_exit(status);
 	return status;
@@ -490,7 +481,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
+	if (bits_wanted > osb->local_alloc_bits) {
 		mlog(0, "Asking for more than my max window size!\n");
 		status = -ENOSPC;
 		goto bail;
@@ -803,7 +794,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	(*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
+	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
 
 	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
 	if (status < 0) {
@@ -849,7 +840,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 		     "one\n");
 
 	mlog(0, "Allocating %u clusters for a new window.\n",
-	     ocfs2_local_alloc_window_bits(osb));
+	     osb->local_alloc_bits);
 
 	/* Instruct the allocation code to try the most recently used
 	 * cluster group. We'll re-record the group used this pass
@@ -859,8 +850,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 	/* we used the generic suballoc reserve function, but we set
 	 * everything up nicely, so there's no reason why we can't use
 	 * the more specific cluster api to claim bits. */
-	status = ocfs2_claim_clusters(osb, handle, ac,
-				      ocfs2_local_alloc_window_bits(osb),
+	status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
 				      &cluster_off, &cluster_count);
 	if (status < 0) {
 		if (status != -ENOSPC)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7f625f2b111..43dd42e313a 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -252,7 +252,7 @@ struct ocfs2_super
 	struct ocfs2_journal *journal;
 	unsigned long osb_commit_interval;
 
-	int local_alloc_size;
+	unsigned int local_alloc_bits;
 	enum ocfs2_local_alloc_state local_alloc_state;
 	struct buffer_head *local_alloc_bh;
 	u64 la_last_gd;
@@ -554,6 +554,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
 	return pages_per_cluster;
 }
 
+static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
+						       unsigned int megs)
+{
+	BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576);
+
+	return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
+}
+
 static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
 {
 	spin_lock(&osb->osb_lock);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 70334d85aff..3dee61ebd69 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -637,7 +637,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	osb->s_atime_quantum = parsed_options.atime_quantum;
 	osb->preferred_slot = parsed_options.slot;
 	osb->osb_commit_interval = parsed_options.commit_interval;
-	osb->local_alloc_size = parsed_options.localalloc_opt;
+	osb->local_alloc_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
 
 	status = ocfs2_verify_userspace_stack(osb, &parsed_options);
 	if (status)
@@ -938,6 +938,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 {
 	struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
 	unsigned long opts = osb->s_mount_opt;
+	unsigned int local_alloc_megs;
 
 	if (opts & OCFS2_MOUNT_HB_LOCAL)
 		seq_printf(s, ",_netdev,heartbeat=local");
@@ -970,8 +971,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",commit=%u",
 			   (unsigned) (osb->osb_commit_interval / HZ));
 
-	if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
-		seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
+	local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
+	if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+		seq_printf(s, ",localalloc=%d", local_alloc_megs);
 
 	if (opts & OCFS2_MOUNT_LOCALFLOCKS)
 		seq_printf(s, ",localflocks,");
-- 
cgit v1.2.3


From 9c7af40b210e87f8fddd97b0badc0a352862234a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Mon, 28 Jul 2008 18:02:53 -0700
Subject: ocfs2: throttle back local alloc when low on disk space

Ocfs2's local allocator disables itself for the duration of a mount point
when it has trouble allocating a large enough area from the primary bitmap.
That can cause performance problems, especially for disks which were only
temporarily full or fragmented. This patch allows for the allocator to
shrink its window first, before being disabled. Later, it can also be
re-enabled so that any performance drop is minimized.

To do this, we allow the value of osb->local_alloc_bits to be shrunk when
needed. The default value is recorded in a mostly read-only variable so that
we can re-initialize when required.

Locking had to be updated so that we could protect changes to
local_alloc_bits. Mostly this involves protecting various local alloc values
with the osb spinlock. A new state is also added, OCFS2_LA_THROTTLED, which
is used when the local allocator has shrunk, but is not disabled. If the
available space dips below 1 megabyte, the local alloc file is disabled. In
either case, local alloc is re-enabled 30 seconds after the event, or when
an appropriate amount of bits is seen in the primary bitmap.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/localalloc.c | 198 +++++++++++++++++++++++++++++++++++++++++++++++---
 fs/ocfs2/localalloc.h |   4 +
 fs/ocfs2/ocfs2.h      |  23 +++++-
 fs/ocfs2/suballoc.c   |  31 ++++----
 fs/ocfs2/suballoc.h   |   1 +
 fs/ocfs2/super.c      |   4 +-
 6 files changed, 230 insertions(+), 31 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index b05ce664291..f71658adddb 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -73,16 +73,51 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
+static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
+{
+	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
+		osb->local_alloc_state == OCFS2_LA_ENABLED);
+}
+
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+				      unsigned int num_clusters)
+{
+	spin_lock(&osb->osb_lock);
+	if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
+	    osb->local_alloc_state == OCFS2_LA_THROTTLED)
+		if (num_clusters >= osb->local_alloc_default_bits) {
+			cancel_delayed_work(&osb->la_enable_wq);
+			osb->local_alloc_state = OCFS2_LA_ENABLED;
+		}
+	spin_unlock(&osb->osb_lock);
+}
+
+void ocfs2_la_enable_worker(struct work_struct *work)
+{
+	struct ocfs2_super *osb =
+		container_of(work, struct ocfs2_super,
+			     la_enable_wq.work);
+	spin_lock(&osb->osb_lock);
+	osb->local_alloc_state = OCFS2_LA_ENABLED;
+	spin_unlock(&osb->osb_lock);
+}
+
 /*
  * Tell us whether a given allocation should use the local alloc
  * file. Otherwise, it has to go to the main bitmap.
+ *
+ * This function does semi-dirty reads of local alloc size and state!
+ * This is ok however, as the values are re-checked once under mutex.
  */
 int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
 {
-	int la_bits = osb->local_alloc_bits;
 	int ret = 0;
+	int la_bits;
+
+	spin_lock(&osb->osb_lock);
+	la_bits = osb->local_alloc_bits;
 
-	if (osb->local_alloc_state != OCFS2_LA_ENABLED)
+	if (!ocfs2_la_state_enabled(osb))
 		goto bail;
 
 	/* la_bits should be at least twice the size (in clusters) of
@@ -96,6 +131,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
 bail:
 	mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
 	     osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+	spin_unlock(&osb->osb_lock);
 	return ret;
 }
 
@@ -208,6 +244,9 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
+	cancel_delayed_work(&osb->la_enable_wq);
+	flush_workqueue(ocfs2_wq);
+
 	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
 		goto out;
 
@@ -445,7 +484,7 @@ out:
 }
 
 /*
- * make sure we've got at least bitswanted contiguous bits in the
+ * make sure we've got at least bits_wanted contiguous bits in the
  * local alloc. You lose them when you drop i_mutex.
  *
  * We will add ourselves to the transaction passed in, but may start
@@ -476,16 +515,18 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 
 	mutex_lock(&local_alloc_inode->i_mutex);
 
-	if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
-		status = -ENOSPC;
-		goto bail;
-	}
-
-	if (bits_wanted > osb->local_alloc_bits) {
-		mlog(0, "Asking for more than my max window size!\n");
+	/*
+	 * We must double check state and allocator bits because
+	 * another process may have changed them while holding i_mutex.
+	 */
+	spin_lock(&osb->osb_lock);
+	if (!ocfs2_la_state_enabled(osb) ||
+	    (bits_wanted > osb->local_alloc_bits)) {
+		spin_unlock(&osb->osb_lock);
 		status = -ENOSPC;
 		goto bail;
 	}
+	spin_unlock(&osb->osb_lock);
 
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 
@@ -513,6 +554,21 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 				mlog_errno(status);
 			goto bail;
 		}
+
+		/*
+		 * Under certain conditions, the window slide code
+		 * might have reduced the number of bits available or
+		 * disabled the the local alloc entirely. Re-check
+		 * here and return -ENOSPC if necessary.
+		 */
+		status = -ENOSPC;
+		if (!ocfs2_la_state_enabled(osb))
+			goto bail;
+
+		free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
+			le32_to_cpu(alloc->id1.bitmap1.i_used);
+		if (bits_wanted > free_bits)
+			goto bail;
 	}
 
 	ac->ac_inode = local_alloc_inode;
@@ -780,6 +836,85 @@ bail:
 	return status;
 }
 
+enum ocfs2_la_event {
+	OCFS2_LA_EVENT_SLIDE,		/* Normal window slide. */
+	OCFS2_LA_EVENT_FRAGMENTED,	/* The global bitmap has
+					 * enough bits theoretically
+					 * free, but a contiguous
+					 * allocation could not be
+					 * found. */
+	OCFS2_LA_EVENT_ENOSPC,		/* Global bitmap doesn't have
+					 * enough bits free to satisfy
+					 * our request. */
+};
+#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)
+/*
+ * Given an event, calculate the size of our next local alloc window.
+ *
+ * This should always be called under i_mutex of the local alloc inode
+ * so that local alloc disabling doesn't race with processes trying to
+ * use the allocator.
+ *
+ * Returns the state which the local alloc was left in. This value can
+ * be ignored by some paths.
+ */
+static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
+				  enum ocfs2_la_event event)
+{
+	unsigned int bits;
+	int state;
+
+	spin_lock(&osb->osb_lock);
+	if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
+		WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED);
+		goto out_unlock;
+	}
+
+	/*
+	 * ENOSPC and fragmentation are treated similarly for now.
+	 */
+	if (event == OCFS2_LA_EVENT_ENOSPC ||
+	    event == OCFS2_LA_EVENT_FRAGMENTED) {
+		/*
+		 * We ran out of contiguous space in the primary
+		 * bitmap. Drastically reduce the number of bits used
+		 * by local alloc until we have to disable it.
+		 */
+		bits = osb->local_alloc_bits >> 1;
+		if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
+			/*
+			 * By setting state to THROTTLED, we'll keep
+			 * the number of local alloc bits used down
+			 * until an event occurs which would give us
+			 * reason to assume the bitmap situation might
+			 * have changed.
+			 */
+			osb->local_alloc_state = OCFS2_LA_THROTTLED;
+			osb->local_alloc_bits = bits;
+		} else {
+			osb->local_alloc_state = OCFS2_LA_DISABLED;
+		}
+		queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+				   OCFS2_LA_ENABLE_INTERVAL);
+		goto out_unlock;
+	}
+
+	/*
+	 * Don't increase the size of the local alloc window until we
+	 * know we might be able to fulfill the request. Otherwise, we
+	 * risk bouncing around the global bitmap during periods of
+	 * low space.
+	 */
+	if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
+		osb->local_alloc_bits = osb->local_alloc_default_bits;
+
+out_unlock:
+	state = osb->local_alloc_state;
+	spin_unlock(&osb->osb_lock);
+
+	return state;
+}
+
 static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 						struct ocfs2_alloc_context **ac,
 						struct inode **bitmap_inode,
@@ -794,12 +929,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+retry_enospc:
 	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
 
 	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+	if (status == -ENOSPC) {
+		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
+		    OCFS2_LA_DISABLED)
+			goto bail;
+
+		ocfs2_free_ac_resource(*ac);
+		memset(*ac, 0, sizeof(struct ocfs2_alloc_context));
+		goto retry_enospc;
+	}
 	if (status < 0) {
-		if (status != -ENOSPC)
-			mlog_errno(status);
+		mlog_errno(status);
 		goto bail;
 	}
 
@@ -852,6 +996,34 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 	 * the more specific cluster api to claim bits. */
 	status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
 				      &cluster_off, &cluster_count);
+	if (status == -ENOSPC) {
+retry_enospc:
+		/*
+		 * Note: We could also try syncing the journal here to
+		 * allow use of any free bits which the current
+		 * transaction can't give us access to. --Mark
+		 */
+		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
+		    OCFS2_LA_DISABLED)
+			goto bail;
+
+		status = ocfs2_claim_clusters(osb, handle, ac,
+					      osb->local_alloc_bits,
+					      &cluster_off,
+					      &cluster_count);
+		if (status == -ENOSPC)
+			goto retry_enospc;
+		/*
+		 * We only shrunk the *minimum* number of in our
+		 * request - it's entirely possible that the allocator
+		 * might give us more than we asked for.
+		 */
+		if (status == 0) {
+			spin_lock(&osb->osb_lock);
+			osb->local_alloc_bits = cluster_count;
+			spin_unlock(&osb->osb_lock);
+		}
+	}
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -895,6 +1067,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
+	ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
+
 	/* This will lock the main bitmap for us. */
 	status = ocfs2_local_alloc_reserve_for_window(osb,
 						      &ac,
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 3f76631e110..ac5ea9f8665 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 				 u32 *bit_off,
 				 u32 *num_bits);
 
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+				      unsigned int num_clusters);
+void ocfs2_la_enable_worker(struct work_struct *work);
+
 #endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 43dd42e313a..4d6e200a484 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -171,9 +171,13 @@ struct ocfs2_alloc_stats
 
 enum ocfs2_local_alloc_state
 {
-	OCFS2_LA_UNUSED = 0,
-	OCFS2_LA_ENABLED,
-	OCFS2_LA_DISABLED
+	OCFS2_LA_UNUSED = 0,	/* Local alloc will never be used for
+				 * this mountpoint. */
+	OCFS2_LA_ENABLED,	/* Local alloc is in use. */
+	OCFS2_LA_THROTTLED,	/* Local alloc is in use, but number
+				 * of bits has been reduced. */
+	OCFS2_LA_DISABLED	/* Local alloc has temporarily been
+				 * disabled. */
 };
 
 enum ocfs2_mount_options
@@ -252,9 +256,20 @@ struct ocfs2_super
 	struct ocfs2_journal *journal;
 	unsigned long osb_commit_interval;
 
+	struct delayed_work		la_enable_wq;
+
+	/*
+	 * Must hold local alloc i_mutex and osb->osb_lock to change
+	 * local_alloc_bits. Reads can be done under either lock.
+	 */
 	unsigned int local_alloc_bits;
-	enum ocfs2_local_alloc_state local_alloc_state;
+	unsigned int local_alloc_default_bits;
+
+	enum ocfs2_local_alloc_state local_alloc_state; /* protected
+							 * by osb_lock */
+
 	struct buffer_head *local_alloc_bh;
+
 	u64 la_last_gd;
 
 	/* Next two fields are for local node slot recovery during
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d2d278fb981..de7b93d76d1 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -111,7 +111,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
 						u64 *bg_blkno,
 						u16 *bg_bit_off);
 
-static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 {
 	struct inode *inode = ac->ac_inode;
 
@@ -686,15 +686,6 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
 		if ((status < 0) && (status != -ENOSPC)) {
 			mlog_errno(status);
 			goto bail;
-		} else if (status == -ENOSPC) {
-			/* reserve_local_bits will return enospc with
-			 * the local alloc inode still locked, so we
-			 * can change this safely here. */
-			mlog(0, "Disabling local alloc\n");
-			/* We set to OCFS2_LA_DISABLED so that umount
-			 * can clean up what's left of the local
-			 * allocation */
-			osb->local_alloc_state = OCFS2_LA_DISABLED;
 		}
 	}
 
@@ -1005,6 +996,7 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 	int search = -ENOSPC;
 	int ret;
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	u16 tmp_off, tmp_found;
 	unsigned int max_bits, gd_cluster_off;
 
@@ -1045,6 +1037,12 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 			*bit_off = tmp_off;
 			*bits_found = tmp_found;
 			search = 0; /* success */
+		} else if (tmp_found) {
+			/*
+			 * Don't show bits which we'll be returning
+			 * for allocation to the local alloc bitmap.
+			 */
+			ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
 		}
 	}
 
@@ -1203,9 +1201,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	status = -ENOSPC;
 	/* for now, the chain search is a bit simplistic. We just use
 	 * the 1st group with any empty bits. */
-	while ((status = ac->ac_group_search(alloc_inode, group_bh,
-					     bits_wanted, min_bits, bit_off,
-					     &tmp_bits)) == -ENOSPC) {
+	while ((status = ac->ac_group_search(alloc_inode, group_bh, bits_wanted,
+					     min_bits, bit_off, &tmp_bits)) == -ENOSPC) {
 		if (!bg->bg_next_group)
 			break;
 
@@ -1838,9 +1835,15 @@ int ocfs2_free_clusters(handle_t *handle,
 	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
 					  bg_start_bit, bg_blkno,
 					  num_clusters);
-	if (status < 0)
+	if (status < 0) {
 		mlog_errno(status);
+		goto out;
+	}
+
+	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
+					 num_clusters);
 
+out:
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 544c600662b..40d51daf5fb 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -147,6 +147,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
  * apis above. */
 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
 				      struct ocfs2_alloc_context *ac);
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
 
 /* given a cluster offset, calculate which block group it belongs to
  * and return that block offset. */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3dee61ebd69..a2d3dcf7025 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -637,7 +637,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	osb->s_atime_quantum = parsed_options.atime_quantum;
 	osb->preferred_slot = parsed_options.slot;
 	osb->osb_commit_interval = parsed_options.commit_interval;
-	osb->local_alloc_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
+	osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
+	osb->local_alloc_bits = osb->local_alloc_default_bits;
 
 	status = ocfs2_verify_userspace_stack(osb, &parsed_options);
 	if (status)
@@ -1425,6 +1426,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
 	osb->local_alloc_state = OCFS2_LA_UNUSED;
 	osb->local_alloc_bh = NULL;
+	INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
 
 	init_waitqueue_head(&osb->osb_mount_event);
 
-- 
cgit v1.2.3


From 9a8ff578fb430a8816dfbc73c77e5e09c6d9c343 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 29 Jul 2008 18:29:18 -0700
Subject: ocfs2: track local alloc state via debugfs

A per-mount debugfs file, "local_alloc_stats", is created which when read will
expose the live state of the node's local alloc file. Performance impact is
minimal, only a bit of memory overhead per mount point. Still, the code is
hidden behind CONFIG_OCFS2_FS_STATS. This feature will help us debug
local alloc performance problems on a live system.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/localalloc.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/ocfs2.h      |  5 +++
 2 files changed, 92 insertions(+)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index f71658adddb..b889f10d809 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
+#include <linux/debugfs.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -73,6 +74,85 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
+#ifdef CONFIG_OCFS2_FS_STATS
+
+DEFINE_MUTEX(la_debug_mutex);
+
+static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+#define LA_DEBUG_BUF_SZ	PAGE_CACHE_SIZE
+#define LA_DEBUG_VER	1
+static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
+				   size_t count, loff_t *ppos)
+{
+	struct ocfs2_super *osb = file->private_data;
+	int written, ret;
+	char *buf = osb->local_alloc_debug_buf;
+
+	mutex_lock(&la_debug_mutex);
+	memset(buf, 0, LA_DEBUG_BUF_SZ);
+
+	written = snprintf(buf, LA_DEBUG_BUF_SZ,
+			   "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
+			   LA_DEBUG_VER,
+			   (unsigned long long)osb->la_last_gd,
+			   osb->local_alloc_default_bits,
+			   osb->local_alloc_bits, osb->local_alloc_state);
+
+	ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
+
+	mutex_unlock(&la_debug_mutex);
+	return ret;
+}
+
+static const struct file_operations ocfs2_la_debug_fops = {
+	.open =		ocfs2_la_debug_open,
+	.read =		ocfs2_la_debug_read,
+};
+
+static void ocfs2_init_la_debug(struct ocfs2_super *osb)
+{
+	osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
+	if (!osb->local_alloc_debug_buf)
+		return;
+
+	osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
+						     S_IFREG|S_IRUSR,
+						     osb->osb_debug_root,
+						     osb,
+						     &ocfs2_la_debug_fops);
+	if (!osb->local_alloc_debug) {
+		kfree(osb->local_alloc_debug_buf);
+		osb->local_alloc_debug_buf = NULL;
+	}
+}
+
+static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
+{
+	if (osb->local_alloc_debug)
+		debugfs_remove(osb->local_alloc_debug);
+
+	if (osb->local_alloc_debug_buf)
+		kfree(osb->local_alloc_debug_buf);
+
+	osb->local_alloc_debug_buf = NULL;
+	osb->local_alloc_debug = NULL;
+}
+#else	/* CONFIG_OCFS2_FS_STATS */
+static void ocfs2_init_la_debug(struct ocfs2_super *osb)
+{
+	return;
+}
+static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
+{
+	return;
+}
+#endif
+
 static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
 	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -146,6 +226,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
+	ocfs2_init_la_debug(osb);
+
 	if (osb->local_alloc_bits == 0)
 		goto bail;
 
@@ -218,6 +300,9 @@ bail:
 	if (inode)
 		iput(inode);
 
+	if (status < 0)
+		ocfs2_shutdown_la_debug(osb);
+
 	mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
 
 	mlog_exit(status);
@@ -247,6 +332,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	cancel_delayed_work(&osb->la_enable_wq);
 	flush_workqueue(ocfs2_wq);
 
+	ocfs2_shutdown_la_debug(osb);
+
 	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
 		goto out;
 
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 4d6e200a484..128279986d6 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -272,6 +272,11 @@ struct ocfs2_super
 
 	u64 la_last_gd;
 
+#ifdef CONFIG_OCFS2_FS_STATS
+	struct dentry *local_alloc_debug;
+	char *local_alloc_debug_buf;
+#endif
+
 	/* Next two fields are for local node slot recovery during
 	 * mount. */
 	int dirty;
-- 
cgit v1.2.3


From 231b87d10920e024efaf0f9e86e1bab7bced1620 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:42 +0800
Subject: ocfs2: Modify ocfs2_num_free_extents for future xattr usage.

ocfs2_num_free_extents() is used to find the number of free extent records
in an inode btree. Hence, it takes an "ocfs2_dinode" parameter. We want to
use this for extended attribute trees in the future, so genericize the
interface to take a buffer head. A future patch will allow that buffer_head
to contain any structure rooting an ocfs2 btree.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c |  3 ++-
 fs/ocfs2/alloc.h |  2 +-
 fs/ocfs2/aops.c  |  5 +++--
 fs/ocfs2/dir.c   |  3 ++-
 fs/ocfs2/file.c  | 11 ++++++-----
 fs/ocfs2/file.h  |  2 +-
 6 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 29ff57ec5d1..377acb24f67 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -368,12 +368,13 @@ struct ocfs2_merge_ctxt {
  */
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct ocfs2_dinode *fe)
+			   struct buffer_head *bh)
 {
 	int retval;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_block *eb;
 	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)bh->b_data;
 
 	mlog_entry_void();
 
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 60cd3d59230..5c0f764b59e 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -47,7 +47,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 			struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct ocfs2_dinode *fe);
+			   struct buffer_head *bh);
 /* how many new metadata chunks would an allocation need at maximum? */
 static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
 {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a53da146627..e2008dcec75 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1712,8 +1712,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		 * ocfs2_lock_allocators(). It greatly over-estimates
 		 * the work to be done.
 		 */
-		ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
-					    extents_to_split, &data_ac, &meta_ac);
+		ret = ocfs2_lock_allocators(inode, wc->w_di_bh,
+					    clusters_to_alloc, extents_to_split,
+					    &data_ac, &meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 9cce563fd62..fda09c32a5f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1479,7 +1479,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	spin_lock(&OCFS2_I(dir)->ip_lock);
 	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
-		num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
+		num_free_extents = ocfs2_num_free_extents(osb, dir,
+							  parent_fe_bh);
 		if (num_free_extents < 0) {
 			status = num_free_extents;
 			mlog_errno(status);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1015ef16a8b..b6c483dfe61 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -521,7 +521,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	if (mark_unwritten)
 		flags = OCFS2_EXT_UNWRITTEN;
 
-	free_extents = ocfs2_num_free_extents(osb, inode, fe);
+	free_extents = ocfs2_num_free_extents(osb, inode, fe_bh);
 	if (free_extents < 0) {
 		status = free_extents;
 		mlog_errno(status);
@@ -609,7 +609,7 @@ leave:
  * File systems which don't support holes call this from
  * ocfs2_extend_allocation().
  */
-int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *di_bh,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac)
@@ -617,6 +617,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 	int ret = 0, num_free_extents;
 	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
 	*meta_ac = NULL;
 	if (data_ac)
@@ -629,7 +630,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
 	     le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
 
-	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
+	num_free_extents = ocfs2_num_free_extents(osb, inode, di_bh);
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
 		mlog_errno(ret);
@@ -724,7 +725,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
+	status = ocfs2_lock_allocators(inode, bh, clusters_to_add, 0, &data_ac,
 				       &meta_ac);
 	if (status) {
 		mlog_errno(status);
@@ -1395,7 +1396,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
-	ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
+	ret = ocfs2_lock_allocators(inode, di_bh, 0, 1, NULL, &meta_ac);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 5a6d3e48e4b..c96b8054fbe 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -57,7 +57,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       enum ocfs2_alloc_restarted *reason_ret);
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
 			  u64 zero_to);
-int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *fe,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac);
-- 
cgit v1.2.3


From 811f933df1e55615fd0bb4818f31e3868a8e6e23 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:43 +0800
Subject: ocfs2: Use ocfs2_extent_list instead of ocfs2_dinode.

ocfs2_extend_meta_needed(), ocfs2_calc_extend_credits() and
ocfs2_reserve_new_metadata() are all useful for extent tree operations. But
they are all limited to an inode btree because they use a struct
ocfs2_dinode parameter. Change their parameter to struct ocfs2_extent_list
(the part of an ocfs2_dinode they actually use) so that the xattr btree code
can use these functions.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c    |  3 ++-
 fs/ocfs2/alloc.h    | 12 +++++++++---
 fs/ocfs2/aops.c     |  3 ++-
 fs/ocfs2/dir.c      |  5 +++--
 fs/ocfs2/file.c     |  9 +++++----
 fs/ocfs2/journal.h  | 17 +++++++++++------
 fs/ocfs2/suballoc.c |  4 ++--
 fs/ocfs2/suballoc.h |  7 ++++++-
 8 files changed, 40 insertions(+), 20 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 377acb24f67..dc36cd14075 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4527,7 +4527,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 	} else
 		rightmost_el = path_leaf_el(path);
 
-	credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
+	credits += path->p_tree_depth +
+		   ocfs2_extend_meta_needed(&di->id2.i_list);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 5c0f764b59e..a0e334f10cd 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -48,8 +48,14 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *bh);
-/* how many new metadata chunks would an allocation need at maximum? */
-static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+/*
+ * how many new metadata chunks would an allocation need at maximum?
+ *
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
 {
 	/*
 	 * Rather than do all the work of determining how much we need
@@ -59,7 +65,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
 	 * new tree_depth==0 extent_block, and one block at the new
 	 * top-of-the tree.
 	 */
-	return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+	return le16_to_cpu(root_el->l_tree_depth) + 2;
 }
 
 void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e2008dcec75..bbe3f8b2d0e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1720,7 +1720,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 			goto out;
 		}
 
-		credits = ocfs2_calc_extend_credits(inode->i_sb, di,
+		credits = ocfs2_calc_extend_credits(inode->i_sb,
+						    &di->id2.i_list,
 						    clusters_to_alloc);
 
 	}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index fda09c32a5f..126aa219c0c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1430,6 +1430,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	int credits, num_free_extents, drop_alloc_sem = 0;
 	loff_t dir_i_size;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	struct ocfs2_extent_list *el = &fe->id2.i_list;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle = NULL;
@@ -1488,7 +1489,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		}
 
 		if (!num_free_extents) {
-			status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
+			status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
 			if (status < 0) {
 				if (status != -ENOSPC)
 					mlog_errno(status);
@@ -1503,7 +1504,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		credits = ocfs2_calc_extend_credits(sb, fe, 1);
+		credits = ocfs2_calc_extend_credits(sb, el, 1);
 	} else {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
 		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b6c483dfe61..a31bba6c557 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -540,7 +540,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 		goto leave;
 	} else if ((!free_extents)
 		   && (ocfs2_alloc_context_bits_left(meta_ac)
-		       < ocfs2_extend_meta_needed(fe))) {
+		       < ocfs2_extend_meta_needed(&fe->id2.i_list))) {
 		mlog(0, "filesystem is really fragmented...\n");
 		status = -EAGAIN;
 		reason = RESTART_META;
@@ -652,7 +652,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *di_bh,
 	 */
 	if (!num_free_extents ||
 	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
-		ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
+		ret = ocfs2_reserve_new_metadata(osb, &di->id2.i_list, meta_ac);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
@@ -732,7 +732,8 @@ restart_all:
 		goto leave;
 	}
 
-	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
+	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
+					    clusters_to_add);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
@@ -790,7 +791,7 @@ restarted_transaction:
 			mlog(0, "restarting transaction.\n");
 			/* TODO: This can be more intelligent. */
 			credits = ocfs2_calc_extend_credits(osb->sb,
-							    fe,
+							    &fe->id2.i_list,
 							    clusters_to_add);
 			status = ocfs2_extend_trans(handle, credits);
 			if (status < 0) {
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 2178ebffa05..9485f8037d9 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -340,11 +340,16 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 #define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
 			     + OCFS2_UNLINK_CREDITS)
 
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
 static inline int ocfs2_calc_extend_credits(struct super_block *sb,
-					    struct ocfs2_dinode *fe,
+					    struct ocfs2_extent_list *root_el,
 					    u32 bits_wanted)
 {
-	int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
+	int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
 
 	/* bitmap dinode, group desc. + relinked group. */
 	bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
@@ -355,16 +360,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 	 * however many metadata chunks needed * a remaining suballoc
 	 * alloc. */
 	sysfile_bitmap_blocks = 1 +
-		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
+		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
 
 	/* this does not include *new* metadata blocks, which are
-	 * accounted for in sysfile_bitmap_blocks. fe +
+	 * accounted for in sysfile_bitmap_blocks. root_el +
 	 * prev. last_eb_blk + blocks along edge of tree.
 	 * calc_symlink_credits passes because we just need 1
 	 * credit for the dinode there. */
-	dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
+	extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
 
-	return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
+	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
 }
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index de7b93d76d1..2a817bca1dd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -494,7 +494,7 @@ bail:
 }
 
 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
-			       struct ocfs2_dinode *fe,
+			       struct ocfs2_extent_list *root_el,
 			       struct ocfs2_alloc_context **ac)
 {
 	int status;
@@ -507,7 +507,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
+	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(root_el);
 	(*ac)->ac_which = OCFS2_AC_USE_META;
 	slot = osb->slot_num;
 	(*ac)->ac_group_search = ocfs2_block_group_search;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 40d51daf5fb..3f96c875bcf 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -59,8 +59,13 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
 	return ac->ac_bits_wanted - ac->ac_bits_given;
 }
 
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
-			       struct ocfs2_dinode *fe,
+			       struct ocfs2_extent_list *root_el,
 			       struct ocfs2_alloc_context **ac);
 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 			    struct ocfs2_alloc_context **ac);
-- 
cgit v1.2.3


From e7d4cb6bc19658646357eeff134645cd9bc3479f Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:44 +0800
Subject: ocfs2: Abstract ocfs2_extent_tree in b-tree operations.

The old extent tree operations assume that the ocfs2_extent_list
in ocfs2_dinode is the tree root. As xattrs will also use an
ocfs2_extent_list to store large values for an xattr entry, we
refactor the tree operations so that xattr code can use them
directly.

The refactoring includes 4 steps:
1. Abstract set/get of last_eb_blk and update_clusters since they may
   be stored in different locations for dinode and xattr.
2. Add a new structure named ocfs2_extent_tree to indicate the
   extent tree the operation will work on.
3. Remove all uses of fe_bh and di; use root_bh and root_el in the
   extent tree instead. So now every fe_bh is replaced with
   et->root_bh, and el with root_el accordingly.
4. Make ocfs2_lock_allocators generic. Until now it could only be used
   for file extend allocation, but the whole function is useful when we
   want to store large EAs.

Note: This patch doesn't touch ocfs2_commit_truncate() since it is not used
for anything other than truncate inode data btrees.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c    | 508 +++++++++++++++++++++++++++++++++-------------------
 fs/ocfs2/alloc.h    |  23 ++-
 fs/ocfs2/aops.c     |  11 +-
 fs/ocfs2/dir.c      |   7 +-
 fs/ocfs2/file.c     | 104 ++---------
 fs/ocfs2/file.h     |   4 -
 fs/ocfs2/suballoc.c |  82 +++++++++
 fs/ocfs2/suballoc.h |   5 +
 8 files changed, 456 insertions(+), 288 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index dc36cd14075..579659bae6c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,6 +49,143 @@
 
 #include "buffer_head_io.h"
 
+/*
+ * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
+ * the b-tree operations in ocfs2. The b-tree operations are no longer
+ * limited to ocfs2_dinode. Any structure which needs to allocate clusters
+ * for storage can use a b-tree; it only needs to implement its own
+ * ocfs2_extent_tree and operations.
+ *
+ * ocfs2_extent_tree contains info for the root of the b-tree. It must have a
+ * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
+ * functions.
+ * ocfs2_extent_tree_operations abstracts the normal operations we do for
+ * the root of an extent b-tree.
+ */
+struct ocfs2_extent_tree;
+
+struct ocfs2_extent_tree_operations {
+	void (*set_last_eb_blk) (struct ocfs2_extent_tree *et, u64 blkno);
+	u64 (*get_last_eb_blk) (struct ocfs2_extent_tree *et);
+	void (*update_clusters) (struct inode *inode,
+				 struct ocfs2_extent_tree *et,
+				 u32 new_clusters);
+	int (*sanity_check) (struct inode *inode, struct ocfs2_extent_tree *et);
+};
+
+struct ocfs2_extent_tree {
+	enum ocfs2_extent_tree_type type;
+	struct ocfs2_extent_tree_operations *eops;
+	struct buffer_head *root_bh;
+	struct ocfs2_extent_list *root_el;
+};
+
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					 u64 blkno)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
+
+	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
+	di->i_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
+
+	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
+	return le64_to_cpu(di->i_last_eb_blk);
+}
+
+static void ocfs2_dinode_update_clusters(struct inode *inode,
+					 struct ocfs2_extent_tree *et,
+					 u32 clusters)
+{
+	struct ocfs2_dinode *di =
+			(struct ocfs2_dinode *)et->root_bh->b_data;
+
+	le32_add_cpu(&di->i_clusters, clusters);
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+
+static int ocfs2_dinode_sanity_check(struct inode *inode,
+				     struct ocfs2_extent_tree *et)
+{
+	int ret = 0;
+	struct ocfs2_dinode *di;
+
+	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
+
+	di = (struct ocfs2_dinode *)et->root_bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		ret = -EIO;
+		ocfs2_error(inode->i_sb,
+			"Inode %llu has invalid path root",
+			(unsigned long long)OCFS2_I(inode)->ip_blkno);
+	}
+
+	return ret;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+	.set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
+	.get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
+	.update_clusters	= ocfs2_dinode_update_clusters,
+	.sanity_check		= ocfs2_dinode_sanity_check,
+};
+
+static struct ocfs2_extent_tree*
+	 ocfs2_new_extent_tree(struct buffer_head *bh,
+			       enum ocfs2_extent_tree_type et_type)
+{
+	struct ocfs2_extent_tree *et;
+
+	et = kzalloc(sizeof(*et), GFP_NOFS);
+	if (!et)
+		return NULL;
+
+	et->type = et_type;
+	get_bh(bh);
+	et->root_bh = bh;
+
+	/* currently we only support dinode extents. */
+	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
+	if (et_type == OCFS2_DINODE_EXTENT) {
+		et->root_el = &((struct ocfs2_dinode *)bh->b_data)->id2.i_list;
+		et->eops = &ocfs2_dinode_et_ops;
+	}
+
+	return et;
+}
+
+static void ocfs2_free_extent_tree(struct ocfs2_extent_tree *et)
+{
+	if (et) {
+		brelse(et->root_bh);
+		kfree(et);
+	}
+}
+
+static inline void ocfs2_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					 u64 new_last_eb_blk)
+{
+	et->eops->set_last_eb_blk(et, new_last_eb_blk);
+}
+
+static inline u64 ocfs2_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	return et->eops->get_last_eb_blk(et);
+}
+
+static inline void ocfs2_update_clusters(struct inode *inode,
+					 struct ocfs2_extent_tree *et,
+					 u32 clusters)
+{
+	et->eops->update_clusters(inode, et, clusters);
+}
+
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
 					 struct ocfs2_extent_block *eb);
@@ -204,17 +341,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
 	return path;
 }
 
-/*
- * Allocate and initialize a new path based on a disk inode tree.
- */
-static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
-{
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-	struct ocfs2_extent_list *el = &di->id2.i_list;
-
-	return ocfs2_new_path(di_bh, el);
-}
-
 /*
  * Convenience function to journal all components in a path.
  */
@@ -368,24 +494,33 @@ struct ocfs2_merge_ctxt {
  */
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct buffer_head *bh)
+			   struct buffer_head *root_bh,
+			   enum ocfs2_extent_tree_type type)
 {
 	int retval;
-	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_list *el = NULL;
 	struct ocfs2_extent_block *eb;
 	struct buffer_head *eb_bh = NULL;
-	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)bh->b_data;
+	u64 last_eb_blk = 0;
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-		retval = -EIO;
-		goto bail;
+	if (type == OCFS2_DINODE_EXTENT) {
+		struct ocfs2_dinode *fe =
+				(struct ocfs2_dinode *)root_bh->b_data;
+		if (!OCFS2_IS_VALID_DINODE(fe)) {
+			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+			retval = -EIO;
+			goto bail;
+		}
+
+		if (fe->i_last_eb_blk)
+			last_eb_blk = le64_to_cpu(fe->i_last_eb_blk);
+		el = &fe->id2.i_list;
 	}
 
-	if (fe->i_last_eb_blk) {
-		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+	if (last_eb_blk) {
+		retval = ocfs2_read_block(osb, last_eb_blk,
 					  &eb_bh, OCFS2_BH_CACHED, inode);
 		if (retval < 0) {
 			mlog_errno(retval);
@@ -393,8 +528,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 		}
 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 		el = &eb->h_list;
-	} else
-		el = &fe->id2.i_list;
+	}
 
 	BUG_ON(el->l_tree_depth != 0);
 
@@ -532,7 +666,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
 static int ocfs2_add_branch(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
-			    struct buffer_head *fe_bh,
+			    struct ocfs2_extent_tree *et,
 			    struct buffer_head *eb_bh,
 			    struct buffer_head **last_eb_bh,
 			    struct ocfs2_alloc_context *meta_ac)
@@ -541,7 +675,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	u64 next_blkno, new_last_eb_blk;
 	struct buffer_head *bh;
 	struct buffer_head **new_eb_bhs = NULL;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list  *eb_el;
 	struct ocfs2_extent_list  *el;
@@ -551,13 +684,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	BUG_ON(!last_eb_bh || !*last_eb_bh);
 
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
 	if (eb_bh) {
 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 		el = &eb->h_list;
 	} else
-		el = &fe->id2.i_list;
+		el = et->root_el;
 
 	/* we never add a branch to a leaf. */
 	BUG_ON(!el->l_tree_depth);
@@ -647,7 +778,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		mlog_errno(status);
 		goto bail;
 	}
-	status = ocfs2_journal_access(handle, inode, fe_bh,
+	status = ocfs2_journal_access(handle, inode, et->root_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -663,7 +794,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	}
 
 	/* Link the new branch into the rest of the tree (el will
-	 * either be on the fe, or the extent block passed in. */
+	 * either be on the root_bh, or the extent block passed in. */
 	i = le16_to_cpu(el->l_next_free_rec);
 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
@@ -672,7 +803,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	/* fe needs a new last extent block pointer, as does the
 	 * next_leaf on the previously last-extent-block. */
-	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+	ocfs2_set_last_eb_blk(et, new_last_eb_blk);
 
 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
@@ -680,7 +811,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	status = ocfs2_journal_dirty(handle, *last_eb_bh);
 	if (status < 0)
 		mlog_errno(status);
-	status = ocfs2_journal_dirty(handle, fe_bh);
+	status = ocfs2_journal_dirty(handle, et->root_bh);
 	if (status < 0)
 		mlog_errno(status);
 	if (eb_bh) {
@@ -718,16 +849,15 @@ bail:
 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 				  handle_t *handle,
 				  struct inode *inode,
-				  struct buffer_head *fe_bh,
+				  struct ocfs2_extent_tree *et,
 				  struct ocfs2_alloc_context *meta_ac,
 				  struct buffer_head **ret_new_eb_bh)
 {
 	int status, i;
 	u32 new_clusters;
 	struct buffer_head *new_eb_bh = NULL;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_list  *fe_el;
+	struct ocfs2_extent_list  *root_el;
 	struct ocfs2_extent_list  *eb_el;
 
 	mlog_entry_void();
@@ -747,8 +877,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	}
 
 	eb_el = &eb->h_list;
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	fe_el = &fe->id2.i_list;
+	root_el = et->root_el;
 
 	status = ocfs2_journal_access(handle, inode, new_eb_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
@@ -757,11 +886,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	/* copy the fe data into the new extent block */
-	eb_el->l_tree_depth = fe_el->l_tree_depth;
-	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
-	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
-		eb_el->l_recs[i] = fe_el->l_recs[i];
+	/* copy the root extent list data into the new extent block */
+	eb_el->l_tree_depth = root_el->l_tree_depth;
+	eb_el->l_next_free_rec = root_el->l_next_free_rec;
+	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+		eb_el->l_recs[i] = root_el->l_recs[i];
 
 	status = ocfs2_journal_dirty(handle, new_eb_bh);
 	if (status < 0) {
@@ -769,7 +898,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, inode, fe_bh,
+	status = ocfs2_journal_access(handle, inode, et->root_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -778,21 +907,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 
 	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
 
-	/* update fe now */
-	le16_add_cpu(&fe_el->l_tree_depth, 1);
-	fe_el->l_recs[0].e_cpos = 0;
-	fe_el->l_recs[0].e_blkno = eb->h_blkno;
-	fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
-	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
-		memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
-	fe_el->l_next_free_rec = cpu_to_le16(1);
+	/* update root_bh now */
+	le16_add_cpu(&root_el->l_tree_depth, 1);
+	root_el->l_recs[0].e_cpos = 0;
+	root_el->l_recs[0].e_blkno = eb->h_blkno;
+	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+	root_el->l_next_free_rec = cpu_to_le16(1);
 
 	/* If this is our 1st tree depth shift, then last_eb_blk
 	 * becomes the allocated extent block */
-	if (fe_el->l_tree_depth == cpu_to_le16(1))
-		fe->i_last_eb_blk = eb->h_blkno;
+	if (root_el->l_tree_depth == cpu_to_le16(1))
+		ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
-	status = ocfs2_journal_dirty(handle, fe_bh);
+	status = ocfs2_journal_dirty(handle, et->root_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -818,22 +947,21 @@ bail:
  * 1) a lowest extent block is found, then we pass it back in
  *    *lowest_eb_bh and return '0'
  *
- * 2) the search fails to find anything, but the dinode has room. We
+ * 2) the search fails to find anything, but the root_el has room. We
  *    pass NULL back in *lowest_eb_bh, but still return '0'
  *
- * 3) the search fails to find anything AND the dinode is full, in
+ * 3) the search fails to find anything AND the root_el is full, in
  *    which case we return > 0
  *
  * return status < 0 indicates an error.
  */
 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 				    struct inode *inode,
-				    struct buffer_head *fe_bh,
+				    struct ocfs2_extent_tree *et,
 				    struct buffer_head **target_bh)
 {
 	int status = 0, i;
 	u64 blkno;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list  *el;
 	struct buffer_head *bh = NULL;
@@ -843,8 +971,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 	*target_bh = NULL;
 
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	el = &fe->id2.i_list;
+	el = et->root_el;
 
 	while(le16_to_cpu(el->l_tree_depth) > 1) {
 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
@@ -896,8 +1023,8 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 	/* If we didn't find one and the fe doesn't have any room,
 	 * then return '1' */
-	if (!lowest_bh
-	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+	el = et->root_el;
+	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
 		status = 1;
 
 	*target_bh = lowest_bh;
@@ -920,19 +1047,19 @@ bail:
  * *last_eb_bh will be updated by ocfs2_add_branch().
  */
 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
-			   struct buffer_head *di_bh, int *final_depth,
+			   struct ocfs2_extent_tree *et, int *final_depth,
 			   struct buffer_head **last_eb_bh,
 			   struct ocfs2_alloc_context *meta_ac)
 {
 	int ret, shift;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-	int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
+	struct ocfs2_extent_list *el = et->root_el;
+	int depth = le16_to_cpu(el->l_tree_depth);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *bh = NULL;
 
 	BUG_ON(meta_ac == NULL);
 
-	shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
+	shift = ocfs2_find_branch_target(osb, inode, et, &bh);
 	if (shift < 0) {
 		ret = shift;
 		mlog_errno(ret);
@@ -949,7 +1076,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
 		/* ocfs2_shift_tree_depth will return us a buffer with
 		 * the new extent block (so we can pass that to
 		 * ocfs2_add_branch). */
-		ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
+		ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
 					     meta_ac, &bh);
 		if (ret < 0) {
 			mlog_errno(ret);
@@ -976,7 +1103,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
 	/* call ocfs2_add_branch to add the final part of the tree with
 	 * the new data. */
 	mlog(0, "add branch. bh = %p\n", bh);
-	ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
+	ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
 			       meta_ac);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -2059,11 +2186,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 				     struct ocfs2_path *right_path,
 				     int subtree_index,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
-				     int *deleted)
+				     int *deleted,
+				     struct ocfs2_extent_tree *et)
 {
 	int ret, i, del_right_subtree = 0, right_has_empty = 0;
-	struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
 	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
 	struct ocfs2_extent_block *eb;
 
@@ -2115,7 +2242,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		 * We have to update i_last_eb_blk during the meta
 		 * data delete.
 		 */
-		ret = ocfs2_journal_access(handle, inode, di_bh,
+		ret = ocfs2_journal_access(handle, inode, et_root_bh,
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
@@ -2190,7 +2317,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		ocfs2_update_edge_lengths(inode, handle, left_path);
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
-		di->i_last_eb_blk = eb->h_blkno;
+		ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
 		/*
 		 * Removal of the extent in the left leaf was skipped
@@ -2200,7 +2327,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		if (right_has_empty)
 			ocfs2_remove_empty_extent(left_leaf_el);
 
-		ret = ocfs2_journal_dirty(handle, di_bh);
+		ret = ocfs2_journal_dirty(handle, et_root_bh);
 		if (ret)
 			mlog_errno(ret);
 
@@ -2323,7 +2450,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 				    handle_t *handle, int orig_credits,
 				    struct ocfs2_path *path,
 				    struct ocfs2_cached_dealloc_ctxt *dealloc,
-				    struct ocfs2_path **empty_extent_path)
+				    struct ocfs2_path **empty_extent_path,
+				    struct ocfs2_extent_tree *et)
 {
 	int ret, subtree_root, deleted;
 	u32 right_cpos;
@@ -2396,7 +2524,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 
 		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
 						right_path, subtree_root,
-						dealloc, &deleted);
+						dealloc, &deleted, et);
 		if (ret == -EAGAIN) {
 			/*
 			 * The rotation has to temporarily stop due to
@@ -2439,29 +2567,20 @@ out:
 }
 
 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
-				       struct ocfs2_path *path,
-				       struct ocfs2_cached_dealloc_ctxt *dealloc)
+				struct ocfs2_path *path,
+				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				struct ocfs2_extent_tree *et)
 {
 	int ret, subtree_index;
 	u32 cpos;
 	struct ocfs2_path *left_path = NULL;
-	struct ocfs2_dinode *di;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 
-	/*
-	 * XXX: This code assumes that the root is an inode, which is
-	 * true for now but may change as tree code gets generic.
-	 */
-	di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		ret = -EIO;
-		ocfs2_error(inode->i_sb,
-			    "Inode %llu has invalid path root",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		goto out;
-	}
 
+	ret = et->eops->sanity_check(inode, et);
+	if (ret)
+		goto out;
 	/*
 	 * There's two ways we handle this depending on
 	 * whether path is the only existing one.
@@ -2518,7 +2637,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		ocfs2_update_edge_lengths(inode, handle, left_path);
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
-		di->i_last_eb_blk = eb->h_blkno;
+		ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 	} else {
 		/*
 		 * 'path' is also the leftmost path which
@@ -2529,12 +2648,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		 */
 		ocfs2_unlink_path(inode, handle, dealloc, path, 1);
 
-		el = &di->id2.i_list;
+		el = et->root_el;
 		el->l_tree_depth = 0;
 		el->l_next_free_rec = 0;
 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
 
-		di->i_last_eb_blk = 0;
+		ocfs2_set_last_eb_blk(et, 0);
 	}
 
 	ocfs2_journal_dirty(handle, path_root_bh(path));
@@ -2562,7 +2681,8 @@ out:
  */
 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
 				  struct ocfs2_path *path,
-				  struct ocfs2_cached_dealloc_ctxt *dealloc)
+				  struct ocfs2_cached_dealloc_ctxt *dealloc,
+				  struct ocfs2_extent_tree *et)
 {
 	int ret, orig_credits = handle->h_buffer_credits;
 	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -2576,7 +2696,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
 	if (path->p_tree_depth == 0) {
 rightmost_no_delete:
 		/*
-		 * In-inode extents. This is trivially handled, so do
+		 * Inline extents. This is trivially handled, so do
 		 * it up front.
 		 */
 		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
@@ -2630,7 +2750,7 @@ rightmost_no_delete:
 		 */
 
 		ret = ocfs2_remove_rightmost_path(inode, handle, path,
-						  dealloc);
+						  dealloc, et);
 		if (ret)
 			mlog_errno(ret);
 		goto out;
@@ -2642,7 +2762,7 @@ rightmost_no_delete:
 	 */
 try_rotate:
 	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
-				       dealloc, &restart_path);
+				       dealloc, &restart_path, et);
 	if (ret && ret != -EAGAIN) {
 		mlog_errno(ret);
 		goto out;
@@ -2654,7 +2774,7 @@ try_rotate:
 
 		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
 					       tmp_path, dealloc,
-					       &restart_path);
+					       &restart_path, et);
 		if (ret && ret != -EAGAIN) {
 			mlog_errno(ret);
 			goto out;
@@ -2940,6 +3060,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 				handle_t *handle,
 				struct ocfs2_extent_rec *split_rec,
 				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				struct ocfs2_extent_tree *et,
 				int index)
 {
 	int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3060,7 +3181,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 		    le16_to_cpu(el->l_next_free_rec) == 1) {
 
 			ret = ocfs2_remove_rightmost_path(inode, handle,
-							  right_path, dealloc);
+							  right_path,
+							  dealloc, et);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -3087,7 +3209,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 				     int split_index,
 				     struct ocfs2_extent_rec *split_rec,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
-				     struct ocfs2_merge_ctxt *ctxt)
+				     struct ocfs2_merge_ctxt *ctxt,
+				     struct ocfs2_extent_tree *et)
 
 {
 	int ret = 0;
@@ -3105,7 +3228,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 * illegal.
 		 */
 		ret = ocfs2_rotate_tree_left(inode, handle, path,
-					     dealloc);
+					     dealloc, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3148,7 +3271,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
 
 		/* The merge left us with an empty extent, remove it. */
-		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+		ret = ocfs2_rotate_tree_left(inode, handle, path,
+					     dealloc, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3162,7 +3286,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 */
 		ret = ocfs2_merge_rec_left(inode, path,
 					   handle, rec,
-					   dealloc,
+					   dealloc, et,
 					   split_index);
 
 		if (ret) {
@@ -3171,7 +3295,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		}
 
 		ret = ocfs2_rotate_tree_left(inode, handle, path,
-					     dealloc);
+					     dealloc, et);
 		/*
 		 * Error from this last rotate is not critical, so
 		 * print but don't bubble it up.
@@ -3191,7 +3315,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 			ret = ocfs2_merge_rec_left(inode,
 						   path,
 						   handle, split_rec,
-						   dealloc,
+						   dealloc, et,
 						   split_index);
 			if (ret) {
 				mlog_errno(ret);
@@ -3214,7 +3338,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 			 * our leaf. Try to rotate it away.
 			 */
 			ret = ocfs2_rotate_tree_left(inode, handle, path,
-						     dealloc);
+						     dealloc, et);
 			if (ret)
 				mlog_errno(ret);
 			ret = 0;
@@ -3348,16 +3472,6 @@ rotate:
 	ocfs2_rotate_leaf(el, insert_rec);
 }
 
-static inline void ocfs2_update_dinode_clusters(struct inode *inode,
-						struct ocfs2_dinode *di,
-						u32 clusters)
-{
-	le32_add_cpu(&di->i_clusters, clusters);
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-}
-
 static void ocfs2_adjust_rightmost_records(struct inode *inode,
 					   handle_t *handle,
 					   struct ocfs2_path *path,
@@ -3559,8 +3673,8 @@ static void ocfs2_split_record(struct inode *inode,
 }
 
 /*
- * This function only does inserts on an allocation b-tree. For dinode
- * lists, ocfs2_insert_at_leaf() is called directly.
+ * This function only does inserts on an allocation b-tree. For tree
+ * depth = 0, ocfs2_insert_at_leaf() is called directly.
  *
  * right_path is the path we want to do the actual insert
  * in. left_path should only be passed in if we need to update that
@@ -3657,7 +3771,7 @@ out:
 
 static int ocfs2_do_insert_extent(struct inode *inode,
 				  handle_t *handle,
-				  struct buffer_head *di_bh,
+				  struct ocfs2_extent_tree *et,
 				  struct ocfs2_extent_rec *insert_rec,
 				  struct ocfs2_insert_type *type)
 {
@@ -3665,13 +3779,11 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 	u32 cpos;
 	struct ocfs2_path *right_path = NULL;
 	struct ocfs2_path *left_path = NULL;
-	struct ocfs2_dinode *di;
 	struct ocfs2_extent_list *el;
 
-	di = (struct ocfs2_dinode *) di_bh->b_data;
-	el = &di->id2.i_list;
+	el = et->root_el;
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
+	ret = ocfs2_journal_access(handle, inode, et->root_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
@@ -3683,7 +3795,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		goto out_update_clusters;
 	}
 
-	right_path = ocfs2_new_inode_path(di_bh);
+	right_path = ocfs2_new_path(et->root_bh, et->root_el);
 	if (!right_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -3733,7 +3845,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		 * ocfs2_rotate_tree_right() might have extended the
 		 * transaction without re-journaling our tree root.
 		 */
-		ret = ocfs2_journal_access(handle, inode, di_bh,
+		ret = ocfs2_journal_access(handle, inode, et->root_bh,
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
@@ -3758,10 +3870,10 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
 out_update_clusters:
 	if (type->ins_split == SPLIT_NONE)
-		ocfs2_update_dinode_clusters(inode, di,
-					     le16_to_cpu(insert_rec->e_leaf_clusters));
+		ocfs2_update_clusters(inode, et,
+				      le16_to_cpu(insert_rec->e_leaf_clusters));
 
-	ret = ocfs2_journal_dirty(handle, di_bh);
+	ret = ocfs2_journal_dirty(handle, et->root_bh);
 	if (ret)
 		mlog_errno(ret);
 
@@ -3915,8 +4027,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
  * ocfs2_figure_appending_type() will figure out whether we'll have to
  * insert at the tail of the rightmost leaf.
  *
- * This should also work against the dinode list for tree's with 0
- * depth. If we consider the dinode list to be the rightmost leaf node
+ * This should also work against the root extent list for trees with 0
+ * depth. If we consider the root extent list to be the rightmost leaf node
  * then the logic here makes sense.
  */
 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
@@ -3967,14 +4079,13 @@ set_tail_append:
  * structure.
  */
 static int ocfs2_figure_insert_type(struct inode *inode,
-				    struct buffer_head *di_bh,
+				    struct ocfs2_extent_tree *et,
 				    struct buffer_head **last_eb_bh,
 				    struct ocfs2_extent_rec *insert_rec,
 				    int *free_records,
 				    struct ocfs2_insert_type *insert)
 {
 	int ret;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_path *path = NULL;
@@ -3982,7 +4093,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 
 	insert->ins_split = SPLIT_NONE;
 
-	el = &di->id2.i_list;
+	el = et->root_el;
 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
 
 	if (el->l_tree_depth) {
@@ -3993,7 +4104,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		 * may want it later.
 		 */
 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_last_eb_blk), &bh,
+				       ocfs2_get_last_eb_blk(et), &bh,
 				       OCFS2_BH_CACHED, inode);
 		if (ret) {
 			mlog_exit(ret);
@@ -4020,7 +4131,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		return 0;
 	}
 
-	path = ocfs2_new_inode_path(di_bh);
+	path = ocfs2_new_path(et->root_bh, et->root_el);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4070,7 +4181,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 	 * the case that we're doing a tail append, so maybe we can
 	 * take advantage of that information somehow.
 	 */
-	if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
+	if (ocfs2_get_last_eb_blk(et) ==
+	    path_leaf_bh(path)->b_blocknr) {
 		/*
 		 * Ok, ocfs2_find_path() returned us the rightmost
 		 * tree path. This might be an appending insert. There are
@@ -4100,21 +4212,30 @@ out:
 int ocfs2_insert_extent(struct ocfs2_super *osb,
 			handle_t *handle,
 			struct inode *inode,
-			struct buffer_head *fe_bh,
+			struct buffer_head *root_bh,
 			u32 cpos,
 			u64 start_blk,
 			u32 new_clusters,
 			u8 flags,
-			struct ocfs2_alloc_context *meta_ac)
+			struct ocfs2_alloc_context *meta_ac,
+			enum ocfs2_extent_tree_type et_type)
 {
 	int status;
 	int uninitialized_var(free_records);
 	struct buffer_head *last_eb_bh = NULL;
 	struct ocfs2_insert_type insert = {0, };
 	struct ocfs2_extent_rec rec;
+	struct ocfs2_extent_tree *et = NULL;
 
 	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
 
+	et = ocfs2_new_extent_tree(root_bh, et_type);
+	if (!et) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
 	mlog(0, "add %u clusters at position %u to inode %llu\n",
 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
@@ -4132,7 +4253,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
 	rec.e_flags = flags;
 
-	status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
+	status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
 					  &free_records, &insert);
 	if (status < 0) {
 		mlog_errno(status);
@@ -4146,7 +4267,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	     free_records, insert.ins_tree_depth);
 
 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
-		status = ocfs2_grow_tree(inode, handle, fe_bh,
+		status = ocfs2_grow_tree(inode, handle, et,
 					 &insert.ins_tree_depth, &last_eb_bh,
 					 meta_ac);
 		if (status) {
@@ -4156,16 +4277,18 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	}
 
 	/* Finally, we can add clusters. This might rotate the tree for us. */
-	status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
+	status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
 	if (status < 0)
 		mlog_errno(status);
-	else
+	else if (et->type == OCFS2_DINODE_EXTENT)
 		ocfs2_extent_map_insert_rec(inode, &rec);
 
 bail:
 	if (last_eb_bh)
 		brelse(last_eb_bh);
 
+	if (et)
+		ocfs2_free_extent_tree(et);
 	mlog_exit(status);
 	return status;
 }
@@ -4193,7 +4316,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
 static int ocfs2_split_and_insert(struct inode *inode,
 				  handle_t *handle,
 				  struct ocfs2_path *path,
-				  struct buffer_head *di_bh,
+				  struct ocfs2_extent_tree *et,
 				  struct buffer_head **last_eb_bh,
 				  int split_index,
 				  struct ocfs2_extent_rec *orig_split_rec,
@@ -4207,7 +4330,6 @@ static int ocfs2_split_and_insert(struct inode *inode,
 	struct ocfs2_extent_rec split_rec = *orig_split_rec;
 	struct ocfs2_insert_type insert;
 	struct ocfs2_extent_block *eb;
-	struct ocfs2_dinode *di;
 
 leftright:
 	/*
@@ -4216,8 +4338,7 @@ leftright:
 	 */
 	rec = path_leaf_el(path)->l_recs[split_index];
 
-	di = (struct ocfs2_dinode *)di_bh->b_data;
-	rightmost_el = &di->id2.i_list;
+	rightmost_el = et->root_el;
 
 	depth = le16_to_cpu(rightmost_el->l_tree_depth);
 	if (depth) {
@@ -4228,8 +4349,8 @@ leftright:
 
 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
 	    le16_to_cpu(rightmost_el->l_count)) {
-		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
-				      meta_ac);
+		ret = ocfs2_grow_tree(inode, handle, et,
+				      &depth, last_eb_bh, meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4266,8 +4387,7 @@ leftright:
 		do_leftright = 1;
 	}
 
-	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
-				     &insert);
+	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4309,8 +4429,9 @@ out:
  * of the tree is required. All other cases will degrade into a less
  * optimal tree layout.
  *
- * last_eb_bh should be the rightmost leaf block for any inode with a
- * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
+ * last_eb_bh should be the rightmost leaf block for any extent
+ * btree. Since a split may grow the tree or a merge might shrink it,
+ * the caller cannot trust the contents of that buffer after this call.
  *
  * This code is optimized for readability - several passes might be
  * made over certain portions of the tree. All of those blocks will
@@ -4318,7 +4439,7 @@ out:
  * extra overhead is not expressed in terms of disk reads.
  */
 static int __ocfs2_mark_extent_written(struct inode *inode,
-				       struct buffer_head *di_bh,
+				       struct ocfs2_extent_tree *et,
 				       handle_t *handle,
 				       struct ocfs2_path *path,
 				       int split_index,
@@ -4358,10 +4479,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	 */
 	if (path->p_tree_depth) {
 		struct ocfs2_extent_block *eb;
-		struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_last_eb_blk),
+				       ocfs2_get_last_eb_blk(et),
 				       &last_eb_bh, OCFS2_BH_CACHED, inode);
 		if (ret) {
 			mlog_exit(ret);
@@ -4395,7 +4515,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 		if (ctxt.c_split_covers_rec)
 			el->l_recs[split_index] = *split_rec;
 		else
-			ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
+			ret = ocfs2_split_and_insert(inode, handle, path, et,
 						     &last_eb_bh, split_index,
 						     split_rec, meta_ac);
 		if (ret)
@@ -4403,7 +4523,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	} else {
 		ret = ocfs2_try_to_merge_extent(inode, handle, path,
 						split_index, split_rec,
-						dealloc, &ctxt);
+						dealloc, &ctxt, et);
 		if (ret)
 			mlog_errno(ret);
 	}
@@ -4421,16 +4541,18 @@ out:
  *
  * The caller is responsible for passing down meta_ac if we'll need it.
  */
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
-			      struct ocfs2_cached_dealloc_ctxt *dealloc)
+			      struct ocfs2_cached_dealloc_ctxt *dealloc,
+			      enum ocfs2_extent_tree_type et_type)
 {
 	int ret, index;
 	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
 	struct ocfs2_extent_rec split_rec;
 	struct ocfs2_path *left_path = NULL;
 	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_tree *et = NULL;
 
 	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
 	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
@@ -4444,13 +4566,21 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
 		goto out;
 	}
 
+	et = ocfs2_new_extent_tree(root_bh, et_type);
+	if (!et) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
 	/*
 	 * XXX: This should be fixed up so that we just re-insert the
 	 * next extent records.
 	 */
-	ocfs2_extent_map_trunc(inode, 0);
+	if (et_type == OCFS2_DINODE_EXTENT)
+		ocfs2_extent_map_trunc(inode, 0);
 
-	left_path = ocfs2_new_inode_path(di_bh);
+	left_path = ocfs2_new_path(et->root_bh, et->root_el);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4481,23 +4611,25 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
 	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
 	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
 
-	ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
-					  index, &split_rec, meta_ac, dealloc);
+	ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
+					  index, &split_rec, meta_ac,
+					  dealloc);
 	if (ret)
 		mlog_errno(ret);
 
 out:
 	ocfs2_free_path(left_path);
+	if (et)
+		ocfs2_free_extent_tree(et);
 	return ret;
 }
 
-static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
+static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
 			    handle_t *handle, struct ocfs2_path *path,
 			    int index, u32 new_range,
 			    struct ocfs2_alloc_context *meta_ac)
 {
 	int ret, depth, credits = handle->h_buffer_credits;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct buffer_head *last_eb_bh = NULL;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *rightmost_el, *el;
@@ -4515,7 +4647,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 	depth = path->p_tree_depth;
 	if (depth > 0) {
 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_last_eb_blk),
+				       ocfs2_get_last_eb_blk(et),
 				       &last_eb_bh, OCFS2_BH_CACHED, inode);
 		if (ret < 0) {
 			mlog_errno(ret);
@@ -4528,7 +4660,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 		rightmost_el = path_leaf_el(path);
 
 	credits += path->p_tree_depth +
-		   ocfs2_extend_meta_needed(&di->id2.i_list);
+		   ocfs2_extend_meta_needed(et->root_el);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -4537,7 +4669,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 
 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
 	    le16_to_cpu(rightmost_el->l_count)) {
-		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
+		ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
 				      meta_ac);
 		if (ret) {
 			mlog_errno(ret);
@@ -4551,7 +4683,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 	insert.ins_split = SPLIT_RIGHT;
 	insert.ins_tree_depth = depth;
 
-	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
+	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
 	if (ret)
 		mlog_errno(ret);
 
@@ -4563,7 +4695,8 @@ out:
 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 			      struct ocfs2_path *path, int index,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
-			      u32 cpos, u32 len)
+			      u32 cpos, u32 len,
+			      struct ocfs2_extent_tree *et)
 {
 	int ret;
 	u32 left_cpos, rec_range, trunc_range;
@@ -4575,7 +4708,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 	struct ocfs2_extent_block *eb;
 
 	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
-		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4706,7 +4839,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 
 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
 
-	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4717,20 +4850,29 @@ out:
 	return ret;
 }
 
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
-			struct ocfs2_cached_dealloc_ctxt *dealloc)
+			struct ocfs2_cached_dealloc_ctxt *dealloc,
+			enum ocfs2_extent_tree_type et_type)
 {
 	int ret, index;
 	u32 rec_range, trunc_range;
 	struct ocfs2_extent_rec *rec;
 	struct ocfs2_extent_list *el;
-	struct ocfs2_path *path;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_tree *et = NULL;
+
+	et = ocfs2_new_extent_tree(root_bh, et_type);
+	if (!et) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
 
 	ocfs2_extent_map_trunc(inode, 0);
 
-	path = ocfs2_new_inode_path(di_bh);
+	path = ocfs2_new_path(et->root_bh, et->root_el);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4783,13 +4925,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 
 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
-					 cpos, len);
+					 cpos, len, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	} else {
-		ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
+		ret = ocfs2_split_tree(inode, et, handle, path, index,
 				       trunc_range, meta_ac);
 		if (ret) {
 			mlog_errno(ret);
@@ -4838,7 +4980,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 		}
 
 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
-					 cpos, len);
+					 cpos, len, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4847,6 +4989,8 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 
 out:
 	ocfs2_free_path(path);
+	if (et)
+		ocfs2_free_extent_tree(et);
 	return ret;
 }
 
@@ -6355,7 +6499,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		 * the in-inode data from our pages.
 		 */
 		ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
-					  0, block, 1, 0, NULL);
+					  0, block, 1, 0,
+					  NULL, OCFS2_DINODE_EXTENT);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_commit;
@@ -6397,13 +6542,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 	handle_t *handle = NULL;
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct ocfs2_path *path = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
 
 	mlog_entry_void();
 
 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
 						     i_size_read(inode));
 
-	path = ocfs2_new_inode_path(fe_bh);
+	path = ocfs2_new_path(fe_bh, &di->id2.i_list);
 	if (!path) {
 		status = -ENOMEM;
 		mlog_errno(status);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index a0e334f10cd..473c8bcc62f 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -26,28 +26,37 @@
 #ifndef OCFS2_ALLOC_H
 #define OCFS2_ALLOC_H
 
+enum ocfs2_extent_tree_type {
+	OCFS2_DINODE_EXTENT = 0,
+};
+
 struct ocfs2_alloc_context;
 int ocfs2_insert_extent(struct ocfs2_super *osb,
 			handle_t *handle,
 			struct inode *inode,
-			struct buffer_head *fe_bh,
+			struct buffer_head *root_bh,
 			u32 cpos,
 			u64 start_blk,
 			u32 new_clusters,
 			u8 flags,
-			struct ocfs2_alloc_context *meta_ac);
+			struct ocfs2_alloc_context *meta_ac,
+			enum ocfs2_extent_tree_type et_type);
 struct ocfs2_cached_dealloc_ctxt;
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
-			      struct ocfs2_cached_dealloc_ctxt *dealloc);
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc,
+			      enum ocfs2_extent_tree_type et_type);
+int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
-			struct ocfs2_cached_dealloc_ctxt *dealloc);
+			struct ocfs2_cached_dealloc_ctxt *dealloc,
+			enum ocfs2_extent_tree_type et_type);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct buffer_head *bh);
+			   struct buffer_head *root_bh,
+			   enum ocfs2_extent_tree_type et_type);
+
 /*
  * how many new metadata chunks would an allocation need at maximum?
  *
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index bbe3f8b2d0e..44ea5eb3fdc 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1278,7 +1278,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 	} else if (unwritten) {
 		ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
 						wc->w_handle, cpos, 1, phys,
-						meta_ac, &wc->w_dealloc);
+						meta_ac, &wc->w_dealloc,
+						OCFS2_DINODE_EXTENT);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -1712,7 +1713,13 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		 * ocfs2_lock_allocators(). It greatly over-estimates
 		 * the work to be done.
 		 */
-		ret = ocfs2_lock_allocators(inode, wc->w_di_bh,
+		mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
+		     " clusters_to_add = %u, extents_to_split = %u\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+		     (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
+		     clusters_to_alloc, extents_to_split);
+
+		ret = ocfs2_lock_allocators(inode, wc->w_di_bh, &di->id2.i_list,
 					    clusters_to_alloc, extents_to_split,
 					    &data_ac, &meta_ac);
 		if (ret) {
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 126aa219c0c..ba0fb9e1626 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1306,7 +1306,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * related blocks have been journaled already.
 	 */
 	ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0,
-				  NULL);
+				  NULL, OCFS2_DINODE_EXTENT);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1338,7 +1338,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
 
 		ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno,
-					  len, 0, NULL);
+					  len, 0, NULL, OCFS2_DINODE_EXTENT);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_commit;
@@ -1481,7 +1481,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
 		num_free_extents = ocfs2_num_free_extents(osb, dir,
-							  parent_fe_bh);
+							  parent_fe_bh,
+							  OCFS2_DINODE_EXTENT);
 		if (num_free_extents < 0) {
 			status = num_free_extents;
 			mlog_errno(status);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a31bba6c557..f567cc53d9b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -521,7 +521,8 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	if (mark_unwritten)
 		flags = OCFS2_EXT_UNWRITTEN;
 
-	free_extents = ocfs2_num_free_extents(osb, inode, fe_bh);
+	free_extents = ocfs2_num_free_extents(osb, inode, fe_bh,
+					      OCFS2_DINODE_EXTENT);
 	if (free_extents < 0) {
 		status = free_extents;
 		mlog_errno(status);
@@ -570,7 +571,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
 				     *logical_offset, block, num_bits,
-				     flags, meta_ac);
+				     flags, meta_ac, OCFS2_DINODE_EXTENT);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -599,92 +600,6 @@ leave:
 	return status;
 }
 
-/*
- * For a given allocation, determine which allocators will need to be
- * accessed, and lock them, reserving the appropriate number of bits.
- *
- * Sparse file systems call this from ocfs2_write_begin_nolock()
- * and ocfs2_allocate_unwritten_extents().
- *
- * File systems which don't support holes call this from
- * ocfs2_extend_allocation().
- */
-int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *di_bh,
-			  u32 clusters_to_add, u32 extents_to_split,
-			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac)
-{
-	int ret = 0, num_free_extents;
-	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-
-	*meta_ac = NULL;
-	if (data_ac)
-		*data_ac = NULL;
-
-	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
-
-	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-	     "clusters_to_add = %u, extents_to_split = %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
-	     le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
-
-	num_free_extents = ocfs2_num_free_extents(osb, inode, di_bh);
-	if (num_free_extents < 0) {
-		ret = num_free_extents;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	/*
-	 * Sparse allocation file systems need to be more conservative
-	 * with reserving room for expansion - the actual allocation
-	 * happens while we've got a journal handle open so re-taking
-	 * a cluster lock (because we ran out of room for another
-	 * extent) will violate ordering rules.
-	 *
-	 * Most of the time we'll only be seeing this 1 cluster at a time
-	 * anyway.
-	 *
-	 * Always lock for any unwritten extents - we might want to
-	 * add blocks during a split.
-	 */
-	if (!num_free_extents ||
-	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
-		ret = ocfs2_reserve_new_metadata(osb, &di->id2.i_list, meta_ac);
-		if (ret < 0) {
-			if (ret != -ENOSPC)
-				mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	if (clusters_to_add == 0)
-		goto out;
-
-	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
-	if (ret < 0) {
-		if (ret != -ENOSPC)
-			mlog_errno(ret);
-		goto out;
-	}
-
-out:
-	if (ret) {
-		if (*meta_ac) {
-			ocfs2_free_alloc_context(*meta_ac);
-			*meta_ac = NULL;
-		}
-
-		/*
-		 * We cannot have an error and a non null *data_ac.
-		 */
-	}
-
-	return ret;
-}
-
 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 				     u32 clusters_to_add, int mark_unwritten)
 {
@@ -725,7 +640,13 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	status = ocfs2_lock_allocators(inode, bh, clusters_to_add, 0, &data_ac,
+	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
+	     "clusters_to_add = %u\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
+	     clusters_to_add);
+	status = ocfs2_lock_allocators(inode, bh, &fe->id2.i_list,
+				       clusters_to_add, 0, &data_ac,
 				       &meta_ac);
 	if (status) {
 		mlog_errno(status);
@@ -1397,7 +1318,8 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
-	ret = ocfs2_lock_allocators(inode, di_bh, 0, 1, NULL, &meta_ac);
+	ret = ocfs2_lock_allocators(inode, di_bh, &di->id2.i_list,
+				    0, 1, NULL, &meta_ac);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
@@ -1428,7 +1350,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	}
 
 	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
-				  dealloc);
+				  dealloc, OCFS2_DINODE_EXTENT);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index c96b8054fbe..18e5c80cc73 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -57,10 +57,6 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       enum ocfs2_alloc_restarted *reason_ret);
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
 			  u64 zero_to);
-int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *fe,
-			  u32 clusters_to_add, u32 extents_to_split,
-			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 2a817bca1dd..b642c825fb7 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1894,3 +1894,85 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
 		       (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
 	}
 }
+
+/*
+ * For a given allocation, determine which allocators will need to be
+ * accessed, and lock them, reserving the appropriate number of bits.
+ *
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
+ */
+int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *root_bh,
+			  struct ocfs2_extent_list *root_el,
+			  u32 clusters_to_add, u32 extents_to_split,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac)
+{
+	int ret = 0, num_free_extents;
+	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	*meta_ac = NULL;
+	if (data_ac)
+		*data_ac = NULL;
+
+	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
+
+	num_free_extents = ocfs2_num_free_extents(osb, inode, root_bh,
+						  OCFS2_DINODE_EXTENT);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Sparse allocation file systems need to be more conservative
+	 * with reserving room for expansion - the actual allocation
+	 * happens while we've got a journal handle open so re-taking
+	 * a cluster lock (because we ran out of room for another
+	 * extent) will violate ordering rules.
+	 *
+	 * Most of the time we'll only be seeing this 1 cluster at a time
+	 * anyway.
+	 *
+	 * Always lock for any unwritten extents - we might want to
+	 * add blocks during a split.
+	 */
+	if (!num_free_extents ||
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
+		ret = ocfs2_reserve_new_metadata(osb, root_el, meta_ac);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (clusters_to_add == 0)
+		goto out;
+
+	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+
+		/*
+		 * We cannot have an error and a non null *data_ac.
+		 */
+	}
+
+	return ret;
+}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 3f96c875bcf..a3e531e62df 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -162,4 +162,9 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 int ocfs2_check_group_descriptor(struct super_block *sb,
 				 struct ocfs2_dinode *di,
 				 struct ocfs2_group_desc *gd);
+int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *root_bh,
+			  struct ocfs2_extent_list *root_el,
+			  u32 clusters_to_add, u32 extents_to_split,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac);
 #endif /* _CHAINALLOC_H_ */
-- 
cgit v1.2.3


From 0eb8d47e69a2211a36643b180f1843ef45f6017d Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:45 +0800
Subject: ocfs2: Make high level btree extend code generic

Factor out the non-inode specifics of ocfs2_do_extend_allocation() into a more
generic function, ocfs2_add_clusters_in_btree(). ocfs2_do_extend_allocation()
is renamed to ocfs2_add_inode_data() and now calls
ocfs2_add_clusters_in_btree(), which can be used for other btree types as well.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 110 ++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/alloc.h |  17 +++++++
 fs/ocfs2/aops.c  |   8 ++--
 fs/ocfs2/dir.c   |   6 +--
 fs/ocfs2/file.c  | 136 +++++++++++--------------------------------------------
 fs/ocfs2/file.h  |  26 +++++------
 fs/ocfs2/namei.c |   8 ++--
 7 files changed, 176 insertions(+), 135 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 579659bae6c..cacfc118b71 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4293,6 +4293,116 @@ bail:
 	return status;
 }
 
+/*
+ * Allocate and add clusters into the extent b-tree.
+ * The new clusters(clusters_to_add) will be inserted at logical_offset.
+ * The extent b-tree's root is root_el and it should be in root_bh, and
+ * it is not limited to the file storage. Any extent tree can use this
+ * function if it implements the proper ocfs2_extent_tree.
+ */
+int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
+				struct inode *inode,
+				u32 *logical_offset,
+				u32 clusters_to_add,
+				int mark_unwritten,
+				struct buffer_head *root_bh,
+				struct ocfs2_extent_list *root_el,
+				handle_t *handle,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_alloc_context *meta_ac,
+				enum ocfs2_alloc_restarted *reason_ret,
+				enum ocfs2_extent_tree_type type)
+{
+	int status = 0;
+	int free_extents;
+	enum ocfs2_alloc_restarted reason = RESTART_NONE;
+	u32 bit_off, num_bits;
+	u64 block;
+	u8 flags = 0;
+
+	BUG_ON(!clusters_to_add);
+
+	if (mark_unwritten)
+		flags = OCFS2_EXT_UNWRITTEN;
+
+	free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type);
+	if (free_extents < 0) {
+		status = free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* there are two cases which could cause us to EAGAIN in the
+	 * we-need-more-metadata case:
+	 * 1) we haven't reserved *any*
+	 * 2) we are so fragmented, we've needed to add metadata too
+	 *    many times. */
+	if (!free_extents && !meta_ac) {
+		mlog(0, "we haven't reserved any metadata!\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	} else if ((!free_extents)
+		   && (ocfs2_alloc_context_bits_left(meta_ac)
+		       < ocfs2_extend_meta_needed(root_el))) {
+		mlog(0, "filesystem is really fragmented...\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	}
+
+	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+					clusters_to_add, &bit_off, &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > clusters_to_add);
+
+	/* reserve our write early -- insert_extent may update the inode */
+	status = ocfs2_journal_access(handle, inode, root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
+	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
+				     *logical_offset, block, num_bits,
+				     flags, meta_ac, type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_dirty(handle, root_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	clusters_to_add -= num_bits;
+	*logical_offset += num_bits;
+
+	if (clusters_to_add) {
+		mlog(0, "need to alloc once more, wanted = %u\n",
+		     clusters_to_add);
+		status = -EAGAIN;
+		reason = RESTART_TRANS;
+	}
+
+leave:
+	mlog_exit(status);
+	if (reason_ret)
+		*reason_ret = reason;
+	return status;
+}
+
 static void ocfs2_make_right_split_rec(struct super_block *sb,
 				       struct ocfs2_extent_rec *split_rec,
 				       u32 cpos,
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 473c8bcc62f..5e090c5d849 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -41,6 +41,23 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			u8 flags,
 			struct ocfs2_alloc_context *meta_ac,
 			enum ocfs2_extent_tree_type et_type);
+enum ocfs2_alloc_restarted {
+	RESTART_NONE = 0,
+	RESTART_TRANS,
+	RESTART_META
+};
+int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
+				struct inode *inode,
+				u32 *logical_offset,
+				u32 clusters_to_add,
+				int mark_unwritten,
+				struct buffer_head *root_bh,
+				struct ocfs2_extent_list *root_el,
+				handle_t *handle,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_alloc_context *meta_ac,
+				enum ocfs2_alloc_restarted *reason_ret,
+				enum ocfs2_extent_tree_type type);
 struct ocfs2_cached_dealloc_ctxt;
 int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 44ea5eb3fdc..e7acd286790 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1255,10 +1255,10 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		 * any additional semaphores or cluster locks.
 		 */
 		tmp_pos = cpos;
-		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
-						 &tmp_pos, 1, 0, wc->w_di_bh,
-						 wc->w_handle, data_ac,
-						 meta_ac, NULL);
+		ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
+					   &tmp_pos, 1, 0, wc->w_di_bh,
+					   wc->w_handle, data_ac,
+					   meta_ac, NULL);
 		/*
 		 * This shouldn't happen because we must have already
 		 * calculated the correct meta data allocation required. The
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index ba0fb9e1626..d17c34b0ac6 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1383,9 +1383,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 	if (extend) {
 		u32 offset = OCFS2_I(dir)->ip_clusters;
 
-		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
-						    1, 0, parent_fe_bh, handle,
-						    data_ac, meta_ac, NULL);
+		status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
+					      1, 0, parent_fe_bh, handle,
+					      data_ac, meta_ac, NULL);
 		BUG_ON(status == -EAGAIN);
 		if (status < 0) {
 			mlog_errno(status);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f567cc53d9b..7bb4fde7005 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -488,7 +488,7 @@ bail:
 }
 
 /*
- * extend allocation only here.
+ * extend file allocation only here.
  * we'll update all the disk stuff, and oip->alloc_size
  *
  * expect stuff to be locked, a transaction started and enough data /
@@ -497,107 +497,25 @@ bail:
  * Will return -EAGAIN, and a reason if a restart is needed.
  * If passed in, *reason will always be set, even in error.
  */
-int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
-			       struct inode *inode,
-			       u32 *logical_offset,
-			       u32 clusters_to_add,
-			       int mark_unwritten,
-			       struct buffer_head *fe_bh,
-			       handle_t *handle,
-			       struct ocfs2_alloc_context *data_ac,
-			       struct ocfs2_alloc_context *meta_ac,
-			       enum ocfs2_alloc_restarted *reason_ret)
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+			 struct inode *inode,
+			 u32 *logical_offset,
+			 u32 clusters_to_add,
+			 int mark_unwritten,
+			 struct buffer_head *fe_bh,
+			 handle_t *handle,
+			 struct ocfs2_alloc_context *data_ac,
+			 struct ocfs2_alloc_context *meta_ac,
+			 enum ocfs2_alloc_restarted *reason_ret)
 {
-	int status = 0;
-	int free_extents;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	enum ocfs2_alloc_restarted reason = RESTART_NONE;
-	u32 bit_off, num_bits;
-	u64 block;
-	u8 flags = 0;
-
-	BUG_ON(!clusters_to_add);
-
-	if (mark_unwritten)
-		flags = OCFS2_EXT_UNWRITTEN;
-
-	free_extents = ocfs2_num_free_extents(osb, inode, fe_bh,
-					      OCFS2_DINODE_EXTENT);
-	if (free_extents < 0) {
-		status = free_extents;
-		mlog_errno(status);
-		goto leave;
-	}
-
-	/* there are two cases which could cause us to EAGAIN in the
-	 * we-need-more-metadata case:
-	 * 1) we haven't reserved *any*
-	 * 2) we are so fragmented, we've needed to add metadata too
-	 *    many times. */
-	if (!free_extents && !meta_ac) {
-		mlog(0, "we haven't reserved any metadata!\n");
-		status = -EAGAIN;
-		reason = RESTART_META;
-		goto leave;
-	} else if ((!free_extents)
-		   && (ocfs2_alloc_context_bits_left(meta_ac)
-		       < ocfs2_extend_meta_needed(&fe->id2.i_list))) {
-		mlog(0, "filesystem is really fragmented...\n");
-		status = -EAGAIN;
-		reason = RESTART_META;
-		goto leave;
-	}
+	struct ocfs2_extent_list *el = &fe->id2.i_list;
 
-	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
-					clusters_to_add, &bit_off, &num_bits);
-	if (status < 0) {
-		if (status != -ENOSPC)
-			mlog_errno(status);
-		goto leave;
-	}
-
-	BUG_ON(num_bits > clusters_to_add);
-
-	/* reserve our write early -- insert_extent may update the inode */
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
-	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
-	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
-				     *logical_offset, block, num_bits,
-				     flags, meta_ac, OCFS2_DINODE_EXTENT);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	clusters_to_add -= num_bits;
-	*logical_offset += num_bits;
-
-	if (clusters_to_add) {
-		mlog(0, "need to alloc once more, clusters = %u, wanted = "
-		     "%u\n", fe->i_clusters, clusters_to_add);
-		status = -EAGAIN;
-		reason = RESTART_TRANS;
-	}
-
-leave:
-	mlog_exit(status);
-	if (reason_ret)
-		*reason_ret = reason;
-	return status;
+	return ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
+					   clusters_to_add, mark_unwritten,
+					   fe_bh, el, handle,
+					   data_ac, meta_ac, reason_ret,
+					   OCFS2_DINODE_EXTENT);
 }
 
 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
@@ -676,16 +594,16 @@ restarted_transaction:
 
 	prev_clusters = OCFS2_I(inode)->ip_clusters;
 
-	status = ocfs2_do_extend_allocation(osb,
-					    inode,
-					    &logical_start,
-					    clusters_to_add,
-					    mark_unwritten,
-					    bh,
-					    handle,
-					    data_ac,
-					    meta_ac,
-					    &why);
+	status = ocfs2_add_inode_data(osb,
+				      inode,
+				      &logical_start,
+				      clusters_to_add,
+				      mark_unwritten,
+				      bh,
+				      handle,
+				      data_ac,
+				      meta_ac,
+				      &why);
 	if ((status < 0) && (status != -EAGAIN)) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 18e5c80cc73..e92382cbca5 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -33,6 +33,7 @@ extern const struct file_operations ocfs2_dops_no_plocks;
 extern const struct inode_operations ocfs2_file_iops;
 extern const struct inode_operations ocfs2_special_file_iops;
 struct ocfs2_alloc_context;
+enum ocfs2_alloc_restarted;
 
 struct ocfs2_file_private {
 	struct file		*fp_file;
@@ -40,21 +41,16 @@ struct ocfs2_file_private {
 	struct ocfs2_lock_res	fp_flock;
 };
 
-enum ocfs2_alloc_restarted {
-	RESTART_NONE = 0,
-	RESTART_TRANS,
-	RESTART_META
-};
-int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
-			       struct inode *inode,
-			       u32 *logical_offset,
-			       u32 clusters_to_add,
-			       int mark_unwritten,
-			       struct buffer_head *fe_bh,
-			       handle_t *handle,
-			       struct ocfs2_alloc_context *data_ac,
-			       struct ocfs2_alloc_context *meta_ac,
-			       enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+			 struct inode *inode,
+			 u32 *logical_offset,
+			 u32 clusters_to_add,
+			 int mark_unwritten,
+			 struct buffer_head *fe_bh,
+			 handle_t *handle,
+			 struct ocfs2_alloc_context *data_ac,
+			 struct ocfs2_alloc_context *meta_ac,
+			 enum ocfs2_alloc_restarted *reason_ret);
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
 			  u64 zero_to);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d5d808fe014..2cd6f501755 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1598,10 +1598,10 @@ static int ocfs2_symlink(struct inode *dir,
 		u32 offset = 0;
 
 		inode->i_op = &ocfs2_symlink_inode_operations;
-		status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
-						    new_fe_bh,
-						    handle, data_ac, NULL,
-						    NULL);
+		status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
+					      new_fe_bh,
+					      handle, data_ac, NULL,
+					      NULL);
 		if (status < 0) {
 			if (status != -ENOSPC && status != -EINTR) {
 				mlog(ML_ERROR,
-- 
cgit v1.2.3


From 5a7bc8eb29b8c759df374d97b6189e03d4ea71c5 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:46 +0800
Subject: ocfs2: Add the basic xattr disk layout in ocfs2_fs.h

Ocfs2 uses a very flexible structure for storing extended attributes on
disk. A small number of attributes are stored directly in the inode block - up
to 256 bytes worth. If that fills up, attributes are also stored in an
external block, linked to from the inode block. That block can in turn
expand to a btree, capable of storing large numbers of attributes.

Individual attribute values are stored inline if they're small enough
(currently about 80 bytes, this can be changed though), and otherwise are
expanded to a btree. The theoretical limit to the size of an individual
attribute is about the same as an inode, though the kernel's upper bound on
the size of an attribute's data is far smaller.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2_fs.h | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 4f619850ccf..1b46505e1e3 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -64,6 +64,7 @@
 #define OCFS2_INODE_SIGNATURE		"INODE01"
 #define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
 #define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
+#define OCFS2_XATTR_BLOCK_SIGNATURE	"XATTR01"
 
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
@@ -715,6 +716,123 @@ struct ocfs2_group_desc
 /*40*/	__u8    bg_bitmap[0];
 };
 
+/*
+ * On disk extended attribute structure for OCFS2.
+ */
+
+/*
+ * ocfs2_xattr_entry describes one extended attribute.
+ *
+ * Note that it can be stored in inode, one block or one xattr bucket.
+ */
+struct ocfs2_xattr_entry {
+	__le32	xe_name_hash;    /* hash value of xattr prefix+suffix. */
+	__le16	xe_name_offset;  /* byte offset from the 1st entry in the
+				    local xattr storage(inode, xattr block or
+				    xattr bucket). */
+	__u8	xe_name_len;	 /* xattr name len, doesn't include prefix. */
+	__u8	xe_type;         /* the low 7 bits indicate the name prefix's
+				  * type and the highest bit indicates whether
+				  * the EA is stored in the local storage. */
+	__le64	xe_value_size;	 /* real xattr value length. */
+};
+
+/*
+ * On disk structure for xattr header.
+ *
+ * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records are in
+ * the local xattr storage.
+ */
+struct ocfs2_xattr_header {
+	__le16	xh_count;                       /* contains the count of how
+						   many records are in the
+						   local xattr storage. */
+	__le16	xh_reserved1;
+	__le32	xh_reserved2;
+	__le64  xh_csum;
+	struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
+};
+
+/*
+ * On disk structure for xattr value root.
+ *
+ * It is used when one extended attribute's size is large, and we will save it
+ * in an outside cluster. It will be stored in a b-tree like file content.
+ */
+struct ocfs2_xattr_value_root {
+/*00*/	__le32	xr_clusters;              /* clusters covered by xattr value. */
+	__le32	xr_reserved0;
+	__le64	xr_last_eb_blk;           /* Pointer to last extent block */
+/*10*/	struct ocfs2_extent_list xr_list; /* Extent record list */
+};
+
+/*
+ * On disk structure for xattr tree root.
+ *
+ * It is used when there are too many extended attributes for one file. These
+ * attributes will be organized and stored in an indexed-btree.
+ */
+struct ocfs2_xattr_tree_root {
+/*00*/	__le32	xt_clusters;              /* clusters covered by xattr. */
+	__le32	xt_reserved0;
+	__le64	xt_last_eb_blk;           /* Pointer to last extent block */
+/*10*/	struct ocfs2_extent_list xt_list; /* Extent record list */
+};
+
+#define OCFS2_XATTR_INDEXED 0x1
+
+/*
+ * On disk structure for xattr block.
+ */
+struct ocfs2_xattr_block {
+/*00*/	__u8	xb_signature[8];     /* Signature for verification */
+	__le16	xb_suballoc_slot;    /* Slot suballocator this
+					block belongs to. */
+	__le16	xb_suballoc_bit;     /* Bit offset in suballocator
+					block group */
+	__le32	xb_fs_generation;    /* Must match super block */
+/*10*/	__le64	xb_blkno;            /* Offset on disk, in blocks */
+	__le64	xb_csum;
+/*20*/	__le16	xb_flags;            /* Indicates whether this block contains
+					real xattr or a xattr tree. */
+	__le16	xb_reserved0;
+	__le32  xb_reserved1;
+	__le64	xb_reserved2;
+/*30*/	union {
+		struct ocfs2_xattr_header xb_header; /* xattr header if this
+							block contains xattr */
+		struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this
+							block contains xattr
+							tree. */
+	} xb_attrs;
+};
+
+#define OCFS2_XATTR_ENTRY_LOCAL		0x80
+#define OCFS2_XATTR_TYPE_MASK		0x7F
+static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe,
+					 int local)
+{
+	if (local)
+		xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL;
+	else
+		xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe)
+{
+	return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type)
+{
+	xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK;
+}
+
+static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
+{
+	return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
+}
+
 #ifdef __KERNEL__
 static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 {
-- 
cgit v1.2.3


From ac11c827192272eabb68b8f4cf844066461d9690 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:47 +0800
Subject: ocfs2: Add helper function in uptodate.c for removing xattr clusters

The old uptodate code only handles removing one buffer_head from an
ocfs2 inode's buffer cache. With xattr clusters, we may need to remove
multiple buffer_heads at a time.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/uptodate.c | 32 ++++++++++++++++++++++++++------
 fs/ocfs2/uptodate.h |  3 +++
 2 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 4da8851f2b2..e26459e7d55 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -511,14 +511,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
 	ci->ci_num_cached--;
 }
 
-/* Called when we remove a chunk of metadata from an inode. We don't
- * bother reverting things to an inlined array in the case of a remove
- * which moves us back under the limit. */
-void ocfs2_remove_from_cache(struct inode *inode,
-			     struct buffer_head *bh)
+static void ocfs2_remove_block_from_cache(struct inode *inode,
+					  sector_t block)
 {
 	int index;
-	sector_t block = bh->b_blocknr;
 	struct ocfs2_meta_cache_item *item = NULL;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
@@ -544,6 +540,30 @@ void ocfs2_remove_from_cache(struct inode *inode,
 		kmem_cache_free(ocfs2_uptodate_cachep, item);
 }
 
+/*
+ * Called when we remove a chunk of metadata from an inode. We don't
+ * bother reverting things to an inlined array in the case of a remove
+ * which moves us back under the limit.
+ */
+void ocfs2_remove_from_cache(struct inode *inode,
+			     struct buffer_head *bh)
+{
+	sector_t block = bh->b_blocknr;
+
+	ocfs2_remove_block_from_cache(inode, block);
+}
+
+/* Called when we remove xattr clusters from an inode. */
+void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+					    sector_t block,
+					    u32 c_len)
+{
+	u64 i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
+
+	for (i = 0; i < b_len; i++, block++)
+		ocfs2_remove_block_from_cache(inode, block);
+}
+
 int __init init_ocfs2_uptodate_cache(void)
 {
 	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 2e73206059a..531b4b3a0c4 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
 				   struct buffer_head *bh);
 void ocfs2_remove_from_cache(struct inode *inode,
 			     struct buffer_head *bh);
+void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+					    sector_t block,
+					    u32 c_len);
 int ocfs2_buffer_read_ahead(struct inode *inode,
 			    struct buffer_head *bh);
 
-- 
cgit v1.2.3


From f56654c435c06f2b2bd5751889b1a08a3add7d6c Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:48 +0800
Subject: ocfs2: Add extent tree operation for xattr value btrees

Add some thin wrappers around ocfs2_insert_extent() for each of the 3
different btree types, ocfs2_dinode_insert_extent(),
ocfs2_xattr_value_insert_extent() and ocfs2_xattr_tree_insert_extent(). The
last is for the xattr index btree, which will be used in a followup patch.

All the old callers in file.c etc. will call ocfs2_dinode_insert_extent(),
while the other two handle the xattr cases. The initialization of the extent
tree is also handled by these functions.

When storing an xattr value which is too large, we will allocate some clusters
for it, and here ocfs2_extent_list and ocfs2_extent_rec will also be used. In
order to re-use the b-tree operation code, a new member named "private"
is added to ocfs2_extent_tree and it is used to indicate the root of the
ocfs2_extent_list. The reason is that we can't deduce the root from the
buffer_head now. It may be in an inode, an ocfs2_xattr_block or even worse,
in any place in an ocfs2_xattr_bucket.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/Makefile          |   3 +-
 fs/ocfs2/alloc.c           | 184 +++++++++++++++++++++------
 fs/ocfs2/alloc.h           |  42 ++++---
 fs/ocfs2/aops.c            |   5 +-
 fs/ocfs2/cluster/masklog.c |   1 +
 fs/ocfs2/cluster/masklog.h |   1 +
 fs/ocfs2/dir.c             |  11 +-
 fs/ocfs2/extent_map.c      |  60 +++++++++
 fs/ocfs2/extent_map.h      |   4 +
 fs/ocfs2/file.c            |   9 +-
 fs/ocfs2/suballoc.c        |   5 +-
 fs/ocfs2/suballoc.h        |   3 +-
 fs/ocfs2/xattr.c           | 305 +++++++++++++++++++++++++++++++++++++++++++++
 13 files changed, 569 insertions(+), 64 deletions(-)
 create mode 100644 fs/ocfs2/xattr.c

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f6956de56fd..af63980319c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -34,7 +34,8 @@ ocfs2-objs := \
 	symlink.o 		\
 	sysfile.o 		\
 	uptodate.o		\
-	ver.o
+	ver.o			\
+	xattr.o			\
 
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index cacfc118b71..e45421fee20 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -78,6 +78,7 @@ struct ocfs2_extent_tree {
 	struct ocfs2_extent_tree_operations *eops;
 	struct buffer_head *root_bh;
 	struct ocfs2_extent_list *root_el;
+	void *private;
 };
 
 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
@@ -136,9 +137,50 @@ static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
 	.sanity_check		= ocfs2_dinode_sanity_check,
 };
 
+static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					      u64 blkno)
+{
+	struct ocfs2_xattr_value_root *xv =
+		(struct ocfs2_xattr_value_root *)et->private;
+
+	xv->xr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_value_root *xv =
+		(struct ocfs2_xattr_value_root *) et->private;
+
+	return le64_to_cpu(xv->xr_last_eb_blk);
+}
+
+static void ocfs2_xattr_value_update_clusters(struct inode *inode,
+					      struct ocfs2_extent_tree *et,
+					      u32 clusters)
+{
+	struct ocfs2_xattr_value_root *xv =
+		(struct ocfs2_xattr_value_root *)et->private;
+
+	le32_add_cpu(&xv->xr_clusters, clusters);
+}
+
+static int ocfs2_xattr_value_sanity_check(struct inode *inode,
+					  struct ocfs2_extent_tree *et)
+{
+	return 0;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = {
+	.set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
+	.get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
+	.update_clusters	= ocfs2_xattr_value_update_clusters,
+	.sanity_check		= ocfs2_xattr_value_sanity_check,
+};
+
 static struct ocfs2_extent_tree*
 	 ocfs2_new_extent_tree(struct buffer_head *bh,
-			       enum ocfs2_extent_tree_type et_type)
+			       enum ocfs2_extent_tree_type et_type,
+			       void *private)
 {
 	struct ocfs2_extent_tree *et;
 
@@ -149,12 +191,16 @@ static struct ocfs2_extent_tree*
 	et->type = et_type;
 	get_bh(bh);
 	et->root_bh = bh;
+	et->private = private;
 
-	/* current we only support dinode extent. */
-	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
 	if (et_type == OCFS2_DINODE_EXTENT) {
 		et->root_el = &((struct ocfs2_dinode *)bh->b_data)->id2.i_list;
 		et->eops = &ocfs2_dinode_et_ops;
+	} else if (et_type == OCFS2_XATTR_VALUE_EXTENT) {
+		struct ocfs2_xattr_value_root *xv =
+			(struct ocfs2_xattr_value_root *) private;
+		et->root_el = &xv->xr_list;
+		et->eops = &ocfs2_xattr_et_ops;
 	}
 
 	return et;
@@ -495,7 +541,8 @@ struct ocfs2_merge_ctxt {
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *root_bh,
-			   enum ocfs2_extent_tree_type type)
+			   enum ocfs2_extent_tree_type type,
+			   void *private)
 {
 	int retval;
 	struct ocfs2_extent_list *el = NULL;
@@ -517,6 +564,12 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 		if (fe->i_last_eb_blk)
 			last_eb_blk = le64_to_cpu(fe->i_last_eb_blk);
 		el = &fe->id2.i_list;
+	} else if (type == OCFS2_XATTR_VALUE_EXTENT) {
+		struct ocfs2_xattr_value_root *xv =
+			(struct ocfs2_xattr_value_root *) private;
+
+		last_eb_blk = le64_to_cpu(xv->xr_last_eb_blk);
+		el = &xv->xr_list;
 	}
 
 	if (last_eb_blk) {
@@ -4209,33 +4262,25 @@ out:
  *
  * The caller needs to update fe->i_clusters
  */
-int ocfs2_insert_extent(struct ocfs2_super *osb,
-			handle_t *handle,
-			struct inode *inode,
-			struct buffer_head *root_bh,
-			u32 cpos,
-			u64 start_blk,
-			u32 new_clusters,
-			u8 flags,
-			struct ocfs2_alloc_context *meta_ac,
-			enum ocfs2_extent_tree_type et_type)
+static int ocfs2_insert_extent(struct ocfs2_super *osb,
+			       handle_t *handle,
+			       struct inode *inode,
+			       struct buffer_head *root_bh,
+			       u32 cpos,
+			       u64 start_blk,
+			       u32 new_clusters,
+			       u8 flags,
+			       struct ocfs2_alloc_context *meta_ac,
+			       struct ocfs2_extent_tree *et)
 {
 	int status;
 	int uninitialized_var(free_records);
 	struct buffer_head *last_eb_bh = NULL;
 	struct ocfs2_insert_type insert = {0, };
 	struct ocfs2_extent_rec rec;
-	struct ocfs2_extent_tree *et = NULL;
 
 	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
 
-	et = ocfs2_new_extent_tree(root_bh, et_type);
-	if (!et) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto bail;
-	}
-
 	mlog(0, "add %u clusters at position %u to inode %llu\n",
 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
@@ -4287,9 +4332,68 @@ bail:
 	if (last_eb_bh)
 		brelse(last_eb_bh);
 
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
+			       handle_t *handle,
+			       struct inode *inode,
+			       struct buffer_head *root_bh,
+			       u32 cpos,
+			       u64 start_blk,
+			       u32 new_clusters,
+			       u8 flags,
+			       struct ocfs2_alloc_context *meta_ac)
+{
+	int status;
+	struct ocfs2_extent_tree *et = NULL;
+
+	et = ocfs2_new_extent_tree(root_bh, OCFS2_DINODE_EXTENT, NULL);
+	if (!et) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
+				     cpos, start_blk, new_clusters,
+				     flags, meta_ac, et);
+
 	if (et)
 		ocfs2_free_extent_tree(et);
-	mlog_exit(status);
+bail:
+	return status;
+}
+
+int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
+				    handle_t *handle,
+				    struct inode *inode,
+				    struct buffer_head *root_bh,
+				    u32 cpos,
+				    u64 start_blk,
+				    u32 new_clusters,
+				    u8 flags,
+				    struct ocfs2_alloc_context *meta_ac,
+				    void *private)
+{
+	int status;
+	struct ocfs2_extent_tree *et = NULL;
+
+	et = ocfs2_new_extent_tree(root_bh, OCFS2_XATTR_VALUE_EXTENT, private);
+	if (!et) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
+				     cpos, start_blk, new_clusters,
+				     flags, meta_ac, et);
+
+	if (et)
+		ocfs2_free_extent_tree(et);
+bail:
 	return status;
 }
 
@@ -4311,7 +4415,8 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 				struct ocfs2_alloc_context *data_ac,
 				struct ocfs2_alloc_context *meta_ac,
 				enum ocfs2_alloc_restarted *reason_ret,
-				enum ocfs2_extent_tree_type type)
+				enum ocfs2_extent_tree_type type,
+				void *private)
 {
 	int status = 0;
 	int free_extents;
@@ -4325,7 +4430,8 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 	if (mark_unwritten)
 		flags = OCFS2_EXT_UNWRITTEN;
 
-	free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type);
+	free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type,
+					      private);
 	if (free_extents < 0) {
 		status = free_extents;
 		mlog_errno(status);
@@ -4372,9 +4478,16 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
-	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
-				     *logical_offset, block, num_bits,
-				     flags, meta_ac, type);
+	if (type == OCFS2_DINODE_EXTENT)
+		status = ocfs2_dinode_insert_extent(osb, handle, inode, root_bh,
+						    *logical_offset, block,
+						    num_bits, flags, meta_ac);
+	else
+		status = ocfs2_xattr_value_insert_extent(osb, handle,
+							 inode, root_bh,
+							 *logical_offset,
+							 block, num_bits, flags,
+							 meta_ac, private);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -4655,7 +4768,8 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
-			      enum ocfs2_extent_tree_type et_type)
+			      enum ocfs2_extent_tree_type et_type,
+			      void *private)
 {
 	int ret, index;
 	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
@@ -4676,7 +4790,7 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 		goto out;
 	}
 
-	et = ocfs2_new_extent_tree(root_bh, et_type);
+	et = ocfs2_new_extent_tree(root_bh, et_type, private);
 	if (!et) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4964,7 +5078,8 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_cached_dealloc_ctxt *dealloc,
-			enum ocfs2_extent_tree_type et_type)
+			enum ocfs2_extent_tree_type et_type,
+			void *private)
 {
 	int ret, index;
 	u32 rec_range, trunc_range;
@@ -4973,7 +5088,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 	struct ocfs2_path *path = NULL;
 	struct ocfs2_extent_tree *et = NULL;
 
-	et = ocfs2_new_extent_tree(root_bh, et_type);
+	et = ocfs2_new_extent_tree(root_bh, et_type, private);
 	if (!et) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -6608,9 +6723,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		 * this proves to be false, we could always re-build
 		 * the in-inode data from our pages.
 		 */
-		ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
-					  0, block, 1, 0,
-					  NULL, OCFS2_DINODE_EXTENT);
+		ret = ocfs2_dinode_insert_extent(osb, handle, inode, di_bh,
+						 0, block, 1, 0, NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 5e090c5d849..ec7baeb2ea7 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -28,19 +28,29 @@
 
 enum ocfs2_extent_tree_type {
 	OCFS2_DINODE_EXTENT = 0,
+	OCFS2_XATTR_VALUE_EXTENT,
 };
 
 struct ocfs2_alloc_context;
-int ocfs2_insert_extent(struct ocfs2_super *osb,
-			handle_t *handle,
-			struct inode *inode,
-			struct buffer_head *root_bh,
-			u32 cpos,
-			u64 start_blk,
-			u32 new_clusters,
-			u8 flags,
-			struct ocfs2_alloc_context *meta_ac,
-			enum ocfs2_extent_tree_type et_type);
+int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
+			       handle_t *handle,
+			       struct inode *inode,
+			       struct buffer_head *root_bh,
+			       u32 cpos,
+			       u64 start_blk,
+			       u32 new_clusters,
+			       u8 flags,
+			       struct ocfs2_alloc_context *meta_ac);
+int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
+				    handle_t *handle,
+				    struct inode *inode,
+				    struct buffer_head *root_bh,
+				    u32 cpos,
+				    u64 start_blk,
+				    u32 new_clusters,
+				    u8 flags,
+				    struct ocfs2_alloc_context *meta_ac,
+				    void *private);
 enum ocfs2_alloc_restarted {
 	RESTART_NONE = 0,
 	RESTART_TRANS,
@@ -57,22 +67,26 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 				struct ocfs2_alloc_context *data_ac,
 				struct ocfs2_alloc_context *meta_ac,
 				enum ocfs2_alloc_restarted *reason_ret,
-				enum ocfs2_extent_tree_type type);
+				enum ocfs2_extent_tree_type type,
+				void *private);
 struct ocfs2_cached_dealloc_ctxt;
 int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
-			      enum ocfs2_extent_tree_type et_type);
+			      enum ocfs2_extent_tree_type et_type,
+			      void *private);
 int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_cached_dealloc_ctxt *dealloc,
-			enum ocfs2_extent_tree_type et_type);
+			enum ocfs2_extent_tree_type et_type,
+			void *private);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *root_bh,
-			   enum ocfs2_extent_tree_type et_type);
+			   enum ocfs2_extent_tree_type et_type,
+			   void *private);
 
 /*
  * how many new metadata chunks would an allocation need at maximum?
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e7acd286790..530b1ff599c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1279,7 +1279,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
 						wc->w_handle, cpos, 1, phys,
 						meta_ac, &wc->w_dealloc,
-						OCFS2_DINODE_EXTENT);
+						OCFS2_DINODE_EXTENT, NULL);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -1721,7 +1721,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
 		ret = ocfs2_lock_allocators(inode, wc->w_di_bh, &di->id2.i_list,
 					    clusters_to_alloc, extents_to_split,
-					    &data_ac, &meta_ac);
+					    &data_ac, &meta_ac,
+					    OCFS2_DINODE_EXTENT, NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 23c732f2752..d8a0cb92cef 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(CONN),
 	define_mask(QUORUM),
 	define_mask(EXPORT),
+	define_mask(XATTR),
 	define_mask(ERROR),
 	define_mask(NOTICE),
 	define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 597e064bb94..57670c68047 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -112,6 +112,7 @@
 #define ML_CONN		0x0000000004000000ULL /* net connection management */
 #define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
+#define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index d17c34b0ac6..5426a02c12b 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1305,8 +1305,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * This should never fail as our extent list is empty and all
 	 * related blocks have been journaled already.
 	 */
-	ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0,
-				  NULL, OCFS2_DINODE_EXTENT);
+	ret = ocfs2_dinode_insert_extent(osb, handle, dir, di_bh, 0, blkno,
+					 len, 0, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1337,8 +1337,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		}
 		blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
 
-		ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno,
-					  len, 0, NULL, OCFS2_DINODE_EXTENT);
+		ret = ocfs2_dinode_insert_extent(osb, handle, dir, di_bh, 1,
+						 blkno, len, 0, NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_commit;
@@ -1482,7 +1482,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
 		num_free_extents = ocfs2_num_free_extents(osb, dir,
 							  parent_fe_bh,
-							  OCFS2_DINODE_EXTENT);
+							  OCFS2_DINODE_EXTENT,
+							  NULL);
 		if (num_free_extents < 0) {
 			status = num_free_extents;
 			mlog_errno(status);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index aed268e80b4..a7b1cfa735b 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -551,6 +551,66 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb,
 		*num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
 }
 
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+			     u32 *p_cluster, u32 *num_clusters,
+			     struct ocfs2_extent_list *el)
+{
+	int ret = 0, i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec;
+	u32 coff;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "xattr leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	i = ocfs2_search_extent_list(el, v_cluster);
+	if (i == -1) {
+		ret = -EROFS;
+		mlog_errno(ret);
+		goto out;
+	} else {
+		rec = &el->l_recs[i];
+		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+		if (!rec->e_blkno) {
+			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+				    "record (%u, %u, 0) in xattr", inode->i_ino,
+				    le32_to_cpu(rec->e_cpos),
+				    ocfs2_rec_clusters(el, rec));
+			ret = -EROFS;
+			goto out;
+		}
+		coff = v_cluster - le32_to_cpu(rec->e_cpos);
+		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
+						    le64_to_cpu(rec->e_blkno));
+		*p_cluster = *p_cluster + coff;
+		if (num_clusters)
+			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+	}
+out:
+	if (eb_bh)
+		brelse(eb_bh);
+	return ret;
+}
+
 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 		       u32 *p_cluster, u32 *num_clusters,
 		       unsigned int *extent_flags)
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1b97490e1ea..1c4aa8b06f3 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,4 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		 u64 map_start, u64 map_len);
 
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+			     u32 *p_cluster, u32 *num_clusters,
+			     struct ocfs2_extent_list *el);
+
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7bb4fde7005..89d8541f85b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -515,7 +515,7 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
 					   clusters_to_add, mark_unwritten,
 					   fe_bh, el, handle,
 					   data_ac, meta_ac, reason_ret,
-					   OCFS2_DINODE_EXTENT);
+					   OCFS2_DINODE_EXTENT, NULL);
 }
 
 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
@@ -565,7 +565,7 @@ restart_all:
 	     clusters_to_add);
 	status = ocfs2_lock_allocators(inode, bh, &fe->id2.i_list,
 				       clusters_to_add, 0, &data_ac,
-				       &meta_ac);
+				       &meta_ac, OCFS2_DINODE_EXTENT, NULL);
 	if (status) {
 		mlog_errno(status);
 		goto leave;
@@ -1237,7 +1237,8 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
 	ret = ocfs2_lock_allocators(inode, di_bh, &di->id2.i_list,
-				    0, 1, NULL, &meta_ac);
+				    0, 1, NULL, &meta_ac,
+				    OCFS2_DINODE_EXTENT, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
@@ -1268,7 +1269,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	}
 
 	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
-				  dealloc, OCFS2_DINODE_EXTENT);
+				  dealloc, OCFS2_DINODE_EXTENT, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index b642c825fb7..bb774d70d26 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1909,7 +1909,8 @@ int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *root_bh,
 			  struct ocfs2_extent_list *root_el,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac)
+			  struct ocfs2_alloc_context **meta_ac,
+			  enum ocfs2_extent_tree_type type, void *private)
 {
 	int ret = 0, num_free_extents;
 	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
@@ -1922,7 +1923,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *root_bh,
 	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
 
 	num_free_extents = ocfs2_num_free_extents(osb, inode, root_bh,
-						  OCFS2_DINODE_EXTENT);
+						  type, private);
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
 		mlog_errno(ret);
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a3e531e62df..9e026c8afee 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -166,5 +166,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *root_bh,
 			  struct ocfs2_extent_list *root_el,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac);
+			  struct ocfs2_alloc_context **meta_ac,
+			  enum ocfs2_extent_tree_type type, void *private);
 #endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
new file mode 100644
index 00000000000..9604a4cd02b
--- /dev/null
+++ b/fs/ocfs2/xattr.c
@@ -0,0 +1,305 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.c
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#define MLOG_MASK_PREFIX ML_XATTR
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "ocfs2_fs.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+
+static int ocfs2_xattr_extend_allocation(struct inode *inode,
+					 u32 clusters_to_add,
+					 struct buffer_head *xattr_bh,
+					 struct ocfs2_xattr_value_root *xv)
+{
+	int status = 0;
+	int restart_func = 0;
+	int credits = 0;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	enum ocfs2_alloc_restarted why;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_extent_list *root_el = &xv->xr_list;
+	u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
+
+	mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
+
+restart_all:
+
+	status = ocfs2_lock_allocators(inode, xattr_bh, root_el,
+				       clusters_to_add, 0, &data_ac,
+				       &meta_ac, OCFS2_XATTR_VALUE_EXTENT, xv);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	credits = ocfs2_calc_extend_credits(osb->sb, root_el, clusters_to_add);
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+restarted_transaction:
+	status = ocfs2_journal_access(handle, inode, xattr_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	prev_clusters = le32_to_cpu(xv->xr_clusters);
+	status = ocfs2_add_clusters_in_btree(osb,
+					     inode,
+					     &logical_start,
+					     clusters_to_add,
+					     0,
+					     xattr_bh,
+					     root_el,
+					     handle,
+					     data_ac,
+					     meta_ac,
+					     &why,
+					     OCFS2_XATTR_VALUE_EXTENT,
+					     xv);
+	if ((status < 0) && (status != -EAGAIN)) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_dirty(handle, xattr_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
+
+	if (why != RESTART_NONE && clusters_to_add) {
+		if (why == RESTART_META) {
+			mlog(0, "restarting function.\n");
+			restart_func = 1;
+		} else {
+			BUG_ON(why != RESTART_TRANS);
+
+			mlog(0, "restarting transaction.\n");
+			/* TODO: This can be more intelligent. */
+			credits = ocfs2_calc_extend_credits(osb->sb,
+							    root_el,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				/* handle still has to be committed at
+				 * this point. */
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto leave;
+			}
+			goto restarted_transaction;
+		}
+	}
+
+leave:
+	if (handle) {
+		ocfs2_commit_trans(osb, handle);
+		handle = NULL;
+	}
+	if (data_ac) {
+		ocfs2_free_alloc_context(data_ac);
+		data_ac = NULL;
+	}
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+	if ((!status) && restart_func) {
+		restart_func = 0;
+		goto restart_all;
+	}
+
+	return status;
+}
+
+static int __ocfs2_remove_xattr_range(struct inode *inode,
+				      struct buffer_head *root_bh,
+				      struct ocfs2_xattr_value_root *xv,
+				      u32 cpos, u32 phys_cpos, u32 len,
+				      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_lock_allocators(inode, root_bh, &xv->xr_list,
+				    0, 1, NULL, &meta_ac,
+				    OCFS2_XATTR_VALUE_EXTENT, xv);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_remove_extent(inode, root_bh, cpos, len, handle, meta_ac,
+				  dealloc, OCFS2_XATTR_VALUE_EXTENT, xv);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	le32_add_cpu(&xv->xr_clusters, -len);
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+static int ocfs2_xattr_shrink_size(struct inode *inode,
+				   u32 old_clusters,
+				   u32 new_clusters,
+				   struct buffer_head *root_bh,
+				   struct ocfs2_xattr_value_root *xv)
+{
+	int ret = 0;
+	u32 trunc_len, cpos, phys_cpos, alloc_size;
+	u64 block;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (old_clusters <= new_clusters)
+		return 0;
+
+	cpos = new_clusters;
+	trunc_len = old_clusters - new_clusters;
+	while (trunc_len) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
+					       &alloc_size, &xv->xr_list);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (alloc_size > trunc_len)
+			alloc_size = trunc_len;
+
+		ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
+						 phys_cpos, alloc_size,
+						 &dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+		ocfs2_remove_xattr_clusters_from_cache(inode, block,
+						       alloc_size);
+		cpos += alloc_size;
+		trunc_len -= alloc_size;
+	}
+
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	return ret;
+}
+
+static int ocfs2_xattr_value_truncate(struct inode *inode,
+				      struct buffer_head *root_bh,
+				      struct ocfs2_xattr_value_root *xv,
+				      int len)
+{
+	int ret;
+	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
+	u32 old_clusters = le32_to_cpu(xv->xr_clusters);
+
+	if (new_clusters == old_clusters)
+		return 0;
+
+	if (new_clusters > old_clusters)
+		ret = ocfs2_xattr_extend_allocation(inode,
+						    new_clusters - old_clusters,
+						    root_bh, xv);
+	else
+		ret = ocfs2_xattr_shrink_size(inode,
+					      old_clusters, new_clusters,
+					      root_bh, xv);
+
+	return ret;
+}
-- 
cgit v1.2.3


From fdd77704a8b4666a32120fcd1e4a9fedaf3263d8 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Mon, 18 Aug 2008 17:08:55 +0800
Subject: ocfs2: reserve inline space for extended attribute

Add the structures and helper functions we want for handling inline extended
attributes. We also update the inline-data handlers so that they properly
function in the event that we have both inline data and inline attributes
sharing an inode block.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c    | 22 ++++++++++++++++------
 fs/ocfs2/ocfs2.h    |  1 +
 fs/ocfs2/ocfs2_fs.h | 46 +++++++++++++++++++++++++++++++++++++++++++---
 fs/ocfs2/super.c    |  2 ++
 4 files changed, 62 insertions(+), 9 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e45421fee20..ace27d1ca57 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6577,20 +6577,29 @@ out:
 	return ret;
 }
 
-static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
+static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
+					     struct ocfs2_dinode *di)
 {
 	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
+	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
 
-	memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		memset(&di->id2, 0, blocksize -
+				    offsetof(struct ocfs2_dinode, id2) -
+				    xattrsize);
+	else
+		memset(&di->id2, 0, blocksize -
+				    offsetof(struct ocfs2_dinode, id2));
 }
 
 void ocfs2_dinode_new_extent_list(struct inode *inode,
 				  struct ocfs2_dinode *di)
 {
-	ocfs2_zero_dinode_id2(inode, di);
+	ocfs2_zero_dinode_id2_with_xattr(inode, di);
 	di->id2.i_list.l_tree_depth = 0;
 	di->id2.i_list.l_next_free_rec = 0;
-	di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
+	di->id2.i_list.l_count = cpu_to_le16(
+		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
 }
 
 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
@@ -6607,9 +6616,10 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
 	 * We clear the entire i_data structure here so that all
 	 * fields can be properly initialized.
 	 */
-	ocfs2_zero_dinode_id2(inode, di);
+	ocfs2_zero_dinode_id2_with_xattr(inode, di);
 
-	idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
+	idata->id_count = cpu_to_le16(
+			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
 }
 
 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 128279986d6..ce75ca312a2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -245,6 +245,7 @@ struct ocfs2_super
 	int s_sectsize_bits;
 	int s_clustersize;
 	int s_clustersize_bits;
+	unsigned int s_xattr_inline_size;
 
 	atomic_t vol_state;
 	struct mutex recovery_lock;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 1b46505e1e3..1055ba0af9b 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -300,6 +300,12 @@ struct ocfs2_new_group_input {
  */
 #define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE	8
 
+/*
+ * Inline extended attribute size (in bytes)
+ * The value chosen should be aligned to 16 byte boundaries.
+ */
+#define OCFS2_MIN_XATTR_INLINE_SIZE     256
+
 struct ocfs2_system_inode_info {
 	char	*si_name;
 	int	si_iflags;
@@ -622,7 +628,8 @@ struct ocfs2_dinode {
 					   belongs to */
 	__le16 i_suballoc_bit;		/* Bit offset in suballocator
 					   block group */
-/*10*/	__le32 i_reserved0;
+/*10*/	__le16 i_reserved0;
+	__le16 i_xattr_inline_size;
 	__le32 i_clusters;		/* Cluster count */
 	__le32 i_uid;			/* Owner UID */
 	__le32 i_gid;			/* Owning GID */
@@ -641,11 +648,12 @@ struct ocfs2_dinode {
 	__le32 i_atime_nsec;
 	__le32 i_ctime_nsec;
 	__le32 i_mtime_nsec;
-	__le32 i_attr;
+/*70*/	__le32 i_attr;
 	__le16 i_orphaned_slot;		/* Only valid when OCFS2_ORPHANED_FL
 					   was set in i_flags */
 	__le16 i_dyn_features;
-/*70*/	__le64 i_reserved2[8];
+	__le64 i_xattr_loc;
+/*80*/	__le64 i_reserved2[7];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
 					   64bit union */
@@ -846,6 +854,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb)
 		offsetof(struct ocfs2_dinode, id2.i_data.id_data);
 }
 
+static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
+						   struct ocfs2_dinode *di)
+{
+	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		return sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+			xattrsize;
+	else
+		return sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+}
+
 static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
 {
 	int size;
@@ -856,6 +878,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
 	return size / sizeof(struct ocfs2_extent_rec);
 }
 
+static inline int ocfs2_extent_recs_per_inode_with_xattr(
+						struct super_block *sb,
+						struct ocfs2_dinode *di)
+{
+	int size;
+	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		size = sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
+			xattrsize;
+	else
+		size = sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
 static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
 {
 	int size;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a2d3dcf7025..9bdb3aeefe8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1424,6 +1424,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
 	osb->slot_num = OCFS2_INVALID_SLOT;
 
+	osb->s_xattr_inline_size = OCFS2_MIN_XATTR_INLINE_SIZE;
+
 	osb->local_alloc_state = OCFS2_LA_UNUSED;
 	osb->local_alloc_bh = NULL;
 	INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
-- 
cgit v1.2.3


From cf1d6c763fbcb115263114302485ad17e7933d87 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Mon, 18 Aug 2008 17:11:00 +0800
Subject: ocfs2: Add extended attribute support

This patch implements storing extended attributes either in the inode or in a
single external block. We only store EAs in the inode when blocksize > 512 or
when the inode block has free space for them. When an EA's value is larger than
80 bytes, we store the value via a b-tree outside the inode or block.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/Makefile        |    2 +
 fs/ocfs2/file.c          |    5 +
 fs/ocfs2/inode.c         |    8 +
 fs/ocfs2/inode.h         |    3 +
 fs/ocfs2/journal.h       |   10 +
 fs/ocfs2/namei.c         |    5 +
 fs/ocfs2/ocfs2.h         |    2 +
 fs/ocfs2/ocfs2_fs.h      |    8 +-
 fs/ocfs2/suballoc.c      |   17 +-
 fs/ocfs2/suballoc.h      |    3 +
 fs/ocfs2/super.c         |   14 +
 fs/ocfs2/symlink.c       |    9 +
 fs/ocfs2/xattr.c         | 1620 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/xattr.h         |   51 ++
 fs/ocfs2/xattr_trusted.c |   82 +++
 fs/ocfs2/xattr_user.c    |   94 +++
 16 files changed, 1927 insertions(+), 6 deletions(-)
 create mode 100644 fs/ocfs2/xattr.h
 create mode 100644 fs/ocfs2/xattr_trusted.c
 create mode 100644 fs/ocfs2/xattr_user.c

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index af63980319c..21323da4085 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -36,6 +36,8 @@ ocfs2-objs := \
 	uptodate.o		\
 	ver.o			\
 	xattr.o			\
+	xattr_user.o		\
+	xattr_trusted.o
 
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89d8541f85b..f4273c2c209 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -55,6 +55,7 @@
 #include "mmap.h"
 #include "suballoc.h"
 #include "super.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -2070,6 +2071,10 @@ const struct inode_operations ocfs2_file_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 	.fallocate	= ocfs2_fallocate,
 	.fiemap		= ocfs2_fiemap,
 };
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 99f012a0f20..4738dd25bb9 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,6 +49,7 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -741,6 +742,13 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		goto bail_unlock_dir;
 	}
 
+	/*Free extended attribute resources associated with this inode.*/
+	status = ocfs2_xattr_remove(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
 	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
 				    orphan_dir_bh);
 	if (status < 0)
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 390a85596aa..499bc62e758 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -40,6 +40,9 @@ struct ocfs2_inode_info
 	/* protects allocation changes on this inode. */
 	struct rw_semaphore		ip_alloc_sem;
 
+	/* protects extended attribute changes on this inode */
+	struct rw_semaphore		ip_xattr_sem;
+
 	/* These fields are protected by ip_lock */
 	spinlock_t			ip_lock;
 	u32				ip_open_count;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 9485f8037d9..08d1add1487 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -283,6 +283,9 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* simple file updates like chmod, etc. */
 #define OCFS2_INODE_UPDATE_CREDITS 1
 
+/* extended attribute block update */
+#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+
 /* group extend. inode update and last group update. */
 #define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
 
@@ -340,6 +343,13 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 #define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
 			     + OCFS2_UNLINK_CREDITS)
 
+/* global bitmap dinode, group desc., relinked group,
+ * suballocator dinode, group desc., relinked group,
+ * dinode, xattr block */
+#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 \
+					  + OCFS2_INODE_UPDATE_CREDITS \
+					  + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
+
 /*
  * Please note that the caller must make sure that root_el is the root
  * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2cd6f501755..76d1d131430 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,6 +60,7 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -1918,4 +1919,8 @@ const struct inode_operations ocfs2_dir_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 };
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ce75ca312a2..cae0dd4b7f7 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -188,6 +188,7 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
 	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
 	OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
+	OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
 };
 
 #define OCFS2_OSB_SOFT_RO	0x0001
@@ -218,6 +219,7 @@ struct ocfs2_super
 	u32 bitmap_cpg;
 	u8 *uuid;
 	char *uuid_str;
+	u32 uuid_hash;
 	u8 *vol_label;
 	u64 first_cluster_group_blkno;
 	u32 fs_generation;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 1055ba0af9b..98e1f8bba0e 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -570,7 +570,7 @@ struct ocfs2_super_block {
 /*40*/	__le16 s_max_slots;		/* Max number of simultaneous mounts
 					   before tunefs required */
 	__le16 s_tunefs_flag;
-	__le32 s_reserved1;
+	__le32 s_uuid_hash;		/* hash value of uuid */
 	__le64 s_first_cluster_group;	/* Block offset of 1st cluster
 					 * group header */
 /*50*/	__u8  s_label[OCFS2_MAX_VOL_LABEL_LEN];	/* Label for mounting, etc. */
@@ -787,7 +787,11 @@ struct ocfs2_xattr_tree_root {
 /*10*/	struct ocfs2_extent_list xt_list; /* Extent record list */
 };
 
-#define OCFS2_XATTR_INDEXED 0x1
+#define OCFS2_XATTR_INDEXED	0x1
+#define OCFS2_HASH_SHIFT	5
+#define OCFS2_XATTR_ROUND	3
+#define OCFS2_XATTR_SIZE(size)	(((size) + OCFS2_XATTR_ROUND) & \
+				~(OCFS2_XATTR_ROUND))
 
 /*
  * On disk structure for xattr block.
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index bb774d70d26..f1871ca8381 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -493,9 +493,9 @@ bail:
 	return status;
 }
 
-int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
-			       struct ocfs2_extent_list *root_el,
-			       struct ocfs2_alloc_context **ac)
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+				      int blocks,
+				      struct ocfs2_alloc_context **ac)
 {
 	int status;
 	u32 slot;
@@ -507,7 +507,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(root_el);
+	(*ac)->ac_bits_wanted = blocks;
 	(*ac)->ac_which = OCFS2_AC_USE_META;
 	slot = osb->slot_num;
 	(*ac)->ac_group_search = ocfs2_block_group_search;
@@ -532,6 +532,15 @@ bail:
 	return status;
 }
 
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_extent_list *root_el,
+			       struct ocfs2_alloc_context **ac)
+{
+	return ocfs2_reserve_new_metadata_blocks(osb,
+					ocfs2_extend_meta_needed(root_el),
+					ac);
+}
+
 static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
 					      struct ocfs2_alloc_context *ac)
 {
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 9e026c8afee..028fd633b44 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -67,6 +67,9 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
 			       struct ocfs2_extent_list *root_el,
 			       struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+				      int blocks,
+				      struct ocfs2_alloc_context **ac);
 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 			    struct ocfs2_alloc_context **ac);
 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 9bdb3aeefe8..3b04f5d2e89 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -64,6 +64,7 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "ver.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -154,6 +155,8 @@ enum {
 	Opt_localalloc,
 	Opt_localflocks,
 	Opt_stack,
+	Opt_user_xattr,
+	Opt_nouser_xattr,
 	Opt_err,
 };
 
@@ -173,6 +176,8 @@ static const match_table_t tokens = {
 	{Opt_localalloc, "localalloc=%d"},
 	{Opt_localflocks, "localflocks"},
 	{Opt_stack, "cluster_stack=%s"},
+	{Opt_user_xattr, "user_xattr"},
+	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_err, NULL}
 };
 
@@ -848,6 +853,12 @@ static int ocfs2_parse_options(struct super_block *sb,
 		case Opt_data_writeback:
 			mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
 			break;
+		case Opt_user_xattr:
+			mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
+			break;
+		case Opt_nouser_xattr:
+			mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
+			break;
 		case Opt_atime_quantum:
 			if (match_int(&args[0], &option)) {
 				status = 0;
@@ -1135,6 +1146,7 @@ static void ocfs2_inode_init_once(void *data)
 	oi->ip_dir_start_lookup = 0;
 
 	init_rwsem(&oi->ip_alloc_sem);
+	init_rwsem(&oi->ip_xattr_sem);
 	mutex_init(&oi->ip_io_mutex);
 
 	oi->ip_blkno = 0ULL;
@@ -1378,6 +1390,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	sb->s_fs_info = osb;
 	sb->s_op = &ocfs2_sops;
 	sb->s_export_op = &ocfs2_export_ops;
+	sb->s_xattr = ocfs2_xattr_handlers;
 	sb->s_time_gran = 1;
 	sb->s_flags |= MS_NOATIME;
 	/* this is needed to support O_LARGEFILE */
@@ -1574,6 +1587,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	osb->first_cluster_group_blkno =
 		le64_to_cpu(di->id2.i_super.s_first_cluster_group);
 	osb->fs_generation = le32_to_cpu(di->i_fs_generation);
+	osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
 	mlog(0, "vol_label: %s\n", osb->vol_label);
 	mlog(0, "uuid: %s\n", osb->uuid_str);
 	mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index ba9dbb51d25..8c5879c7f84 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -50,6 +50,7 @@
 #include "inode.h"
 #include "journal.h"
 #include "symlink.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -168,10 +169,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
 	.follow_link	= ocfs2_follow_link,
 	.getattr	= ocfs2_getattr,
 	.setattr	= ocfs2_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 };
 const struct inode_operations ocfs2_fast_symlink_inode_operations = {
 	.readlink	= ocfs2_readlink,
 	.follow_link	= ocfs2_follow_link,
 	.getattr	= ocfs2_getattr,
 	.setattr	= ocfs2_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 };
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 9604a4cd02b..67bebd9259e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5,6 +5,9 @@
  *
  * Copyright (C) 2008 Oracle.  All rights reserved.
  *
+ * CREDITS:
+ * Lots of code in this file is taken from ext3.
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License as published by the Free Software Foundation; either
@@ -21,6 +24,19 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/splice.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+
 #define MLOG_MASK_PREFIX ML_XATTR
 #include <cluster/masklog.h>
 
@@ -28,12 +44,119 @@
 #include "alloc.h"
 #include "dlmglue.h"
 #include "file.h"
+#include "symlink.h"
+#include "sysfile.h"
 #include "inode.h"
 #include "journal.h"
 #include "ocfs2_fs.h"
 #include "suballoc.h"
 #include "uptodate.h"
 #include "buffer_head_io.h"
+#include "xattr.h"
+
+
+struct ocfs2_xattr_def_value_root {
+	struct ocfs2_xattr_value_root	xv;
+	struct ocfs2_extent_rec		er;
+};
+
+#define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
+#define OCFS2_XATTR_INLINE_SIZE	80
+
+static struct ocfs2_xattr_def_value_root def_xv = {
+	.xv.xr_list.l_count = cpu_to_le16(1),
+};
+
+struct xattr_handler *ocfs2_xattr_handlers[] = {
+	&ocfs2_xattr_user_handler,
+	&ocfs2_xattr_trusted_handler,
+	NULL
+};
+
+static struct xattr_handler *ocfs2_xattr_handler_map[] = {
+	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
+	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,
+};
+
+struct ocfs2_xattr_info {
+	int name_index;
+	const char *name;
+	const void *value;
+	size_t value_len;
+};
+
+struct ocfs2_xattr_search {
+	struct buffer_head *inode_bh;
+	/*
+	 * xattr_bh points to the buffer head of the block that holds the
+	 * extended attribute; for an in-inode attribute it equals inode_bh.
+	 */
+	struct buffer_head *xattr_bh;
+	struct ocfs2_xattr_header *header;
+	void *base;
+	void *end;
+	struct ocfs2_xattr_entry *here;
+	int not_found;
+};
+
+static inline struct xattr_handler *ocfs2_xattr_handler(int name_index)
+{
+	struct xattr_handler *handler = NULL;
+
+	if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
+		handler = ocfs2_xattr_handler_map[name_index];
+
+	return handler;
+}
+
+static inline u32 ocfs2_xattr_name_hash(struct inode *inode,
+					char *prefix,
+					int prefix_len,
+					char *name,
+					int name_len)
+{
+	/* Get hash value of uuid from super block */
+	u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash;
+	int i;
+
+	/* hash extended attribute prefix */
+	for (i = 0; i < prefix_len; i++) {
+		hash = (hash << OCFS2_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^
+		       *prefix++;
+	}
+	/* hash extended attribute name */
+	for (i = 0; i < name_len; i++) {
+		hash = (hash << OCFS2_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^
+		       *name++;
+	}
+
+	return hash;
+}
+
+/*
+ * ocfs2_xattr_hash_entry()
+ *
+ * Compute the hash of an extended attribute.
+ */
+static void ocfs2_xattr_hash_entry(struct inode *inode,
+				   struct ocfs2_xattr_header *header,
+				   struct ocfs2_xattr_entry *entry)
+{
+	u32 hash = 0;
+	struct xattr_handler *handler =
+			ocfs2_xattr_handler(ocfs2_xattr_get_type(entry));
+	char *prefix = handler->prefix;
+	char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
+	int prefix_len = strlen(handler->prefix);
+
+	hash = ocfs2_xattr_name_hash(inode, prefix, prefix_len, name,
+				     entry->xe_name_len);
+	entry->xe_name_hash = cpu_to_le32(hash);
+
+	return;
+}
 
 static int ocfs2_xattr_extend_allocation(struct inode *inode,
 					 u32 clusters_to_add,
@@ -303,3 +426,1500 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
 
 	return ret;
 }
+
+static int ocfs2_xattr_list_entries(struct inode *inode,
+				    struct ocfs2_xattr_header *header,
+				    char *buffer, size_t buffer_size)
+{
+	size_t rest = buffer_size;
+	int i;
+
+	for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+		struct xattr_handler *handler =
+			ocfs2_xattr_handler(ocfs2_xattr_get_type(entry));
+
+		if (handler) {
+			size_t size = handler->list(inode, buffer, rest,
+					((char *)header +
+					le16_to_cpu(entry->xe_name_offset)),
+					entry->xe_name_len);
+			if (buffer) {
+				if (size > rest)
+					return -ERANGE;
+				buffer += size;
+			}
+			rest -= size;
+		}
+	}
+
+	return buffer_size - rest;
+}
+
+static int ocfs2_xattr_ibody_list(struct inode *inode,
+				  struct ocfs2_dinode *di,
+				  char *buffer,
+				  size_t buffer_size)
+{
+	struct ocfs2_xattr_header *header = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	int ret = 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+		return ret;
+
+	header = (struct ocfs2_xattr_header *)
+		 ((void *)di + inode->i_sb->s_blocksize -
+		 le16_to_cpu(di->i_xattr_inline_size));
+
+	ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
+
+	return ret;
+}
+
+static int ocfs2_xattr_block_list(struct inode *inode,
+				  struct ocfs2_dinode *di,
+				  char *buffer,
+				  size_t buffer_size)
+{
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_header *header = NULL;
+	int ret = 0;
+
+	if (!di->i_xattr_loc)
+		return ret;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+			       le64_to_cpu(di->i_xattr_loc),
+			       &blk_bh, OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	/*Verify the signature of xattr block*/
+	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+		ret = -EFAULT;
+		goto cleanup;
+	}
+
+	header = &((struct ocfs2_xattr_block *)blk_bh->b_data)->
+		 xb_attrs.xb_header;
+
+	ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
+cleanup:
+	brelse(blk_bh);
+
+	return ret;
+}
+
+ssize_t ocfs2_listxattr(struct dentry *dentry,
+			char *buffer,
+			size_t size)
+{
+	int ret = 0, i_ret = 0, b_ret = 0;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode);
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+		return ret;
+
+	ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_read(&oi->ip_xattr_sem);
+	i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size);
+	if (i_ret < 0)
+		b_ret = 0;
+	else {
+		if (buffer) {
+			buffer += i_ret;
+			size -= i_ret;
+		}
+		b_ret = ocfs2_xattr_block_list(dentry->d_inode, di,
+					       buffer, size);
+		if (b_ret < 0)
+			i_ret = 0;
+	}
+	up_read(&oi->ip_xattr_sem);
+	ocfs2_inode_unlock(dentry->d_inode, 0);
+
+	brelse(di_bh);
+
+	return i_ret + b_ret;
+}
+
+static int ocfs2_xattr_find_entry(int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_entry *entry;
+	size_t name_len;
+	int i, cmp = 1;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	entry = xs->here;
+	for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+		cmp = name_index - ocfs2_xattr_get_type(entry);
+		if (!cmp)
+			cmp = name_len - entry->xe_name_len;
+		if (!cmp)
+			cmp = memcmp(name, (xs->base +
+				     le16_to_cpu(entry->xe_name_offset)),
+				     name_len);
+		if (cmp == 0)
+			break;
+		entry += 1;
+	}
+	xs->here = entry;
+
+	return cmp ? -ENODATA : 0;
+}
+
+static int ocfs2_xattr_get_value_outside(struct inode *inode,
+					 struct ocfs2_xattr_search *xs,
+					 void *buffer,
+					 size_t len)
+{
+	u32 cpos, p_cluster, num_clusters, bpc, clusters;
+	u64 blkno;
+	int i, ret = 0;
+	size_t cplen, blocksize;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_extent_list *el;
+
+	xv = (struct ocfs2_xattr_value_root *)
+		(xs->base + le16_to_cpu(xs->here->xe_name_offset) +
+		OCFS2_XATTR_SIZE(xs->here->xe_name_len));
+	el = &xv->xr_list;
+	clusters = le32_to_cpu(xv->xr_clusters);
+	bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	blocksize = inode->i_sb->s_blocksize;
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		/* Copy ocfs2_xattr_value */
+		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+			ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
+					       &bh, OCFS2_BH_CACHED, inode);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			cplen = len >= blocksize ? blocksize : len;
+			memcpy(buffer, bh->b_data, cplen);
+			len -= cplen;
+			buffer += cplen;
+
+			brelse(bh);
+			bh = NULL;
+			if (len == 0)
+				break;
+		}
+		cpos += num_clusters;
+	}
+out:
+	return ret;
+}
+
+static int ocfs2_xattr_ibody_get(struct inode *inode,
+				 int name_index,
+				 const char *name,
+				 void *buffer,
+				 size_t buffer_size,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	size_t size;
+	int ret = 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+		return -ENODATA;
+
+	xs->end = (void *)di + inode->i_sb->s_blocksize;
+	xs->header = (struct ocfs2_xattr_header *)
+			(xs->end - le16_to_cpu(di->i_xattr_inline_size));
+	xs->base = (void *)xs->header;
+	xs->here = xs->header->xh_entries;
+
+	ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	if (ret)
+		return ret;
+	size = le64_to_cpu(xs->here->xe_value_size);
+	if (buffer) {
+		if (size > buffer_size)
+			return -ERANGE;
+		if (ocfs2_xattr_is_local(xs->here)) {
+			memcpy(buffer, (void *)xs->base +
+			       le16_to_cpu(xs->here->xe_name_offset) +
+			       OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
+		} else {
+			ret = ocfs2_xattr_get_value_outside(inode, xs,
+							    buffer, size);
+			if (ret < 0) {
+				mlog_errno(ret);
+				return ret;
+			}
+		}
+	}
+
+	return size;
+}
+
+static int ocfs2_xattr_block_get(struct inode *inode,
+				 int name_index,
+				 const char *name,
+				 void *buffer,
+				 size_t buffer_size,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	size_t size;
+	int ret = -ENODATA;
+
+	if (!di->i_xattr_loc)
+		return ret;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+			       le64_to_cpu(di->i_xattr_loc),
+			       &blk_bh, OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	/*Verify the signature of xattr block*/
+	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+		ret = -EFAULT;
+		goto cleanup;
+	}
+
+	xs->xattr_bh = blk_bh;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	xs->header = &xb->xb_attrs.xb_header;
+	xs->base = (void *)xs->header;
+	xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+	xs->here = xs->header->xh_entries;
+
+	ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	if (ret)
+		goto cleanup;
+	size = le64_to_cpu(xs->here->xe_value_size);
+	if (buffer) {
+		ret = -ERANGE;
+		if (size > buffer_size)
+			goto cleanup;
+		if (ocfs2_xattr_is_local(xs->here)) {
+			memcpy(buffer, (void *)xs->base +
+			       le16_to_cpu(xs->here->xe_name_offset) +
+			       OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
+		} else {
+			ret = ocfs2_xattr_get_value_outside(inode, xs,
+							    buffer, size);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto cleanup;
+			}
+		}
+	}
+	ret = size;
+cleanup:
+	brelse(blk_bh);
+
+	return ret;
+}
+
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided (a NULL buffer only
+ * computes the size required). Returns -ENODATA if the inode has no xattrs.
+ */
+int ocfs2_xattr_get(struct inode *inode,
+		    int name_index,
+		    const char *name,
+		    void *buffer,
+		    size_t buffer_size)
+{
+	int ret;
+	struct ocfs2_dinode *di = NULL;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+		return -ENODATA;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	xis.inode_bh = xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_read(&oi->ip_xattr_sem);
+	ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
+				    buffer_size, &xis);
+	if (ret == -ENODATA)
+		ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
+					    buffer_size, &xbs);
+	up_read(&oi->ip_xattr_sem);
+	ocfs2_inode_unlock(inode, 0);
+
+	brelse(di_bh);
+
+	return ret;
+}
+
+/*
+ * __ocfs2_xattr_set_value_outside()
+ *
+ * Copy value_len bytes from value into the clusters described by
+ * xv->xr_list, one journalled block at a time.  The unused tail of the
+ * last block written is zero filled.  Returns 0 or a negative errno.
+ */
+static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+					   struct ocfs2_xattr_value_root *xv,
+					   const void *value,
+					   int value_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *blk_bh = NULL;
+	handle_t *handle;
+	u16 blocksize = inode->i_sb->s_blocksize;
+	u32 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
+	u32 cpos, p_cluster, num_clusters;
+	u64 blkno;
+	int ret = 0, i, chunk;
+
+	/* The caller must already have sized the value tree. */
+	BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
+
+	/* Credits requested: one per block in the clusters being written. */
+	handle = ocfs2_start_trans(osb, clusters * bpc);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		return ret;
+	}
+
+	for (cpos = 0; cpos < clusters; cpos += num_clusters) {
+		/* Map the next logical extent to its physical clusters. */
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, &xv->xr_list);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+
+		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+			ret = ocfs2_read_block(osb, blkno, &blk_bh,
+					       OCFS2_BH_CACHED, inode);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+
+			ret = ocfs2_journal_access(handle, inode, blk_bh,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+
+			/* Copy at most one block's worth of the value. */
+			chunk = value_len;
+			if (chunk > blocksize)
+				chunk = blocksize;
+
+			memcpy(blk_bh->b_data, value, chunk);
+			value_len -= chunk;
+			value += chunk;
+
+			/* Zero whatever the value did not fill. */
+			if (chunk < blocksize)
+				memset(blk_bh->b_data + chunk, 0,
+				       blocksize - chunk);
+
+			ret = ocfs2_journal_dirty(handle, blk_bh);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+			brelse(blk_bh);
+			blk_bh = NULL;
+
+			/*
+			 * XXX: do we need to empty all the following
+			 * blocks in this cluster?
+			 */
+			if (!value_len)
+				break;
+		}
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+	/* Drops the buffer still held when an error path jumped here. */
+	brelse(blk_bh);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_cleanup()
+ *
+ * Undo a half-finished "value outside" insert: drop the entry count by
+ * one and wipe both the entry at xs->here and the name + tree root
+ * stored at xs->base + offs, all inside one transaction.
+ */
+static int ocfs2_xattr_cleanup(struct inode *inode,
+			       struct ocfs2_xattr_info *xi,
+			       struct ocfs2_xattr_search *xs,
+			       size_t offs)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	size_t name_len = strlen(xi->name);
+	size_t root_len = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+	handle_t *handle;
+	int ret;
+
+	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/* One attribute fewer ... */
+	le16_add_cpu(&xs->header->xh_count, -1);
+	/* ... and neither its entry nor its name/tree root survives. */
+	memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
+	memset(xs->base + offs, 0, root_len);
+
+	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_update_entry()
+ *
+ * Rewrite the entry at xs->here so that it points at name offset offs,
+ * records xi->value_len, carries the matching local flag and a fresh
+ * hash.  The change is journalled against xs->xattr_bh.
+ */
+static int ocfs2_xattr_update_entry(struct inode *inode,
+				    struct ocfs2_xattr_info *xi,
+				    struct ocfs2_xattr_search *xs,
+				    size_t offs)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle;
+	int ret;
+
+	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	xs->here->xe_name_offset = cpu_to_le16(offs);
+	xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+	/* Local (1) when the value fits inline, otherwise outside (0). */
+	ocfs2_xattr_set_local(xs->here,
+			      xi->value_len <= OCFS2_XATTR_INLINE_SIZE);
+	ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+
+	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_set_value_outside()
+ *
+ * Store a large value in its own B tree.  The name and an empty tree
+ * root are laid down at xs->base + offs, the tree is sized to
+ * xi->value_len, the value is copied in and finally the entry at
+ * xs->here is updated.  Stops at the first step that fails.
+ */
+static int ocfs2_xattr_set_value_outside(struct inode *inode,
+					 struct ocfs2_xattr_info *xi,
+					 struct ocfs2_xattr_search *xs,
+					 size_t offs)
+{
+	size_t name_len = strlen(xi->name);
+	size_t name_size = OCFS2_XATTR_SIZE(name_len);
+	void *name_start = xs->base + offs;
+	struct ocfs2_xattr_value_root *xv;
+	int ret;
+
+	/* Fresh name followed by an empty, depth-0, one-record tree root. */
+	memset(name_start, 0, name_size + OCFS2_XATTR_ROOT_SIZE);
+	memcpy(name_start, xi->name, name_len);
+
+	xv = (struct ocfs2_xattr_value_root *)(name_start + name_size);
+	xv->xr_clusters = 0;
+	xv->xr_last_eb_blk = 0;
+	xv->xr_list.l_tree_depth = 0;
+	xv->xr_list.l_count = cpu_to_le16(1);
+	xv->xr_list.l_next_free_rec = 0;
+
+	ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
+					 xi->value_len);
+	if (ret >= 0)
+		ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
+						      xi->value_len);
+	if (ret >= 0)
+		ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_set_entry_local()
+ *
+ * Set, replace or remove an extended attribute whose name and value
+ * live in the region described by xs (entry table growing up from
+ * xs->header, name/value area ending at xs->end).
+ *
+ * @xi:       attribute to store; xi->value == NULL requests removal.
+ * @xs:       search result; xs->not_found tells whether xs->here refers
+ *            to an existing entry.
+ * @last:     first unused slot of the entry table.
+ * @min_offs: lowest name offset currently in use.
+ *
+ * Only modifies memory; it starts no transaction and dirties no buffer
+ * itself.
+ */
+static void ocfs2_xattr_set_entry_local(struct inode *inode,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xs,
+					struct ocfs2_xattr_entry *last,
+					size_t min_offs)
+{
+	size_t name_len = strlen(xi->name);
+	int i;
+
+	if (xi->value && xs->not_found) {
+		/* Insert the new xattr entry. */
+		/*
+		 * NOTE(review): the slot is prepared through 'last' but the
+		 * final step below fills it in through xs->here; presumably
+		 * the failed search left xs->here at this same slot - confirm.
+		 */
+		le16_add_cpu(&xs->header->xh_count, 1);
+		ocfs2_xattr_set_type(last, xi->name_index);
+		ocfs2_xattr_set_local(last, 1);
+		last->xe_name_len = name_len;
+	} else {
+		void *first_val;
+		void *val;
+		size_t offs, size;
+
+		/* Start of the name/value area and of this entry's data. */
+		first_val = xs->base + min_offs;
+		offs = le16_to_cpu(xs->here->xe_name_offset);
+		val = xs->base + offs;
+
+		/*
+		 * Bytes occupied by the old name + value: a tree root when
+		 * the old value was larger than the inline limit, otherwise
+		 * the padded value itself.
+		 */
+		if (le64_to_cpu(xs->here->xe_value_size) >
+		    OCFS2_XATTR_INLINE_SIZE)
+			size = OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_ROOT_SIZE;
+		else
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+
+		if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_SIZE(xi->value_len)) {
+			/* The old and the new value have the
+			   same size. Just replace the value. */
+			ocfs2_xattr_set_local(xs->here, 1);
+			xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+			/* Clear value bytes. */
+			memset(val + OCFS2_XATTR_SIZE(name_len),
+			       0,
+			       OCFS2_XATTR_SIZE(xi->value_len));
+			memcpy(val + OCFS2_XATTR_SIZE(name_len),
+			       xi->value,
+			       xi->value_len);
+			return;
+		}
+		/* Remove the old name+value. */
+		/*
+		 * Everything stored below the old data is shifted up by
+		 * 'size' bytes and the gap opened at the bottom is zeroed.
+		 */
+		memmove(first_val + size, first_val, val - first_val);
+		memset(first_val, 0, size);
+		xs->here->xe_name_hash = 0;
+		xs->here->xe_name_offset = 0;
+		ocfs2_xattr_set_local(xs->here, 1);
+		xs->here->xe_value_size = 0;
+
+		min_offs += size;
+
+		/* Adjust all value offsets. */
+		/* Only entries whose data sat below the removed one moved. */
+		last = xs->header->xh_entries;
+		for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
+			size_t o = le16_to_cpu(last->xe_name_offset);
+
+			if (o < offs)
+				last->xe_name_offset = cpu_to_le16(o + size);
+			last += 1;
+		}
+
+		if (!xi->value) {
+			/* Remove the old entry. */
+			/*
+			 * Close the hole in the entry table, clear the slot
+			 * that became free at its end and drop the count.
+			 */
+			last -= 1;
+			memmove(xs->here, xs->here + 1,
+				(void *)last - (void *)xs->here);
+			memset(last, 0, sizeof(struct ocfs2_xattr_entry));
+			le16_add_cpu(&xs->header->xh_count, -1);
+		}
+	}
+	if (xi->value) {
+		/* Insert the new name+value. */
+		/* It is placed directly below the lowest data in use. */
+		size_t size = OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_SIZE(xi->value_len);
+		void *val = xs->base + min_offs - size;
+
+		xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
+		memset(val, 0, size);
+		memcpy(val, xi->name, name_len);
+		memcpy(val + OCFS2_XATTR_SIZE(name_len),
+		       xi->value,
+		       xi->value_len);
+		xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+		ocfs2_xattr_set_local(xs->here, 1);
+		ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+	}
+
+	return;
+}
+
+/*
+ * ocfs2_xattr_set_entry()
+ *
+ * Set extended attribute entry into inode or block.
+ *
+ * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE,
+ * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(),
+ * then set value in B tree with set_value_outside().
+ *
+ * @xi:   attribute to store; xi->value == NULL requests removal.
+ * @xs:   search result describing the target region (inode body or
+ *        external block) and, when !xs->not_found, the existing entry.
+ * @flag: dynamic feature bits to set on the inode; the presence of
+ *        OCFS2_INLINE_XATTR_FL selects the inode body as the target,
+ *        its absence the external block in xs->xattr_bh.
+ *
+ * Returns 0 on success, -ENOSPC when the region cannot hold the
+ * attribute, -EFAULT when the computed free space is negative, or the
+ * error of a failing journal/truncate step.
+ */
+static int ocfs2_xattr_set_entry(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs,
+				 int flag)
+{
+	struct ocfs2_xattr_entry *last;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
+	size_t size_l = 0;
+	handle_t *handle = NULL;
+	int free, i, ret;
+	/*
+	 * Private copy handed to set_entry_local(); its value is swapped
+	 * for a default tree root below when the real value goes outside.
+	 */
+	struct ocfs2_xattr_info xi_l = {
+		.name_index = xi->name_index,
+		.name = xi->name,
+		.value = xi->value,
+		.value_len = xi->value_len,
+	};
+
+	/* Compute min_offs, last and free space. */
+	last = xs->header->xh_entries;
+
+	for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
+		size_t offs = le16_to_cpu(last->xe_name_offset);
+		if (offs < min_offs)
+			min_offs = offs;
+		last += 1;
+	}
+
+	/*
+	 * Gap between the end of the entry table and the lowest name/value
+	 * data, less sizeof(__u32).
+	 */
+	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+	if (free < 0)
+		return -EFAULT;
+
+	if (!xs->not_found) {
+		/*
+		 * The attribute already exists: the space held by its entry
+		 * and its name + value (or tree root) counts as available.
+		 */
+		size_t size = 0;
+		if (ocfs2_xattr_is_local(xs->here))
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+		else
+			size = OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_ROOT_SIZE;
+		free += (size + sizeof(struct ocfs2_xattr_entry));
+	}
+	/* Check free space in inode or block */
+	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		/* Value goes outside: only entry + name + tree root needed. */
+		if (free < sizeof(struct ocfs2_xattr_entry) +
+			   OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_ROOT_SIZE) {
+			ret = -ENOSPC;
+			goto out;
+		}
+		size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+		xi_l.value = (void *)&def_xv;
+		xi_l.value_len = OCFS2_XATTR_ROOT_SIZE;
+	} else if (xi->value) {
+		/* Value is stored locally: entry + name + padded value. */
+		if (free < sizeof(struct ocfs2_xattr_entry) +
+			   OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_SIZE(xi->value_len)) {
+			ret = -ENOSPC;
+			goto out;
+		}
+	}
+
+	if (!xs->not_found) {
+		/* For existing extended attribute */
+		size_t size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
+		void *val = xs->base + offs;
+
+		/*
+		 * size_l is non-zero only when the new value goes outside,
+		 * so this matches a local entry whose footprint is exactly
+		 * name + tree root and can be reused in place.
+		 */
+		if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
+			/* Replace existing local xattr with tree root */
+			ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
+							    offs);
+			if (ret < 0)
+				mlog_errno(ret);
+			goto out;
+		} else if (!ocfs2_xattr_is_local(xs->here)) {
+			/* For existing xattr which has value outside */
+			struct ocfs2_xattr_value_root *xv = NULL;
+			xv = (struct ocfs2_xattr_value_root *)(val +
+				OCFS2_XATTR_SIZE(name_len));
+
+			if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+				/*
+				 * If new value need set outside also,
+				 * first truncate old value to new value,
+				 * then set new value with set_value_outside().
+				 */
+				ret = ocfs2_xattr_value_truncate(inode,
+								 xs->xattr_bh,
+								 xv,
+								 xi->value_len);
+				if (ret < 0) {
+					mlog_errno(ret);
+					goto out;
+				}
+
+				ret = __ocfs2_xattr_set_value_outside(inode,
+								xv,
+								xi->value,
+								xi->value_len);
+				if (ret < 0) {
+					mlog_errno(ret);
+					goto out;
+				}
+
+				ret = ocfs2_xattr_update_entry(inode,
+							       xi,
+							       xs,
+							       offs);
+				if (ret < 0)
+					mlog_errno(ret);
+				goto out;
+			} else {
+				/*
+				 * If new value need set in local,
+				 * just truncate old value to zero.
+				 */
+				/*
+				 * NOTE(review): a failure here is only
+				 * logged; execution continues and ret is
+				 * overwritten below - confirm this is the
+				 * intended best-effort behaviour.
+				 */
+				 ret = ocfs2_xattr_value_truncate(inode,
+								 xs->xattr_bh,
+								 xv,
+								 0);
+				if (ret < 0)
+					mlog_errno(ret);
+			}
+		}
+	}
+
+	/* First transaction: local entry plus inode bookkeeping. */
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+		/* set extended attribute in external block */
+		ret = ocfs2_extend_trans(handle,
+					 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	/*
+	 * Set value in local, include set tree root in local.
+	 * This is the first step for value size >INLINE_SIZE.
+	 */
+	ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
+
+	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+		ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	/* First inline xattr on this inode: carve out the inline area. */
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
+	    (flag & OCFS2_INLINE_XATTR_FL)) {
+		struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+		unsigned int xattrsize = osb->s_xattr_inline_size;
+
+		/*
+		 * Adjust extent record count or inline data size
+		 * to reserve space for extended attribute.
+		 */
+		if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+			struct ocfs2_inline_data *idata = &di->id2.i_data;
+			le16_add_cpu(&idata->id_count, -xattrsize);
+		} else if (!(ocfs2_inode_is_fast_symlink(inode))) {
+			struct ocfs2_extent_list *el = &di->id2.i_list;
+			le16_add_cpu(&el->l_count, -(xattrsize /
+					sizeof(struct ocfs2_extent_rec)));
+		}
+		di->i_xattr_inline_size = cpu_to_le16(xattrsize);
+	}
+	/* Update xattr flag */
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= flag;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+	/* Update inode ctime */
+	inode->i_ctime = CURRENT_TIME;
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+
+	/* Runs after the commit above, in transactions of its own. */
+	if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		/*
+		 * Set value outside in B tree.
+		 * This is the second step for value size > INLINE_SIZE.
+		 */
+		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
+		ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
+		if (ret < 0) {
+			int ret2;
+
+			mlog_errno(ret);
+			/*
+			 * If set value outside failed, we have to clean
+			 * the junk tree root we have already set in local.
+			 */
+			ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
+			if (ret2 < 0)
+				mlog_errno(ret2);
+		}
+	}
+out:
+	return ret;
+
+}
+
+/*
+ * ocfs2_xattr_free_block()
+ *
+ * Give the suballocator bit backing xattr block xb back to the extent
+ * allocator of the slot recorded in the block, extending the caller's
+ * transaction by OCFS2_SUBALLOC_FREE credits to do so.
+ */
+static int ocfs2_xattr_free_block(handle_t *handle,
+				  struct ocfs2_super *osb,
+				  struct ocfs2_xattr_block *xb)
+{
+	struct inode *alloc_inode;
+	struct buffer_head *alloc_bh = NULL;
+	u64 blk = le64_to_cpu(xb->xb_blkno);
+	u16 bit = le16_to_cpu(xb->xb_suballoc_bit);
+	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+	int ret;
+
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+				EXTENT_ALLOC_SYSTEM_INODE,
+				le16_to_cpu(xb->xb_suballoc_slot));
+	if (!alloc_inode) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&alloc_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto unlock_mutex;
+	}
+
+	/* Free the bit only when the transaction could be extended. */
+	ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
+	if (ret >= 0)
+		ret = ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
+					       bit, bg_blkno, 1);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_inode_unlock(alloc_inode, 1);
+	brelse(alloc_bh);
+unlock_mutex:
+	mutex_unlock(&alloc_inode->i_mutex);
+	iput(alloc_inode);
+
+	return ret;
+}
+
+/*
+ * ocfs2_remove_value_outside()
+ *
+ * Walk every entry under header and truncate to zero the value tree of
+ * each one that is not stored locally.  Stops at the first failure.
+ */
+static int ocfs2_remove_value_outside(struct inode *inode,
+				      struct buffer_head *bh,
+				      struct ocfs2_xattr_header *header)
+{
+	int ret = 0, i;
+
+	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = header->xh_entries + i;
+		struct ocfs2_xattr_value_root *xv;
+
+		/* Local values own no clusters. */
+		if (ocfs2_xattr_is_local(entry))
+			continue;
+
+		/* The tree root follows the padded name. */
+		xv = (struct ocfs2_xattr_value_root *)
+			((void *)header +
+			 le16_to_cpu(entry->xe_name_offset) +
+			 OCFS2_XATTR_SIZE(entry->xe_name_len));
+
+		ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_ibody_remove()
+ *
+ * Release the outside values of all attributes kept in the inode body.
+ * The header sits i_xattr_inline_size bytes before the end of the
+ * inode block.
+ */
+static int ocfs2_xattr_ibody_remove(struct inode *inode,
+				    struct buffer_head *di_bh)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	void *block_end = (void *)di + inode->i_sb->s_blocksize;
+	struct ocfs2_xattr_header *header;
+
+	header = (struct ocfs2_xattr_header *)
+		 (block_end - le16_to_cpu(di->i_xattr_inline_size));
+
+	return ocfs2_remove_value_outside(inode, di_bh, header);
+}
+
+/*
+ * ocfs2_xattr_block_remove()
+ *
+ * Release the outside values of all attributes kept in the external
+ * xattr block held by blk_bh.
+ */
+static int ocfs2_xattr_block_remove(struct inode *inode,
+				    struct buffer_head *blk_bh)
+{
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	return ocfs2_remove_value_outside(inode, blk_bh,
+					  &xb->xb_attrs.xb_header);
+}
+
+/*
+ * ocfs2_xattr_remove()
+ *
+ * Free extended attribute resources associated with this inode.
+ *
+ * Outside values referenced from the inode body and from the external
+ * xattr block are truncated first.  Then, in one transaction, the
+ * external block is returned to its allocator, i_xattr_loc is cleared
+ * and the xattr feature flags are dropped from the inode.
+ *
+ * @inode: inode whose attributes are released.
+ * @di_bh: buffer holding the inode's on-disk dinode.
+ *
+ * Returns 0 when the inode has no attributes or everything was freed,
+ * a negative errno otherwise.  When freeing the external block fails
+ * the inode keeps pointing at it and keeps its xattr flags, so the
+ * block is not leaked and the removal can be attempted again.
+ */
+int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
+{
+	struct ocfs2_xattr_block *xb;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	handle_t *handle;
+	int ret;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+		return 0;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_xattr_ibody_remove(inode, di_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+	if (di->i_xattr_loc) {
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       le64_to_cpu(di->i_xattr_loc),
+				       &blk_bh, OCFS2_BH_CACHED, inode);
+		if (ret < 0) {
+			mlog_errno(ret);
+			return ret;
+		}
+		/*Verify the signature of xattr block*/
+		if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+			   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		ret = ocfs2_xattr_block_remove(inode, blk_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (di->i_xattr_loc) {
+		xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+		/*
+		 * Forget the block only once it has really been freed;
+		 * clearing i_xattr_loc after a failed free would leave the
+		 * block allocated with nothing referencing it.
+		 */
+		ret = ocfs2_xattr_free_block(handle, osb, xb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		di->i_xattr_loc = cpu_to_le64(0);
+	}
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	brelse(blk_bh);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_has_space_inline()
+ *
+ * Tell whether the inode body still has s_xattr_inline_size spare bytes
+ * that could be turned into the inline xattr area.
+ *
+ * The spare space is measured against whatever currently uses the tail
+ * of the dinode: inline data, a fast symlink target, or the extent
+ * list (unused records).
+ *
+ * Returns 1 when there is room, 0 otherwise - including when the
+ * configured inline size is below OCFS2_MIN_XATTR_INLINE_SIZE or when
+ * the on-disk fields yield a negative amount of free space.
+ */
+static int ocfs2_xattr_has_space_inline(struct inode *inode,
+					struct ocfs2_dinode *di)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
+	int free;
+
+	if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE)
+		return 0;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		struct ocfs2_inline_data *idata = &di->id2.i_data;
+		free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size);
+	} else if (ocfs2_inode_is_fast_symlink(inode)) {
+		free = ocfs2_fast_symlink_chars(inode->i_sb) -
+			le64_to_cpu(di->i_size);
+	} else {
+		struct ocfs2_extent_list *el = &di->id2.i_list;
+		free = (le16_to_cpu(el->l_count) -
+			le16_to_cpu(el->l_next_free_rec)) *
+			sizeof(struct ocfs2_extent_rec);
+	}
+
+	/*
+	 * 'free' is signed but xattrsize is not: without this guard a
+	 * negative value (inconsistent dinode) would be converted to a
+	 * huge unsigned number below and wrongly count as enough room.
+	 */
+	if (free < 0)
+		return 0;
+
+	if (free >= xattrsize)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * ocfs2_xattr_ibody_find()
+ *
+ * Describe the inline xattr area of the inode in xs and, when the inode
+ * already carries inline attributes, look the named one up there.
+ *
+ * xs is left untouched (and 0 is returned) when the block size is the
+ * minimum one or when no inline area exists and none could be created.
+ * Otherwise xs->not_found receives the lookup result; a lookup error
+ * other than -ENODATA is returned to the caller.
+ */
+static int ocfs2_xattr_ibody_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	unsigned int inline_size;
+	int room;
+	int ret;
+
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+		return 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+		/* No inline area yet: is there room to create one? */
+		down_read(&oi->ip_alloc_sem);
+		room = ocfs2_xattr_has_space_inline(inode, di);
+		up_read(&oi->ip_alloc_sem);
+		if (!room)
+			return 0;
+	}
+
+	/* An existing area has its size on disk, a new one the default. */
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)
+		inline_size = le16_to_cpu(di->i_xattr_inline_size);
+	else
+		inline_size = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
+
+	xs->xattr_bh = xs->inode_bh;
+	xs->end = (void *)di + inode->i_sb->s_blocksize;
+	xs->header = (struct ocfs2_xattr_header *)(xs->end - inline_size);
+	xs->base = (void *)xs->header;
+	xs->here = xs->header->xh_entries;
+
+	/* Nothing to search in an area that does not exist yet. */
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+		return 0;
+
+	/* Find the named attribute. */
+	ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	if (ret && ret != -ENODATA)
+		return ret;
+	xs->not_found = ret;
+
+	return 0;
+}
+
+/*
+ * ocfs2_xattr_ibody_set()
+ *
+ * Set, replace or remove an extended attribute in the inode block.
+ * Returns -ENOSPC when the block size is the minimum one or when the
+ * inode has no inline xattr area and no room to create it.
+ */
+static int ocfs2_xattr_ibody_set(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	int ret;
+
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+		return -ENOSPC;
+
+	down_write(&oi->ip_alloc_sem);
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
+	    !ocfs2_xattr_has_space_inline(inode, di))
+		ret = -ENOSPC;
+	else
+		ret = ocfs2_xattr_set_entry(inode, xi, xs,
+				(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
+
+	up_write(&oi->ip_alloc_sem);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_block_find()
+ *
+ * Read the inode's external xattr block, if it has one, describe it in
+ * xs and look the named attribute up there.
+ *
+ * On success the buffer stays referenced through xs->xattr_bh and
+ * xs->not_found holds the lookup result.  A bad signature yields
+ * -EFAULT; on any error the buffer is released before returning.
+ */
+static int ocfs2_xattr_block_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	int ret = 0;
+
+	/* No external block, nothing to search. */
+	if (!di->i_xattr_loc)
+		return ret;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+			       le64_to_cpu(di->i_xattr_loc),
+			       &blk_bh, OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* Verify the signature of xattr block */
+	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+		ret = -EFAULT;
+		goto release;
+	}
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	xs->xattr_bh = blk_bh;
+	xs->header = &xb->xb_attrs.xb_header;
+	xs->base = (void *)xs->header;
+	xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+	xs->here = xs->header->xh_entries;
+
+	ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	if (ret && ret != -ENODATA) {
+		xs->xattr_bh = NULL;
+		goto release;
+	}
+
+	xs->not_found = ret;
+	return 0;
+
+release:
+	brelse(blk_bh);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_block_set()
+ *
+ * Set, replace or remove an extended attribute into external block.
+ *
+ * When the inode has no external xattr block yet (xs->xattr_bh is
+ * NULL) one metadata block is allocated, initialised as an empty
+ * ocfs2_xattr_block, linked from di->i_xattr_loc and described in xs,
+ * all in one transaction.  The attribute itself is then stored with
+ * ocfs2_xattr_set_entry().
+ *
+ * The new buffer is handed over through xs->xattr_bh; it is released
+ * here only if it could not be handed over.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_xattr_block_set(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle = NULL;
+	struct ocfs2_xattr_block *xblk = NULL;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+	int ret;
+
+	if (!xs->xattr_bh) {
+		/*
+		 * Alloc one external block for extended attribute
+		 * outside of inode.
+		 */
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		handle = ocfs2_start_trans(osb,
+					   OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			goto out;
+		}
+		ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+					   &suballoc_bit_start, &num_got,
+					   &first_blkno);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		new_bh = sb_getblk(inode->i_sb, first_blkno);
+		/* sb_getblk() may fail; do not touch a NULL buffer. */
+		if (!new_bh) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+		ret = ocfs2_journal_access(handle, inode, new_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			/*
+			 * Not yet published in xs->xattr_bh, so nobody else
+			 * will drop this reference for us.
+			 */
+			brelse(new_bh);
+			new_bh = NULL;
+			goto out_commit;
+		}
+
+		/* Initialize ocfs2_xattr_block */
+		xs->xattr_bh = new_bh;
+		xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
+		memset(xblk, 0, inode->i_sb->s_blocksize);
+		strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
+		xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
+		xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+		xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+		xblk->xb_blkno = cpu_to_le64(first_blkno);
+
+		/* Describe the still empty block for ocfs2_xattr_set_entry(). */
+		xs->header = &xblk->xb_attrs.xb_header;
+		xs->base = (void *)xs->header;
+		xs->end = (void *)xblk + inode->i_sb->s_blocksize;
+		xs->here = xs->header->xh_entries;
+
+
+		ret = ocfs2_journal_dirty(handle, new_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		/* Link the new block from the inode. */
+		di->i_xattr_loc = cpu_to_le64(first_blkno);
+		ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+		if (ret < 0)
+			mlog_errno(ret);
+out_commit:
+		ocfs2_commit_trans(osb, handle);
+out:
+		if (meta_ac)
+			ocfs2_free_alloc_context(meta_ac);
+		if (ret < 0)
+			return ret;
+	}
+
+	/* Set extended attribute into external block */
+	ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_set()
+ *
+ * Set, replace or remove an extended attribute for this inode.
+ * value is NULL to remove an existing extended attribute, else either
+ * create or replace an extended attribute.
+ *
+ * @name_index: namespace index of the attribute.
+ * @name:       attribute name, passed on unchanged to the find helpers.
+ * @value:      new value, or NULL to remove the attribute.
+ * @value_len:  length of value in bytes.
+ * @flags:      XATTR_CREATE fails with -EEXIST when the attribute
+ *              exists, XATTR_REPLACE fails with -ENODATA when it does
+ *              not.
+ *
+ * Takes the inode cluster lock exclusively and ip_xattr_sem for
+ * writing for the whole operation.  Returns 0 on success or a negative
+ * errno; removing an attribute that does not exist returns 0 unless
+ * XATTR_REPLACE is set.
+ */
+int ocfs2_xattr_set(struct inode *inode,
+		    int name_index,
+		    const char *name,
+		    const void *value,
+		    size_t value_len,
+		    int flags)
+{
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	int ret;
+
+	struct ocfs2_xattr_info xi = {
+		.name_index = name_index,
+		.name = name,
+		.value = value,
+		.value_len = value_len,
+	};
+
+	/* Search state for the inode body ... */
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+
+	/* ... and for the external xattr block. */
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	xis.inode_bh = xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_write(&OCFS2_I(inode)->ip_xattr_sem);
+	/*
+	 * Scan inode and external block to find the same name
+	 * extended attribute and collect search information.
+	 */
+	ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+	if (ret)
+		goto cleanup;
+	/* The block is only searched when the inode body had no match. */
+	if (xis.not_found) {
+		ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+		if (ret)
+			goto cleanup;
+	}
+
+	if (xis.not_found && xbs.not_found) {
+		/* Attribute does not exist anywhere. */
+		ret = -ENODATA;
+		if (flags & XATTR_REPLACE)
+			goto cleanup;
+		ret = 0;
+		/* Removing a missing attribute is a successful no-op. */
+		if (!value)
+			goto cleanup;
+	} else {
+		/* Attribute exists in the inode body or in the block. */
+		ret = -EEXIST;
+		if (flags & XATTR_CREATE)
+			goto cleanup;
+	}
+
+	if (!value) {
+		/* Remove existing extended attribute */
+		if (!xis.not_found)
+			ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+		else if (!xbs.not_found)
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+	} else {
+		/* We always try to set extended attribute into inode first*/
+		ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+		if (!ret && !xbs.not_found) {
+			/*
+			 * If succeed and that extended attribute existing in
+			 * external block, then we will remove it.
+			 */
+			xi.value = NULL;
+			xi.value_len = 0;
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+		} else if (ret == -ENOSPC) {
+			/*
+			 * The block was not searched above when the
+			 * attribute was found in the inode body; load it now
+			 * so the new value can be placed there.
+			 */
+			if (di->i_xattr_loc && !xbs.xattr_bh) {
+				ret = ocfs2_xattr_block_find(inode, name_index,
+							     name, &xbs);
+				if (ret)
+					goto cleanup;
+			}
+			/*
+			 * If no space in inode, we will set extended attribute
+			 * into external block.
+			 */
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+			if (ret)
+				goto cleanup;
+			if (!xis.not_found) {
+				/*
+				 * If succeed and that extended attribute
+				 * existing in inode, we will remove it.
+				 */
+				xi.value = NULL;
+				xi.value_len = 0;
+				ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+			}
+		}
+	}
+cleanup:
+	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+	ocfs2_inode_unlock(inode, 1);
+	brelse(di_bh);
+	/* Set by ocfs2_xattr_block_find()/_block_set(); NULL otherwise. */
+	brelse(xbs.xattr_bh);
+
+	return ret;
+}
+
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
new file mode 100644
index 00000000000..ed32377be9d
--- /dev/null
+++ b/fs/ocfs2/xattr.h
@@ -0,0 +1,51 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_XATTR_H
+#define OCFS2_XATTR_H
+
+#include <linux/init.h>
+#include <linux/xattr.h>
+
+/*
+ * Name index of an extended attribute, i.e. the namespace it belongs
+ * to.  This is the value passed as name_index to ocfs2_xattr_get() and
+ * ocfs2_xattr_set().  Numbering starts at 1; OCFS2_XATTR_MAX is one
+ * past the last valid index.
+ */
+enum ocfs2_xattr_type {
+	OCFS2_XATTR_INDEX_USER = 1,
+	OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS,
+	OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+	OCFS2_XATTR_INDEX_TRUSTED,
+	OCFS2_XATTR_INDEX_SECURITY,
+	OCFS2_XATTR_MAX
+};
+
+extern struct xattr_handler ocfs2_xattr_user_handler;
+extern struct xattr_handler ocfs2_xattr_trusted_handler;
+
+extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
+extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
+			   size_t, int);
+extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
+extern struct xattr_handler *ocfs2_xattr_handlers[];
+
+#endif /* OCFS2_XATTR_H */
diff --git a/fs/ocfs2/xattr_trusted.c b/fs/ocfs2/xattr_trusted.c
new file mode 100644
index 00000000000..4c589c447aa
--- /dev/null
+++ b/fs/ocfs2/xattr_trusted.c
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr_trusted.c
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is taken from ext3.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+#include "xattr.h"
+
+#define XATTR_TRUSTED_PREFIX "trusted."
+
+/*
+ * Emit "trusted.<name>" plus a terminating NUL into list when a buffer
+ * is supplied and large enough.  Always returns the number of bytes the
+ * full entry needs, terminator included.
+ */
+static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
+				       size_t list_size, const char *name,
+				       size_t name_len)
+{
+	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
+	const size_t total_len = prefix_len + name_len + 1;
+	char *out = list;
+
+	/* Size query, or entry does not fit: report the length only. */
+	if (!list || total_len > list_size)
+		return total_len;
+
+	memcpy(out, XATTR_TRUSTED_PREFIX, prefix_len);
+	out += prefix_len;
+	memcpy(out, name, name_len);
+	out[name_len] = '\0';
+
+	return total_len;
+}
+
+/*
+ * Fetch a "trusted." attribute.  An empty name (nothing after the
+ * prefix) is rejected with -EINVAL.
+ */
+static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
+				   void *buffer, size_t size)
+{
+	if (*name == '\0')
+		return -EINVAL;
+
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
+			       buffer, size);
+}
+
+/*
+ * Store a "trusted." attribute.  An empty name (nothing after the
+ * prefix) is rejected with -EINVAL.
+ */
+static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
+				   const void *value, size_t size, int flags)
+{
+	if (*name == '\0')
+		return -EINVAL;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
+			       size, flags);
+}
+
+/*
+ * Handler for attributes whose names start with XATTR_TRUSTED_PREFIX;
+ * it ties the prefix to the list/get/set callbacks defined above.
+ */
+struct xattr_handler ocfs2_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= ocfs2_xattr_trusted_list,
+	.get	= ocfs2_xattr_trusted_get,
+	.set	= ocfs2_xattr_trusted_set,
+};
diff --git a/fs/ocfs2/xattr_user.c b/fs/ocfs2/xattr_user.c
new file mode 100644
index 00000000000..93ba7163778
--- /dev/null
+++ b/fs/ocfs2/xattr_user.c
@@ -0,0 +1,94 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr_user.c
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is taken from ext3.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+#include "xattr.h"
+
+#define XATTR_USER_PREFIX "user."
+
+static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
+				    size_t list_size, const char *name,
+				    size_t name_len)
+{
+	const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
+	const size_t total_len = prefix_len + name_len + 1;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_USER_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
+				void *buffer, size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return -EOPNOTSUPP;
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
+			       buffer, size);
+}
+
+static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
+				const void *value, size_t size, int flags)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return -EOPNOTSUPP;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
+			       size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= ocfs2_xattr_user_list,
+	.get	= ocfs2_xattr_user_get,
+	.set	= ocfs2_xattr_user_set,
+};
-- 
cgit v1.2.3


From ba492615f0d32d0210b02c14b24512b4372b13d6 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:49 +0800
Subject: ocfs2: Add xattr index tree operations

When necessary, an ocfs2_xattr_block will embed an ocfs2_extent_list to
store large numbers of EAs. This patch adds a new type in
ocfs2_extent_tree_type and adds the implementation so that we can re-use the
b-tree code to handle the storage of many EAs.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/alloc.h | 10 +++++++
 2 files changed, 99 insertions(+)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ace27d1ca57..06ea7913c13 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -177,6 +177,48 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = {
 	.sanity_check		= ocfs2_xattr_value_sanity_check,
 };
 
+static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					     u64 blkno)
+{
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *) et->root_bh->b_data;
+	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+	xt->xt_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *) et->root_bh->b_data;
+	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+	return le64_to_cpu(xt->xt_last_eb_blk);
+}
+
+static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
+					     struct ocfs2_extent_tree *et,
+					     u32 clusters)
+{
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)et->root_bh->b_data;
+
+	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
+}
+
+static int ocfs2_xattr_tree_sanity_check(struct inode *inode,
+					 struct ocfs2_extent_tree *et)
+{
+	return 0;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+	.set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
+	.get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
+	.update_clusters	= ocfs2_xattr_tree_update_clusters,
+	.sanity_check		= ocfs2_xattr_tree_sanity_check,
+};
+
 static struct ocfs2_extent_tree*
 	 ocfs2_new_extent_tree(struct buffer_head *bh,
 			       enum ocfs2_extent_tree_type et_type,
@@ -201,6 +243,11 @@ static struct ocfs2_extent_tree*
 			(struct ocfs2_xattr_value_root *) private;
 		et->root_el = &xv->xr_list;
 		et->eops = &ocfs2_xattr_et_ops;
+	} else if (et_type == OCFS2_XATTR_TREE_EXTENT) {
+		struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)bh->b_data;
+		et->root_el = &xb->xb_attrs.xb_root.xt_list;
+		et->eops = &ocfs2_xattr_tree_et_ops;
 	}
 
 	return et;
@@ -570,6 +617,12 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 
 		last_eb_blk = le64_to_cpu(xv->xr_last_eb_blk);
 		el = &xv->xr_list;
+	} else if (type == OCFS2_XATTR_TREE_EXTENT) {
+		struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+
+		last_eb_blk = le64_to_cpu(xb->xb_attrs.xb_root.xt_last_eb_blk);
+		el = &xb->xb_attrs.xb_root.xt_list;
 	}
 
 	if (last_eb_blk) {
@@ -4397,6 +4450,36 @@ bail:
 	return status;
 }
 
+int ocfs2_xattr_tree_insert_extent(struct ocfs2_super *osb,
+				   handle_t *handle,
+				   struct inode *inode,
+				   struct buffer_head *root_bh,
+				   u32 cpos,
+				   u64 start_blk,
+				   u32 new_clusters,
+				   u8 flags,
+				   struct ocfs2_alloc_context *meta_ac)
+{
+	int status;
+	struct ocfs2_extent_tree *et = NULL;
+
+	et = ocfs2_new_extent_tree(root_bh, OCFS2_XATTR_TREE_EXTENT, NULL);
+	if (!et) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
+				     cpos, start_blk, new_clusters,
+				     flags, meta_ac, et);
+
+	if (et)
+		ocfs2_free_extent_tree(et);
+bail:
+	return status;
+}
+
 /*
  * Allcate and add clusters into the extent b-tree.
  * The new clusters(clusters_to_add) will be inserted at logical_offset.
@@ -4482,6 +4565,12 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 		status = ocfs2_dinode_insert_extent(osb, handle, inode, root_bh,
 						    *logical_offset, block,
 						    num_bits, flags, meta_ac);
+	else if (type == OCFS2_XATTR_TREE_EXTENT)
+		status = ocfs2_xattr_tree_insert_extent(osb, handle,
+							inode, root_bh,
+							*logical_offset,
+							block, num_bits, flags,
+							meta_ac);
 	else
 		status = ocfs2_xattr_value_insert_extent(osb, handle,
 							 inode, root_bh,
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index ec7baeb2ea7..cd4e12d2b6b 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -29,6 +29,7 @@
 enum ocfs2_extent_tree_type {
 	OCFS2_DINODE_EXTENT = 0,
 	OCFS2_XATTR_VALUE_EXTENT,
+	OCFS2_XATTR_TREE_EXTENT,
 };
 
 struct ocfs2_alloc_context;
@@ -51,6 +52,15 @@ int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
 				    u8 flags,
 				    struct ocfs2_alloc_context *meta_ac,
 				    void *private);
+int ocfs2_xattr_tree_insert_extent(struct ocfs2_super *osb,
+				   handle_t *handle,
+				   struct inode *inode,
+				   struct buffer_head *root_bh,
+				   u32 cpos,
+				   u64 start_blk,
+				   u32 new_clusters,
+				   u8 flags,
+				   struct ocfs2_alloc_context *meta_ac);
 enum ocfs2_alloc_restarted {
 	RESTART_NONE = 0,
 	RESTART_TRANS,
-- 
cgit v1.2.3


From 0c044f0b24b9128ba8c297149d88bd81f2e36af3 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:50 +0800
Subject: ocfs2: Add xattr bucket iteration for large numbers of EAs

Ocfs2 breaks up xattr index tree leaves into 4k regions, called buckets.
Attributes are stored within a given bucket, depending on hash value.

After a discussion with Mark, we decided that the per-bucket index
(xe_entry[]) would only exist in the 1st block of a bucket. Likewise,
name/value pairs will not straddle more than one block. This allows the
majority of operations to work directly on the buffer heads in a leaf block.

This patch adds code to iterate the buckets in an EA. A new abstraction,
ocfs2_xattr_bucket, is added. It records the bhs in this bucket and the
ocfs2_xattr_header. This keeps the code neat, improving readability.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2_fs.h |  35 +++++++-
 fs/ocfs2/xattr.c    | 255 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/xattr.h    |   9 ++
 3 files changed, 293 insertions(+), 6 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 98e1f8bba0e..8d5e72f2c5c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -755,8 +755,13 @@ struct ocfs2_xattr_header {
 	__le16	xh_count;                       /* contains the count of how
 						   many records are in the
 						   local xattr storage. */
-	__le16	xh_reserved1;
-	__le32	xh_reserved2;
+	__le16	xh_free_start;                  /* current offset for storing
+						   xattr. */
+	__le16	xh_name_value_len;              /* total length of name/value
+						   length in this bucket. */
+	__le16	xh_num_buckets;                 /* bucket nums in one extent
+						   record, only valid in the
+						   first bucket. */
 	__le64  xh_csum;
 	struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
 };
@@ -793,6 +798,10 @@ struct ocfs2_xattr_tree_root {
 #define OCFS2_XATTR_SIZE(size)	(((size) + OCFS2_XATTR_ROUND) & \
 				~(OCFS2_XATTR_ROUND))
 
+#define OCFS2_XATTR_BUCKET_SIZE			4096
+#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET 	(OCFS2_XATTR_BUCKET_SIZE \
+						 / OCFS2_MIN_BLOCKSIZE)
+
 /*
  * On disk structure for xattr block.
  */
@@ -963,6 +972,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
 	return 0;
 
 }
+
+static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_xattr_block,
+			 xb_attrs.xb_root.xt_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
 #else
 static inline int ocfs2_fast_symlink_chars(int blocksize)
 {
@@ -1046,6 +1066,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
 
 	return 0;
 }
+
+static inline int ocfs2_xattr_recs_per_xb(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_xattr_block,
+			 xb_attrs.xb_root.xt_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
 #endif  /* __KERNEL__ */
 
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 67bebd9259e..fb17f7fe4c6 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -52,6 +52,7 @@
 #include "suballoc.h"
 #include "uptodate.h"
 #include "buffer_head_io.h"
+#include "super.h"
 #include "xattr.h"
 
 
@@ -60,6 +61,11 @@ struct ocfs2_xattr_def_value_root {
 	struct ocfs2_extent_rec		er;
 };
 
+struct ocfs2_xattr_bucket {
+	struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+	struct ocfs2_xattr_header *xh;
+};
+
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
 #define OCFS2_XATTR_INLINE_SIZE	80
 
@@ -99,6 +105,11 @@ struct ocfs2_xattr_search {
 	int not_found;
 };
 
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+					struct ocfs2_xattr_tree_root *xt,
+					char *buffer,
+					size_t buffer_size);
+
 static inline struct xattr_handler *ocfs2_xattr_handler(int name_index)
 {
 	struct xattr_handler *handler = NULL;
@@ -483,7 +494,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 				  size_t buffer_size)
 {
 	struct buffer_head *blk_bh = NULL;
-	struct ocfs2_xattr_header *header = NULL;
+	struct ocfs2_xattr_block *xb;
 	int ret = 0;
 
 	if (!di->i_xattr_loc)
@@ -503,10 +514,17 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 		goto cleanup;
 	}
 
-	header = &((struct ocfs2_xattr_block *)blk_bh->b_data)->
-		 xb_attrs.xb_header;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 
-	ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+		ret = ocfs2_xattr_list_entries(inode, header,
+					       buffer, buffer_size);
+	} else {
+		struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+		ret = ocfs2_xattr_tree_list_index_block(inode, xt,
+						   buffer, buffer_size);
+	}
 cleanup:
 	brelse(blk_bh);
 
@@ -1923,3 +1941,232 @@ cleanup:
 	return ret;
 }
 
+/*
+ * Find the xattr extent rec which may contains name_hash.
+ * e_cpos will be the first name hash of the xattr rec.
+ * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list.
+ */
+static int ocfs2_xattr_get_rec(struct inode *inode,
+			       u32 name_hash,
+			       u64 *p_blkno,
+			       u32 *e_cpos,
+			       u32 *num_clusters,
+			       struct ocfs2_extent_list *el)
+{
+	int ret = 0, i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec = NULL;
+	u64 e_blkno = 0;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "xattr tree block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= name_hash) {
+			e_blkno = le64_to_cpu(rec->e_blkno);
+			break;
+		}
+	}
+
+	if (!e_blkno) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0) in xattr", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	*p_blkno = le64_to_cpu(rec->e_blkno);
+	*num_clusters = le16_to_cpu(rec->e_leaf_clusters);
+	if (e_cpos)
+		*e_cpos = le32_to_cpu(rec->e_cpos);
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+typedef int (xattr_bucket_func)(struct inode *inode,
+				struct ocfs2_xattr_bucket *bucket,
+				void *para);
+
+static int ocfs2_iterate_xattr_buckets(struct inode *inode,
+				       u64 blkno,
+				       u32 clusters,
+				       xattr_bucket_func *func,
+				       void *para)
+{
+	int i, j, ret = 0;
+	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+	u32 num_buckets = clusters * bpc;
+	struct ocfs2_xattr_bucket bucket;
+
+	memset(&bucket, 0, sizeof(bucket));
+
+	mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
+	     clusters, blkno);
+
+	for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
+		ret = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
+					blkno, blk_per_bucket,
+					bucket.bhs, OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
+		/*
+		 * The real bucket num in this series of blocks is stored
+		 * in the 1st bucket.
+		 */
+		if (i == 0)
+			num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
+
+		mlog(0, "iterating xattr bucket %llu\n", blkno);
+		if (func) {
+			ret = func(inode, &bucket, para);
+			if (ret) {
+				mlog_errno(ret);
+				break;
+			}
+		}
+
+		for (j = 0; j < blk_per_bucket; j++)
+			brelse(bucket.bhs[j]);
+		memset(&bucket, 0, sizeof(bucket));
+	}
+
+out:
+	for (j = 0; j < blk_per_bucket; j++)
+		brelse(bucket.bhs[j]);
+
+	return ret;
+}
+
+struct ocfs2_xattr_tree_list {
+	char *buffer;
+	size_t buffer_size;
+};
+
+static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+					     struct ocfs2_xattr_header *xh,
+					     int index,
+					     int *block_off,
+					     int *new_offset)
+{
+	u16 name_offset;
+
+	if (index < 0 || index >= le16_to_cpu(xh->xh_count))
+		return -EINVAL;
+
+	name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
+
+	*block_off = name_offset >> inode->i_sb->s_blocksize_bits;
+	*new_offset = name_offset % inode->i_sb->s_blocksize;
+
+	return 0;
+}
+
+static int ocfs2_list_xattr_bucket(struct inode *inode,
+				   struct ocfs2_xattr_bucket *bucket,
+				   void *para)
+{
+	int ret = 0;
+	struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
+	size_t size;
+	int i, block_off, new_offset;
+
+	for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
+		struct xattr_handler *handler =
+			ocfs2_xattr_handler(ocfs2_xattr_get_type(entry));
+
+		if (handler) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode,
+								bucket->xh,
+								i,
+								&block_off,
+								&new_offset);
+			if (ret)
+				break;
+			size = handler->list(inode, xl->buffer, xl->buffer_size,
+					     bucket->bhs[block_off]->b_data +
+					     new_offset,
+					     entry->xe_name_len);
+			if (xl->buffer) {
+				if (size > xl->buffer_size)
+					return -ERANGE;
+				xl->buffer += size;
+			}
+			xl->buffer_size -= size;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+					     struct ocfs2_xattr_tree_root *xt,
+					     char *buffer,
+					     size_t buffer_size)
+{
+	struct ocfs2_extent_list *el = &xt->xt_list;
+	int ret = 0;
+	u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
+	u64 p_blkno = 0;
+	struct ocfs2_xattr_tree_list xl = {
+		.buffer = buffer,
+		.buffer_size = buffer_size,
+	};
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return 0;
+
+	while (name_hash > 0) {
+		ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+					  &e_cpos, &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
+						  ocfs2_list_xattr_bucket,
+						  &xl);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (e_cpos == 0)
+			break;
+
+		name_hash = e_cpos - 1;
+	}
+
+	ret = buffer_size - xl.buffer_size;
+out:
+	return ret;
+}
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index ed32377be9d..02afa87d5e6 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -48,4 +48,13 @@ extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
 extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 
+static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+{
+	return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
+}
+
+static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
+{
+	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
+}
 #endif /* OCFS2_XATTR_H */
-- 
cgit v1.2.3


From 589dc2602f2a1b7fa5e59b90f548af189f128d77 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:51 +0800
Subject: ocfs2: Add xattr lookup code xattr btrees

Add code to look up a given extended attribute in the xattr btree. Lookup
follows this general scheme:

1. Use ocfs2_xattr_get_rec to find the xattr extent record

2. Find the xattr bucket within the extent which may contain this xattr

3. Iterate the bucket to find the xattr. In ocfs2_xattr_block_get(), we need
   to recalculate the block offset and name offset for the right position of
   name/value.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 351 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 328 insertions(+), 23 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index fb17f7fe4c6..acccdfabd2d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -99,12 +99,25 @@ struct ocfs2_xattr_search {
 	 */
 	struct buffer_head *xattr_bh;
 	struct ocfs2_xattr_header *header;
+	struct ocfs2_xattr_bucket bucket;
 	void *base;
 	void *end;
 	struct ocfs2_xattr_entry *here;
 	int not_found;
 };
 
+static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+					     struct ocfs2_xattr_header *xh,
+					     int index,
+					     int *block_off,
+					     int *new_offset);
+
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+					struct buffer_head *root_bh,
+					int name_index,
+					const char *name,
+					struct ocfs2_xattr_search *xs);
+
 static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
 					struct ocfs2_xattr_tree_root *xt,
 					char *buffer,
@@ -604,7 +617,7 @@ static int ocfs2_xattr_find_entry(int name_index,
 }
 
 static int ocfs2_xattr_get_value_outside(struct inode *inode,
-					 struct ocfs2_xattr_search *xs,
+					 struct ocfs2_xattr_value_root *xv,
 					 void *buffer,
 					 size_t len)
 {
@@ -613,12 +626,8 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
 	int i, ret = 0;
 	size_t cplen, blocksize;
 	struct buffer_head *bh = NULL;
-	struct ocfs2_xattr_value_root *xv;
 	struct ocfs2_extent_list *el;
 
-	xv = (struct ocfs2_xattr_value_root *)
-		(xs->base + le16_to_cpu(xs->here->xe_name_offset) +
-		OCFS2_XATTR_SIZE(xs->here->xe_name_len));
 	el = &xv->xr_list;
 	clusters = le32_to_cpu(xv->xr_clusters);
 	bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
@@ -668,6 +677,7 @@ static int ocfs2_xattr_ibody_get(struct inode *inode,
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct ocfs2_xattr_value_root *xv;
 	size_t size;
 	int ret = 0;
 
@@ -692,7 +702,11 @@ static int ocfs2_xattr_ibody_get(struct inode *inode,
 			       le16_to_cpu(xs->here->xe_name_offset) +
 			       OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
 		} else {
-			ret = ocfs2_xattr_get_value_outside(inode, xs,
+			xv = (struct ocfs2_xattr_value_root *)
+				(xs->base + le16_to_cpu(
+				 xs->here->xe_name_offset) +
+				OCFS2_XATTR_SIZE(xs->here->xe_name_len));
+			ret = ocfs2_xattr_get_value_outside(inode, xv,
 							    buffer, size);
 			if (ret < 0) {
 				mlog_errno(ret);
@@ -714,12 +728,15 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	struct buffer_head *blk_bh = NULL;
 	struct ocfs2_xattr_block *xb;
+	struct ocfs2_xattr_value_root *xv;
 	size_t size;
-	int ret = -ENODATA;
+	int ret = -ENODATA, name_offset, name_len, block_off, i;
 
 	if (!di->i_xattr_loc)
 		return ret;
 
+	memset(&xs->bucket, 0, sizeof(xs->bucket));
+
 	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
 			       le64_to_cpu(di->i_xattr_loc),
 			       &blk_bh, OCFS2_BH_CACHED, inode);
@@ -736,12 +753,19 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 
 	xs->xattr_bh = blk_bh;
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-	xs->header = &xb->xb_attrs.xb_header;
-	xs->base = (void *)xs->header;
-	xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
-	xs->here = xs->header->xh_entries;
 
-	ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		xs->header = &xb->xb_attrs.xb_header;
+		xs->base = (void *)xs->header;
+		xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+		xs->here = xs->header->xh_entries;
+
+		ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	} else
+		ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+						   name_index,
+						   name, xs);
+
 	if (ret)
 		goto cleanup;
 	size = le64_to_cpu(xs->here->xe_value_size);
@@ -749,12 +773,26 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 		ret = -ERANGE;
 		if (size > buffer_size)
 			goto cleanup;
+
+		name_offset = le16_to_cpu(xs->here->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len);
+		i = xs->here - xs->header->xh_entries;
+
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode,
+								xs->bucket.xh,
+								i,
+								&block_off,
+								&name_offset);
+			xs->base = xs->bucket.bhs[block_off]->b_data;
+		}
 		if (ocfs2_xattr_is_local(xs->here)) {
 			memcpy(buffer, (void *)xs->base +
-			       le16_to_cpu(xs->here->xe_name_offset) +
-			       OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
+			       name_offset + name_len, size);
 		} else {
-			ret = ocfs2_xattr_get_value_outside(inode, xs,
+			xv = (struct ocfs2_xattr_value_root *)
+				(xs->base + name_offset + name_len);
+			ret = ocfs2_xattr_get_value_outside(inode, xv,
 							    buffer, size);
 			if (ret < 0) {
 				mlog_errno(ret);
@@ -764,8 +802,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	}
 	ret = size;
 cleanup:
-	brelse(blk_bh);
+	for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
+		brelse(xs->bucket.bhs[i]);
+	memset(&xs->bucket, 0, sizeof(xs->bucket));
 
+	brelse(blk_bh);
 	return ret;
 }
 
@@ -1679,6 +1720,7 @@ static int ocfs2_xattr_block_find(struct inode *inode,
 {
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
 	int ret = 0;
 
 	if (!di->i_xattr_loc)
@@ -1699,20 +1741,26 @@ static int ocfs2_xattr_block_find(struct inode *inode,
 	}
 
 	xs->xattr_bh = blk_bh;
-	xs->header = &((struct ocfs2_xattr_block *)blk_bh->b_data)->
-			xb_attrs.xb_header;
-	xs->base = (void *)xs->header;
-	xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
-	xs->here = xs->header->xh_entries;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		xs->header = &xb->xb_attrs.xb_header;
+		xs->base = (void *)xs->header;
+		xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+		xs->here = xs->header->xh_entries;
+
+		ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	} else
+		ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+						   name_index,
+						   name, xs);
 
-	ret = ocfs2_xattr_find_entry(name_index, name, xs);
 	if (ret && ret != -ENODATA) {
 		xs->xattr_bh = NULL;
 		goto cleanup;
 	}
 	xs->not_found = ret;
 	return 0;
-
 cleanup:
 	brelse(blk_bh);
 
@@ -1941,6 +1989,18 @@ cleanup:
 	return ret;
 }
 
+static inline u32 ocfs2_xattr_hash_by_name(struct inode *inode,
+					   int name_index,
+					   const char *suffix_name)
+{
+	struct xattr_handler *handler = ocfs2_xattr_handler(name_index);
+	char *prefix = handler->prefix;
+	int prefix_len = strlen(handler->prefix);
+
+	return ocfs2_xattr_name_hash(inode, prefix, prefix_len,
+				     (char *)suffix_name, strlen(suffix_name));
+}
+
 /*
  * Find the xattr extent rec which may contains name_hash.
  * e_cpos will be the first name hash of the xattr rec.
@@ -2010,6 +2070,251 @@ typedef int (xattr_bucket_func)(struct inode *inode,
 				struct ocfs2_xattr_bucket *bucket,
 				void *para);
 
+static int ocfs2_find_xe_in_bucket(struct inode *inode,
+				   struct buffer_head *header_bh,
+				   int name_index,
+				   const char *name,
+				   u32 name_hash,
+				   u16 *xe_index,
+				   int *found)
+{
+	int i, ret = 0, cmp = 1, block_off, new_offset;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	size_t name_len = strlen(name);
+	struct ocfs2_xattr_entry *xe = NULL;
+	struct buffer_head *name_bh = NULL;
+	char *xe_name;
+
+	/*
+	 * We don't use binary search in the bucket because there
+	 * may be multiple entries with the same name hash.
+	 */
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+
+		if (name_hash > le32_to_cpu(xe->xe_name_hash))
+			continue;
+		else if (name_hash < le32_to_cpu(xe->xe_name_hash))
+			break;
+
+		cmp = name_index - ocfs2_xattr_get_type(xe);
+		if (!cmp)
+			cmp = name_len - xe->xe_name_len;
+		if (cmp)
+			continue;
+
+		ret = ocfs2_xattr_bucket_get_name_value(inode,
+							xh,
+							i,
+							&block_off,
+							&new_offset);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       header_bh->b_blocknr + block_off,
+				       &name_bh, OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+		xe_name = name_bh->b_data + new_offset;
+
+		cmp = memcmp(name, xe_name, name_len);
+		brelse(name_bh);
+		name_bh = NULL;
+
+		if (cmp == 0) {
+			*xe_index = i;
+			*found = 1;
+			ret = 0;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Find the specified xattr entry in a series of buckets.
+ * This series start from p_blkno and last for num_clusters.
+ * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains
+ * the num of the valid buckets.
+ *
+ * Return the buffer_head this xattr should reside in. And if the xattr's
+ * hash is in the gap of 2 buckets, return the lower bucket.
+ */
+static int ocfs2_xattr_bucket_find(struct inode *inode,
+				   int name_index,
+				   const char *name,
+				   u32 name_hash,
+				   u64 p_blkno,
+				   u32 first_hash,
+				   u32 num_clusters,
+				   struct ocfs2_xattr_search *xs)
+{
+	int ret, found = 0;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *lower_bh = NULL;
+	struct ocfs2_xattr_header *xh = NULL;
+	struct ocfs2_xattr_entry *xe = NULL;
+	u16 index = 0;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int low_bucket = 0, bucket, high_bucket;
+	u32 last_hash;
+	u64 blkno;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno,
+			       &bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xh = (struct ocfs2_xattr_header *)bh->b_data;
+	high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
+
+	while (low_bucket <= high_bucket) {
+		brelse(bh);
+		bh = NULL;
+		bucket = (low_bucket + high_bucket) / 2;
+
+		blkno = p_blkno + bucket * blk_per_bucket;
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
+				       &bh, OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		xh = (struct ocfs2_xattr_header *)bh->b_data;
+		xe = &xh->xh_entries[0];
+		if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
+			high_bucket = bucket - 1;
+			continue;
+		}
+
+		/*
+		 * Check whether the hash of the last entry in our
+		 * bucket is larger than the search one.
+		 */
+		xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
+		last_hash = le32_to_cpu(xe->xe_name_hash);
+
+		/* record lower_bh which may be the insert place. */
+		brelse(lower_bh);
+		lower_bh = bh;
+		bh = NULL;
+
+		if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
+			low_bucket = bucket + 1;
+			continue;
+		}
+
+		/* the searched xattr should reside in this bucket if exists. */
+		ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
+					      name_index, name, name_hash,
+					      &index, &found);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		break;
+	}
+
+	/*
+	 * Record the bucket we have found.
+	 * When the xattr's hash value is in the gap of 2 buckets, we will
+	 * always set it to the previous bucket.
+	 */
+	if (!lower_bh) {
+		/*
+		 * We can't find any bucket whose first name_hash is less
+		 * than the find name_hash.
+		 */
+		BUG_ON(bh->b_blocknr != p_blkno);
+		lower_bh = bh;
+		bh = NULL;
+	}
+	xs->bucket.bhs[0] = lower_bh;
+	xs->bucket.xh = (struct ocfs2_xattr_header *)
+					xs->bucket.bhs[0]->b_data;
+	lower_bh = NULL;
+
+	xs->header = xs->bucket.xh;
+	xs->base = xs->bucket.bhs[0]->b_data;
+	xs->end = xs->base + inode->i_sb->s_blocksize;
+
+	if (found) {
+		/*
+		 * If we have found the xattr enty, read all the blocks in
+		 * this bucket.
+		 */
+		ret = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
+					xs->bucket.bhs[0]->b_blocknr + 1,
+					blk_per_bucket - 1, &xs->bucket.bhs[1],
+					OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		xs->here = &xs->header->xh_entries[index];
+		mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
+		     (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
+	} else
+		ret = -ENODATA;
+
+out:
+	brelse(bh);
+	brelse(lower_bh);
+	return ret;
+}
+
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+					struct buffer_head *root_bh,
+					int name_index,
+					const char *name,
+					struct ocfs2_xattr_search *xs)
+{
+	int ret;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+	struct ocfs2_extent_list *el = &xb_root->xt_list;
+	u64 p_blkno = 0;
+	u32 first_hash, num_clusters = 0;
+	u32 name_hash = ocfs2_xattr_hash_by_name(inode, name_index, name);
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return -ENODATA;
+
+	mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n",
+	     name, name_hash, name_index);
+
+	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
+				  &num_clusters, el);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
+
+	mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
+	     "in the rec is %u\n", num_clusters, p_blkno, first_hash);
+
+	ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
+				      p_blkno, first_hash, num_clusters, xs);
+
+out:
+	return ret;
+}
+
 static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 				       u64 blkno,
 				       u32 clusters,
-- 
cgit v1.2.3


From ca12b7c48942d21b2e7890b820db9d578bc291cd Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:52 +0800
Subject: ocfs2: Optionally limit extent size in ocfs2_insert_extent()

In xattr bucket, we want to limit the maximum size of a btree leaf,
otherwise we'll lose the benefits of hashing because we'll have to search
large leaves.

So add a new field in ocfs2_extent_tree which indicates the maximum leaf cluster
size we want so that we can prevent ocfs2_insert_extent() from merging the leaf
record even if it is contiguous with an adjacent record.

Other btree types are not affected by this change.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 39 ++++++++++++++++++++++++++++++---------
 fs/ocfs2/alloc.h |  5 +++++
 2 files changed, 35 insertions(+), 9 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 06ea7913c13..f65cb43edb7 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -79,6 +79,7 @@ struct ocfs2_extent_tree {
 	struct buffer_head *root_bh;
 	struct ocfs2_extent_list *root_el;
 	void *private;
+	unsigned int max_leaf_clusters;
 };
 
 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
@@ -220,7 +221,8 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 };
 
 static struct ocfs2_extent_tree*
-	 ocfs2_new_extent_tree(struct buffer_head *bh,
+	 ocfs2_new_extent_tree(struct inode *inode,
+			       struct buffer_head *bh,
 			       enum ocfs2_extent_tree_type et_type,
 			       void *private)
 {
@@ -248,6 +250,8 @@ static struct ocfs2_extent_tree*
 			(struct ocfs2_xattr_block *)bh->b_data;
 		et->root_el = &xb->xb_attrs.xb_root.xt_list;
 		et->eops = &ocfs2_xattr_tree_et_ops;
+		et->max_leaf_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+						OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
 	}
 
 	return et;
@@ -4109,7 +4113,8 @@ out:
 static void ocfs2_figure_contig_type(struct inode *inode,
 				     struct ocfs2_insert_type *insert,
 				     struct ocfs2_extent_list *el,
-				     struct ocfs2_extent_rec *insert_rec)
+				     struct ocfs2_extent_rec *insert_rec,
+				     struct ocfs2_extent_tree *et)
 {
 	int i;
 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -4125,6 +4130,20 @@ static void ocfs2_figure_contig_type(struct inode *inode,
 		}
 	}
 	insert->ins_contig = contig_type;
+
+	if (insert->ins_contig != CONTIG_NONE) {
+		struct ocfs2_extent_rec *rec =
+				&el->l_recs[insert->ins_contig_index];
+		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
+				   le16_to_cpu(insert_rec->e_leaf_clusters);
+
+		/*
+		 * Caller might want us to limit the size of extents, don't
+		 * calculate contiguousness if we might exceed that limit.
+		 */
+		if (et->max_leaf_clusters && len > et->max_leaf_clusters)
+			insert->ins_contig = CONTIG_NONE;
+	}
 }
 
 /*
@@ -4232,7 +4251,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		le16_to_cpu(el->l_next_free_rec);
 
 	if (!insert->ins_tree_depth) {
-		ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+		ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
 		ocfs2_figure_appending_type(insert, el, insert_rec);
 		return 0;
 	}
@@ -4266,7 +4285,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
          *     into two types of appends: simple record append, or a
          *     rotate inside the tail leaf.
 	 */
-	ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+	ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
 
 	/*
 	 * The insert code isn't quite ready to deal with all cases of
@@ -4402,7 +4421,7 @@ int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
 	int status;
 	struct ocfs2_extent_tree *et = NULL;
 
-	et = ocfs2_new_extent_tree(root_bh, OCFS2_DINODE_EXTENT, NULL);
+	et = ocfs2_new_extent_tree(inode, root_bh, OCFS2_DINODE_EXTENT, NULL);
 	if (!et) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -4433,7 +4452,8 @@ int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
 	int status;
 	struct ocfs2_extent_tree *et = NULL;
 
-	et = ocfs2_new_extent_tree(root_bh, OCFS2_XATTR_VALUE_EXTENT, private);
+	et = ocfs2_new_extent_tree(inode, root_bh,
+				   OCFS2_XATTR_VALUE_EXTENT, private);
 	if (!et) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -4463,7 +4483,8 @@ int ocfs2_xattr_tree_insert_extent(struct ocfs2_super *osb,
 	int status;
 	struct ocfs2_extent_tree *et = NULL;
 
-	et = ocfs2_new_extent_tree(root_bh, OCFS2_XATTR_TREE_EXTENT, NULL);
+	et = ocfs2_new_extent_tree(inode, root_bh, OCFS2_XATTR_TREE_EXTENT,
+				   NULL);
 	if (!et) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -4879,7 +4900,7 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 		goto out;
 	}
 
-	et = ocfs2_new_extent_tree(root_bh, et_type, private);
+	et = ocfs2_new_extent_tree(inode, root_bh, et_type, private);
 	if (!et) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -5177,7 +5198,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 	struct ocfs2_path *path = NULL;
 	struct ocfs2_extent_tree *et = NULL;
 
-	et = ocfs2_new_extent_tree(root_bh, et_type, private);
+	et = ocfs2_new_extent_tree(inode, root_bh, et_type, private);
 	if (!et) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cd4e12d2b6b..23c695ddaa5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -32,6 +32,11 @@ enum ocfs2_extent_tree_type {
 	OCFS2_XATTR_TREE_EXTENT,
 };
 
+/*
+ * For xattr tree leaf, we limit the leaf byte size to be 64K.
+ */
+#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
+
 struct ocfs2_alloc_context;
 int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
 			       handle_t *handle,
-- 
cgit v1.2.3


From 012255961c9ecfe22b7a1df47ac26ab37818cb1e Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:53 +0800
Subject: ocfs2: Enable xattr set in index btree

Where the previous patches added the ability of list/get xattr in buckets
for ocfs2, this patch enables ocfs2 to store large numbers of EAs.

The original design doc is written by Mark Fasheh, and it can be found in
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/IndexedEATrees. I only had to
make small modifications to it.

First, because the bucket size is 4K, a new field named xh_free_start is added
in ocfs2_xattr_header to indicate the next valid name/value offset in a bucket.
It is used when we store new EA name/value. With this field, we can find the
place more quickly and what's more, we don't need to sort the name/value every
time to let the last entry indicate the next unused space. This makes the
insert operation more efficient for blocksizes smaller than 4k.

Because of the new xh_free_start, another field named as xh_name_value_len is
also added in ocfs2_xattr_header. It records the total length of all the
name/values in the bucket. We need this so that we can check it and defragment
the bucket if there is not enough contiguous free space.

An xattr insertion looks like this:
1. xattr_index_block_find: find the right bucket by the name_hash, say bucketA.
2. check whether there is enough space in bucketA. If yes, insert it directly
   and modify xh_free_start and xh_name_value_len accordingly. If not, check
   xh_name_value_len to see whether we can store this by defragmenting the
   bucket. If yes, defragment it and go on with the insertion.
3. If defragmentation doesn't work, check whether there is a new empty bucket
   in the clusters within this extent record. If yes, init the new bucket and
   move all the buckets after bucketA one by one to the next bucket. Move half
   of the entries in bucketA to the next bucket and go on with the insertion.
4. If there is no new bucket, grow the extent tree.

As for xattr deletion, we will delete an xattr bucket when all its xattrs
are removed and move all the buckets after it to the previous one. When all
the xattr buckets in an extent record are freed, free this extent record
from ocfs2_xattr_tree.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 2267 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/xattr.h |    8 +
 2 files changed, 2273 insertions(+), 2 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index acccdfabd2d..5e8fae94888 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -36,6 +36,7 @@
 #include <linux/mount.h>
 #include <linux/writeback.h>
 #include <linux/falloc.h>
+#include <linux/sort.h>
 
 #define MLOG_MASK_PREFIX ML_XATTR
 #include <cluster/masklog.h>
@@ -123,6 +124,13 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
 					char *buffer,
 					size_t buffer_size);
 
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+					  struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+					     struct ocfs2_xattr_info *xi,
+					     struct ocfs2_xattr_search *xs);
+
 static inline struct xattr_handler *ocfs2_xattr_handler(int name_index)
 {
 	struct xattr_handler *handler = NULL;
@@ -1767,6 +1775,52 @@ cleanup:
 	return ret;
 }
 
+/*
+ * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree
+ * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header
+ * re-initialized.
+ */
+static int ocfs2_restore_xattr_block(struct inode *inode,
+				     struct ocfs2_xattr_search *xs)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+	u16 xb_flags = le16_to_cpu(xb->xb_flags);
+
+	BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
+		le16_to_cpu(el->l_next_free_rec) != 0);
+
+	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
+	       offsetof(struct ocfs2_xattr_block, xb_attrs));
+
+	xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
+
+	ocfs2_journal_dirty(handle, xs->xattr_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
 /*
  * ocfs2_xattr_block_set()
  *
@@ -1862,10 +1916,25 @@ out:
 			ocfs2_free_alloc_context(meta_ac);
 		if (ret < 0)
 			return ret;
+	} else
+		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+
+	if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		/* Set extended attribute into external block */
+		ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+		if (!ret || ret != -ENOSPC)
+			goto end;
+
+		ret = ocfs2_xattr_create_index_block(inode, xs);
+		if (ret)
+			goto end;
 	}
 
-	/* Set extended attribute into external block */
-	ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
+	if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
+		ret = ocfs2_restore_xattr_block(inode, xs);
+
+end:
 
 	return ret;
 }
@@ -1887,6 +1956,7 @@ int ocfs2_xattr_set(struct inode *inode,
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
 	int ret;
+	u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
 	struct ocfs2_xattr_info xi = {
 		.name_index = name_index,
@@ -1985,6 +2055,8 @@ cleanup:
 	ocfs2_inode_unlock(inode, 1);
 	brelse(di_bh);
 	brelse(xbs.xattr_bh);
+	for (i = 0; i < blk_per_bucket; i++)
+		brelse(xbs.bucket.bhs[i]);
 
 	return ret;
 }
@@ -2475,3 +2547,2194 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
 out:
 	return ret;
 }
+
+static int cmp_xe(const void *a, const void *b)
+{
+	const struct ocfs2_xattr_entry *l = a, *r = b;
+	u32 l_hash = le32_to_cpu(l->xe_name_hash);
+	u32 r_hash = le32_to_cpu(r->xe_name_hash);
+
+	if (l_hash > r_hash)
+		return 1;
+	if (l_hash < r_hash)
+		return -1;
+	return 0;
+}
+
+static void swap_xe(void *a, void *b, int size)
+{
+	struct ocfs2_xattr_entry *l = a, *r = b, tmp;
+
+	tmp = *l;
+	memcpy(l, r, sizeof(struct ocfs2_xattr_entry));
+	memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry));
+}
+
+/*
+ * When the ocfs2_xattr_block is filled up, new bucket will be created
+ * and all the xattr entries will be moved to the new bucket.
+ * Note: we need to sort the entries since they are not saved in order
+ * in the ocfs2_xattr_block.
+ */
+static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
+					   struct buffer_head *xb_bh,
+					   struct buffer_head *xh_bh,
+					   struct buffer_head *data_bh)
+{
+	int i, blocksize = inode->i_sb->s_blocksize;
+	u16 offset, size, off_change;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_block *xb =
+				(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
+	struct ocfs2_xattr_header *xh =
+				(struct ocfs2_xattr_header *)xh_bh->b_data;
+	u16 count = le16_to_cpu(xb_xh->xh_count);
+	char *target = xh_bh->b_data, *src = xb_bh->b_data;
+
+	mlog(0, "cp xattr from block %llu to bucket %llu\n",
+	     (unsigned long long)xb_bh->b_blocknr,
+	     (unsigned long long)xh_bh->b_blocknr);
+
+	memset(xh_bh->b_data, 0, blocksize);
+	if (data_bh)
+		memset(data_bh->b_data, 0, blocksize);
+	/*
+	 * Since the xe_name_offset is based on ocfs2_xattr_header,
+	 * there is a offset change corresponding to the change of
+	 * ocfs2_xattr_header's position.
+	 */
+	off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	xe = &xb_xh->xh_entries[count - 1];
+	offset = le16_to_cpu(xe->xe_name_offset) + off_change;
+	size = blocksize - offset;
+
+	/* copy all the names and values. */
+	if (data_bh)
+		target = data_bh->b_data;
+	memcpy(target + offset, src + offset, size);
+
+	/* Init new header now. */
+	xh->xh_count = xb_xh->xh_count;
+	xh->xh_num_buckets = cpu_to_le16(1);
+	xh->xh_name_value_len = cpu_to_le16(size);
+	xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
+
+	/* copy all the entries. */
+	target = xh_bh->b_data;
+	offset = offsetof(struct ocfs2_xattr_header, xh_entries);
+	size = count * sizeof(struct ocfs2_xattr_entry);
+	memcpy(target + offset, (char *)xb_xh + offset, size);
+
+	/* Change the xe offset for all the xe because of the move. */
+	off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize +
+		 offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	for (i = 0; i < count; i++)
+		le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
+
+	mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
+	     offset, size, off_change);
+
+	sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe, swap_xe);
+}
+
+/*
+ * After the xattrs are moved from the xattr block (old_bh) into the first
+ * bucket of the index btree (new_bh), repoint xs at the bucket: bhs[0],
+ * xh, header, base and end. If the entry was found, xs->here is rebased
+ * to the same index in the new header, whatever the block size is; the
+ * remaining bucket blocks are read only when a bucket spans several
+ * blocks. Returns 0, or the error from ocfs2_read_blocks().
+ */
+static int ocfs2_xattr_update_xattr_search(struct inode *inode,
+					   struct ocfs2_xattr_search *xs,
+					   struct buffer_head *old_bh,
+					   struct buffer_head *new_bh)
+{
+	int ret = 0;
+	char *buf = old_bh->b_data;
+	struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
+	struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
+	int i, blocksize = inode->i_sb->s_blocksize;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	xs->bucket.bhs[0] = new_bh;
+	get_bh(new_bh);
+	xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
+	xs->header = xs->bucket.xh;
+
+	xs->base = new_bh->b_data;
+	xs->end = xs->base + inode->i_sb->s_blocksize;
+
+	if (!xs->not_found) {
+		if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
+			ret = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
+					xs->bucket.bhs[0]->b_blocknr + 1,
+					blk_per_bucket - 1, &xs->bucket.bhs[1],
+					OCFS2_BH_CACHED, inode);
+			if (ret) {
+				mlog_errno(ret);
+				return ret;
+			}
+		}
+
+		i = xs->here - old_xh->xh_entries;
+		xs->here = &xs->header->xh_entries[i];
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+					  struct ocfs2_xattr_search *xs)
+{
+	int ret, credits = OCFS2_SUBALLOC_ALLOC;
+	u32 bit_off, len;
+	u64 blkno;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_alloc_context *data_ac;
+	struct buffer_head *xh_bh = NULL, *data_bh = NULL;
+	struct buffer_head *xb_bh = xs->xattr_bh;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_tree_root *xr;
+	u16 xb_flags = le16_to_cpu(xb->xb_flags);
+	u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	mlog(0, "create xattr index block for %llu\n",
+	     (unsigned long long)xb_bh->b_blocknr);
+
+	BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
+
+	ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * XXX:
+	 * We can use this lock for now, and maybe move to a dedicated mutex
+	 * if performance becomes a problem later.
+	 */
+	down_write(&oi->ip_alloc_sem);
+
+	/*
+	 * 3 more credits, one for xattr block update, one for the 1st block
+	 * of the new xattr bucket and one for the value/data.
+	 */
+	credits += 3;
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_sem;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xb_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * The bucket may spread in many blocks, and
+	 * we will only touch the 1st block and the last block
+	 * in the whole bucket(one for entry and one for data).
+	 */
+	blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
+
+	mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno);
+
+	xh_bh = sb_getblk(inode->i_sb, blkno);
+	if (!xh_bh) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_set_new_buffer_uptodate(inode, xh_bh);
+
+	ret = ocfs2_journal_access(handle, inode, xh_bh,
+				   OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (bpb > 1) {
+		data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
+		if (!data_bh) {
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		ocfs2_set_new_buffer_uptodate(inode, data_bh);
+
+		ret = ocfs2_journal_access(handle, inode, data_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
+
+	ocfs2_journal_dirty(handle, xh_bh);
+	if (data_bh)
+		ocfs2_journal_dirty(handle, data_bh);
+
+	ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+
+	/* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
+	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
+	       offsetof(struct ocfs2_xattr_block, xb_attrs));
+
+	xr = &xb->xb_attrs.xb_root;
+	xr->xt_clusters = cpu_to_le32(1);
+	xr->xt_last_eb_blk = 0;
+	xr->xt_list.l_tree_depth = 0;
+	xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb));
+	xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+
+	xr->xt_list.l_recs[0].e_cpos = 0;
+	xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+	xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+
+	xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
+
+	ret = ocfs2_journal_dirty(handle, xb_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_sem:
+	up_write(&oi->ip_alloc_sem);
+
+out:
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	brelse(xh_bh);
+	brelse(data_bh);
+
+	return ret;
+}
+
+static int cmp_xe_offset(const void *a, const void *b)
+{
+	const struct ocfs2_xattr_entry *l = a, *r = b;
+	u32 l_name_offset = le16_to_cpu(l->xe_name_offset);
+	u32 r_name_offset = le16_to_cpu(r->xe_name_offset);
+
+	if (l_name_offset < r_name_offset)
+		return 1;
+	if (l_name_offset > r_name_offset)
+		return -1;
+	return 0;
+}
+
+/*
+ * Defragment an xattr bucket that has holes between its name/value pairs.
+ * All name/value pairs are packed towards the end of the bucket so that
+ * the free space becomes contiguous. The work is done on a private copy,
+ * which is copied back and journaled once the packing is complete.
+ */
+static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+				     struct ocfs2_xattr_bucket *bucket)
+{
+	int ret, i;
+	size_t end, offset, len, value_len;
+	struct ocfs2_xattr_header *xh;
+	char *entries, *buf, *bucket_buf = NULL;
+	u64 blkno = bucket->bhs[0]->b_blocknr;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u16 xh_free_start;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	size_t blocksize = inode->i_sb->s_blocksize;
+	handle_t *handle;
+	struct buffer_head **bhs;
+	struct ocfs2_xattr_entry *xe;
+
+	bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+			GFP_NOFS);
+	if (!bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_blocks(osb, blkno, blk_per_bucket, bhs,
+				OCFS2_BH_CACHED, inode);
+	if (ret)
+		goto out;
+
+	/*
+	 * In order to make the operation more efficient and generic,
+	 * we copy all the blocks into a contiguous memory and do the
+	 * defragment there, so if anything goes wrong, we will not touch
+	 * the real block.
+	 */
+	bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
+	if (!bucket_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	buf = bucket_buf;
+	for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
+		memcpy(buf, bhs[i]->b_data, blocksize);
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto commit;
+		}
+	}
+
+	xh = (struct ocfs2_xattr_header *)bucket_buf;
+	entries = (char *)xh->xh_entries;
+	xh_free_start = le16_to_cpu(xh->xh_free_start);
+
+	mlog(0, "adjust xattr bucket in %llu, count = %u, "
+	     "xh_free_start = %u, xh_name_value_len = %u.\n",
+	     blkno, le16_to_cpu(xh->xh_count), xh_free_start,
+	     le16_to_cpu(xh->xh_name_value_len));
+
+	/*
+	 * sort all the entries by their offset.
+	 * the largest will be the first, so that we can
+	 * move them to the end one by one.
+	 */
+	sort(entries, le16_to_cpu(xh->xh_count),
+	     sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe_offset, swap_xe);
+
+	/* Move all name/values to the end of the bucket. */
+	xe = xh->xh_entries;
+	end = OCFS2_XATTR_BUCKET_SIZE;
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
+		offset = le16_to_cpu(xe->xe_name_offset);
+		if (ocfs2_xattr_is_local(xe))
+			value_len = OCFS2_XATTR_SIZE(
+					le64_to_cpu(xe->xe_value_size));
+		else
+			value_len = OCFS2_XATTR_ROOT_SIZE;
+		len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
+
+		/*
+		 * We must make sure that the name/value pair
+		 * exist in the same block. So adjust end to
+		 * the previous block end if needed.
+		 */
+		if (((end - len) / blocksize !=
+			(end - 1) / blocksize))
+			end = end - end % blocksize;
+
+		if (end > offset + len) {
+			memmove(bucket_buf + end - len,
+				bucket_buf + offset, len);
+			xe->xe_name_offset = cpu_to_le16(end - len);
+		}
+
+		mlog_bug_on_msg(end < offset + len, "Defrag check failed for "
+				"bucket %llu\n", (unsigned long long)blkno);
+
+		end -= len;
+	}
+
+	mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for "
+			"bucket %llu\n", (unsigned long long)blkno);
+
+	if (xh_free_start == end)
+		goto commit;
+
+	memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
+	xh->xh_free_start = cpu_to_le16(end);
+
+	/* sort the entries by their name_hash. */
+	sort(entries, le16_to_cpu(xh->xh_count),
+	     sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe, swap_xe);
+
+	buf = bucket_buf;
+	for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
+		memcpy(bhs[i]->b_data, buf, blocksize);
+		ocfs2_journal_dirty(handle, bhs[i]);
+	}
+
+commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+
+	if (bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(bhs[i]);
+	}
+	kfree(bhs);
+
+	kfree(bucket_buf);
+	return ret;
+}
+
+/*
+ * Move half of the xattr buckets in the previous cluster to this new
+ * cluster. We only touch the last cluster of the previous extent record.
+ *
+ * first_bh is the first buffer_head of a series of bucket in the same
+ * extent rec and header_bh is the header of one bucket in this cluster.
+ * They will be updated if we move the data header_bh contains to the new
+ * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
+ */
+static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
+					       handle_t *handle,
+					       struct buffer_head **first_bh,
+					       struct buffer_head **header_bh,
+					       u64 new_blkno,
+					       u64 prev_blkno,
+					       u32 num_clusters,
+					       u32 *first_hash)
+{
+	int i, ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+	int blocksize = inode->i_sb->s_blocksize;
+	struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
+	struct ocfs2_xattr_header *new_xh;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)((*first_bh)->b_data);
+
+	BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
+	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
+
+	prev_bh = *first_bh;
+	get_bh(prev_bh);
+	xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
+
+	prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
+
+	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
+	     prev_blkno, new_blkno);
+
+	/*
+	 * We need to update the 1st half of the new cluster and
+	 * 1 more for the update of the 1st bucket of the previous
+	 * extent record.
+	 */
+	credits = bpc / 2 + 1;
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, prev_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
+		old_bh = new_bh = NULL;
+		new_bh = sb_getblk(inode->i_sb, new_blkno);
+		if (!new_bh) {
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+		ret = ocfs2_journal_access(handle, inode, new_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			brelse(new_bh);
+			goto out;
+		}
+
+		ret = ocfs2_read_block(osb, prev_blkno,
+				       &old_bh, OCFS2_BH_CACHED, inode);
+		if (ret < 0) {
+			mlog_errno(ret);
+			brelse(new_bh);
+			goto out;
+		}
+
+		memcpy(new_bh->b_data, old_bh->b_data, blocksize);
+
+		if (i == 0) {
+			new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
+			new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
+
+			if (first_hash)
+				*first_hash = le32_to_cpu(
+					new_xh->xh_entries[0].xe_name_hash);
+			new_first_bh = new_bh;
+			get_bh(new_first_bh);
+		}
+
+		ocfs2_journal_dirty(handle, new_bh);
+
+		if (*header_bh == old_bh) {
+			brelse(*header_bh);
+			*header_bh = new_bh;
+			get_bh(*header_bh);
+
+			brelse(*first_bh);
+			*first_bh = new_first_bh;
+			get_bh(*first_bh);
+		}
+		brelse(new_bh);
+		brelse(old_bh);
+	}
+
+	le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
+
+	ocfs2_journal_dirty(handle, prev_bh);
+out:
+	brelse(prev_bh);
+	brelse(new_first_bh);
+	return ret;
+}
+
+static int ocfs2_read_xattr_bucket(struct inode *inode,
+				   u64 blkno,
+				   struct buffer_head **bhs,
+				   int new)
+{
+	int ret = 0;
+	u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	if (!new)
+		return ocfs2_read_blocks(OCFS2_SB(inode->i_sb), blkno,
+					 blk_per_bucket, bhs,
+					 OCFS2_BH_CACHED, inode);
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		bhs[i] = sb_getblk(inode->i_sb, blkno + i);
+		if (bhs[i] == NULL) {
+			ret = -EIO;
+			mlog_errno(ret);
+			break;
+		}
+		ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+	}
+
+	return ret;
+}
+
+/*
+ * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk).
+ * first_hash will record the 1st hash of the new bucket.
+ * Returns 0 or a negative errno; first_hash may be left unset on error.
+ */
+static int ocfs2_half_xattr_bucket(struct inode *inode,
+				   handle_t *handle,
+				   u64 blk,
+				   u64 new_blk,
+				   u32 *first_hash,
+				   int new_bucket_head)
+{
+	int ret, i;
+	u16 count, start, len, name_value_len, xe_len, name_offset;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	struct buffer_head **s_bhs, **t_bhs = NULL;
+	struct ocfs2_xattr_header *xh;
+	struct ocfs2_xattr_entry *xe;
+	int blocksize = inode->i_sb->s_blocksize;
+
+	mlog(0, "move half of xattrs from bucket %llu to %llu\n",
+	     blk, new_blk);
+
+	s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!s_bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, s_bhs[0],
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!t_bhs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/* copy the whole bucket to the new first. */
+	for (i = 0; i < blk_per_bucket; i++)
+		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+
+	/* update the new bucket. */
+	xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+	count = le16_to_cpu(xh->xh_count);
+	start = count / 2;
+
+	/*
+	 * Calculate the total name/value len and xh_free_start for
+	 * the old bucket first.
+	 */
+	name_offset = OCFS2_XATTR_BUCKET_SIZE;
+	name_value_len = 0;
+	for (i = 0; i < start; i++) {
+		xe = &xh->xh_entries[i];
+		xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		if (ocfs2_xattr_is_local(xe))
+			xe_len +=
+			   OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			xe_len += OCFS2_XATTR_ROOT_SIZE;
+		name_value_len += xe_len;
+		if (le16_to_cpu(xe->xe_name_offset) < name_offset)
+			name_offset = le16_to_cpu(xe->xe_name_offset);
+	}
+
+	/*
+	 * Now begin the modification to the new bucket.
+	 *
+	 * In the new bucket, We just move the xattr entry to the beginning
+	 * and don't touch the name/value. So there will be some holes in the
+	 * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is
+	 * called.
+	 */
+	xe = &xh->xh_entries[start];
+	len = sizeof(struct ocfs2_xattr_entry) * (count - start);
+	mlog(0, "mv xattr entry len %d from %d to %d\n", len,
+	     (char *)xe - (char *)xh, (char *)xh->xh_entries - (char *)xh);
+	memmove((char *)xh->xh_entries, (char *)xe, len);
+	xe = &xh->xh_entries[count - start];
+	len = sizeof(struct ocfs2_xattr_entry) * start;
+	memset((char *)xe, 0, len);
+
+	le16_add_cpu(&xh->xh_count, -start);
+	le16_add_cpu(&xh->xh_name_value_len, -name_value_len);
+
+	/* Calculate xh_free_start for the new bucket. */
+	xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+		xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		if (ocfs2_xattr_is_local(xe))
+			xe_len +=
+			   OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			xe_len += OCFS2_XATTR_ROOT_SIZE;
+		if (le16_to_cpu(xe->xe_name_offset) <
+		    le16_to_cpu(xh->xh_free_start))
+			xh->xh_free_start = xe->xe_name_offset;
+	}
+
+	/* set xh->xh_num_buckets for the new xh. */
+	if (new_bucket_head)
+		xh->xh_num_buckets = cpu_to_le16(1);
+	else
+		xh->xh_num_buckets = 0;
+
+	for (i = 0; i < blk_per_bucket && !ret; i++)
+		ret = ocfs2_journal_dirty(handle, t_bhs[i]);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* store the first_hash of the new bucket. */
+	if (first_hash)
+		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+
+	/*
+	 * Now only update the 1st block of the old bucket.
+	 * Please note that the entry has been sorted already above.
+	 */
+	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+	memset(&xh->xh_entries[start], 0,
+	       sizeof(struct ocfs2_xattr_entry) * (count - start));
+	xh->xh_count = cpu_to_le16(start);
+	xh->xh_free_start = cpu_to_le16(name_offset);
+	xh->xh_name_value_len = cpu_to_le16(name_value_len);
+
+	ret = ocfs2_journal_dirty(handle, s_bhs[0]);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	for (i = 0; i < blk_per_bucket; i++)
+		brelse(s_bhs[i]);
+	kfree(s_bhs);
+
+	if (t_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(t_bhs[i]);
+	}
+	kfree(t_bhs);
+
+	return ret;
+}
+
+/*
+ * Copy xattr from one bucket to another bucket.
+ *
+ * Each block of the bucket at s_blkno is copied whole over the block at the
+ * same index in the bucket at t_blkno. t_is_new is handed unchanged to
+ * ocfs2_read_xattr_bucket() when the target bucket is read. Every target
+ * block is journal-accessed before any data is copied.
+ *
+ * The caller must make sure that the journal transaction
+ * has enough space for journaling.
+ *
+ * Returns 0 on success, -ENOMEM if a buffer_head array cannot be allocated,
+ * or the error from the read/journal-access call that failed. All buffer
+ * heads taken here are released before returning.
+ */
+static int ocfs2_cp_xattr_bucket(struct inode *inode,
+				 handle_t *handle,
+				 u64 s_blkno,
+				 u64 t_blkno,
+				 int t_is_new)
+{
+	int ret, i;
+	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int blocksize = inode->i_sb->s_blocksize;
+	struct buffer_head **s_bhs, **t_bhs = NULL;
+
+	BUG_ON(s_blkno == t_blkno);
+
+	mlog(0, "cp bucket %llu to %llu, target is %d\n",
+	     s_blkno, t_blkno, t_is_new);
+
+	s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+			GFP_NOFS);
+	if (!s_bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
+	if (ret)
+		goto out;
+
+	t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+			GFP_NOFS);
+	if (!t_bhs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret)
+			goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+		ocfs2_journal_dirty(handle, t_bhs[i]);
+	}
+
+out:
+	if (s_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(s_bhs[i]);
+	}
+	kfree(s_bhs);
+
+	if (t_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(t_bhs[i]);
+	}
+	kfree(t_bhs);
+
+	return ret;
+}
+
+/*
+ * Copy one xattr cluster from src_blk to to_blk.
+ * The to_blk will become the first bucket header of the cluster, so its
+ * xh_num_buckets will be initialized as the bucket num in the cluster.
+ *
+ * first_bh is the header block whose xh_num_buckets is reduced by the
+ * number of buckets copied. If first_hash is not NULL it receives the name
+ * hash of the first entry in the new first bucket.
+ *
+ * The transaction in handle is extended here by one cluster's worth of
+ * blocks plus one. Returns 0 on success or the first error encountered.
+ */
+static int ocfs2_cp_xattr_cluster(struct inode *inode,
+				  handle_t *handle,
+				  struct buffer_head *first_bh,
+				  u64 src_blk,
+				  u64 to_blk,
+				  u32 *first_hash)
+{
+	int i, ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+	struct buffer_head *bh = NULL;
+	struct ocfs2_xattr_header *xh;
+	u64 to_blk_start = to_blk;
+
+	mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk);
+
+	/*
+	 * We need to update the new cluster and 1 more for the update of
+	 * the 1st bucket of the previous extent rec.
+	 */
+	credits = bpc + 1;
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, first_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < num_buckets; i++) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle,
+					    src_blk, to_blk, 1);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+		to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	}
+
+	/* update the old bucket header. */
+	xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+	le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
+
+	ocfs2_journal_dirty(handle, first_bh);
+
+	/*
+	 * update the new bucket header. to_blk has been advanced by the
+	 * copy loop, so the saved start block is used here.
+	 */
+	ret = ocfs2_read_block(osb, to_blk_start, &bh, OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xh = (struct ocfs2_xattr_header *)bh->b_data;
+	xh->xh_num_buckets = cpu_to_le16(num_buckets);
+
+	ocfs2_journal_dirty(handle, bh);
+
+	if (first_hash)
+		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+out:
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * Split the xattrs of the bucket at prev_blk, moving half of them into the
+ * bucket at new_blk, which starts a new cluster.
+ *
+ * Only valid when bucket size == cluster size (enforced with BUG_ON);
+ * ocfs2_mv_xattr_bucket_cross_cluster should be used otherwise.
+ *
+ * The transaction is extended by two buckets' worth of blocks before the
+ * split. first_hash is handed on to ocfs2_half_xattr_bucket().
+ * Returns 0 on success or a negative error.
+ */
+static int ocfs2_half_xattr_cluster(struct inode *inode,
+				    handle_t *handle,
+				    u64 prev_blk,
+				    u64 new_blk,
+				    u32 *first_hash)
+{
+	int status;
+	u16 bucket_blocks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int needed = 2 * bucket_blocks;
+
+	BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
+
+	status = ocfs2_extend_trans(handle, needed);
+	if (!status)
+		/* The new bucket becomes a bucket head (last argument 1). */
+		return ocfs2_half_xattr_bucket(inode, handle, prev_blk,
+					       new_blk, first_hash, 1);
+
+	mlog_errno(status);
+	return status;
+}
+
+/*
+ * Move some xattrs from the old cluster to the new one since they are not
+ * contiguous in ocfs2 xattr tree.
+ *
+ * new_blk starts a new separate cluster, and we will move some xattrs from
+ * prev_blk to it. v_start will be set as the first name hash value in this
+ * new cluster so that it can be used as e_cpos during tree insertion and
+ * doesn't collide with our original b-tree operations. first_bh and header_bh
+ * will also be updated since they will be used in ocfs2_extend_xattr_bucket
+ * to extend the insert bucket.
+ *
+ * The problem is how many xattrs should we move to the new one and when
+ * should we update first_bh and header_bh?
+ * 1. If cluster size > bucket size, that means the previous cluster has more
+ *    than 1 bucket, so just move half nums of bucket into the new cluster and
+ *    update the first_bh and header_bh if the insert bucket has been moved
+ *    to the new cluster.
+ * 2. If cluster_size == bucket_size:
+ *    a) If the previous extent rec has more than one cluster and the insert
+ *       place isn't in the last cluster, copy the entire last cluster to the
+ *       new one. This time, we don't need to update the first_bh and header_bh
+ *       since they will not be moved into the new cluster.
+ *    b) Otherwise, move the bottom half of the xattrs in the last cluster into
+ *       the new one. And we set the extend flag to zero if the insert place is
+ *       moved into the new allocated cluster since no extend is needed.
+ *
+ * prev_blk and prev_clusters describe the previous extent record. extend may
+ * be NULL. Returns 0 on success or the error from the helper that was called.
+ * Note that in case 2b *extend is cleared whenever the insert bucket is the
+ * last cluster, whether or not the split itself succeeded.
+ */
+static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
+					    handle_t *handle,
+					    struct buffer_head **first_bh,
+					    struct buffer_head **header_bh,
+					    u64 new_blk,
+					    u64 prev_blk,
+					    u32 prev_clusters,
+					    u32 *v_start,
+					    int *extend)
+{
+	int ret = 0;
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+
+	mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
+	     prev_blk, prev_clusters, new_blk);
+
+	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
+		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
+							  handle,
+							  first_bh,
+							  header_bh,
+							  new_blk,
+							  prev_blk,
+							  prev_clusters,
+							  v_start);
+	else {
+		u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
+
+		if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
+			ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
+						     last_blk, new_blk,
+						     v_start);
+		else {
+			ret = ocfs2_half_xattr_cluster(inode, handle,
+						       last_blk, new_blk,
+						       v_start);
+
+			if ((*header_bh)->b_blocknr == last_blk && extend)
+				*extend = 0;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Add a new cluster for xattr storage.
+ *
+ * If the new cluster is contiguous with the previous one, it will be
+ * appended to the same extent record, and num_clusters will be updated.
+ * If not, we will insert a new extent for it and move some xattrs in
+ * the last cluster into the new allocated one.
+ * We also need to limit the maximum size of a btree leaf, otherwise we'll
+ * lose the benefits of hashing because we'll have to search large leaves.
+ * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize,
+ * if it's bigger).
+ *
+ * first_bh is the first block of the previous extent rec and header_bh
+ * indicates the bucket we will insert the new xattrs. They will be updated
+ * when the header_bh is moved into the new cluster.
+ *
+ * *num_clusters is only changed in the contiguous case. The cpos used for
+ * the tree insertion is prev_cpos + prev_clusters in that case and otherwise
+ * whatever ocfs2_adjust_xattr_cross_cluster() stores in v_start. extend is
+ * passed through to ocfs2_adjust_xattr_cross_cluster().
+ *
+ * This function starts and commits its own transaction and frees the
+ * allocation contexts it locked. Returns 0 on success or a negative error;
+ * -ENOSPC from the cluster claim is returned without being logged.
+ */
+static int ocfs2_add_new_xattr_cluster(struct inode *inode,
+				       struct buffer_head *root_bh,
+				       struct buffer_head **first_bh,
+				       struct buffer_head **header_bh,
+				       u32 *num_clusters,
+				       u32 prev_cpos,
+				       u64 prev_blkno,
+				       int *extend)
+{
+	int ret, credits;
+	u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u32 prev_clusters = *num_clusters;
+	u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
+	u64 block;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+	struct ocfs2_extent_list *root_el = &xb_root->xt_list;
+	enum ocfs2_extent_tree_type type = OCFS2_XATTR_TREE_EXTENT;
+
+	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
+	     "previous xattr blkno = %llu\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     prev_cpos, prev_blkno);
+
+	ret = ocfs2_lock_allocators(inode, root_bh, root_el,
+				    clusters_to_add, 0, &data_ac,
+				    &meta_ac, type, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	credits = ocfs2_calc_extend_credits(osb->sb, root_el, clusters_to_add);
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+				     clusters_to_add, &bit_off, &num_bits);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > clusters_to_add);
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
+	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	if (prev_blkno + prev_clusters * bpc == block &&
+	    (prev_clusters + num_bits) << osb->s_clustersize_bits <=
+	     OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
+		/*
+		 * If this cluster is contiguous with the old one and
+		 * adding this new cluster, we don't surpass the limit of
+		 * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be
+		 * initialized and used like other buckets in the previous
+		 * cluster.
+		 * So add it as a contiguous one. The caller will handle
+		 * its init process.
+		 */
+		v_start = prev_cpos + prev_clusters;
+		*num_clusters = prev_clusters + num_bits;
+		mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
+		     num_bits);
+	} else {
+		ret = ocfs2_adjust_xattr_cross_cluster(inode,
+						       handle,
+						       first_bh,
+						       header_bh,
+						       block,
+						       prev_blkno,
+						       prev_clusters,
+						       &v_start,
+						       extend);
+		if (ret) {
+			mlog_errno(ret);
+			goto leave;
+		}
+	}
+
+	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
+	     num_bits, block, v_start);
+	ret = ocfs2_xattr_tree_insert_extent(osb, handle, inode, root_bh,
+					     v_start, block, num_bits,
+					     0, meta_ac);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+leave:
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+/*
+ * Extend a new xattr bucket and move xattrs to the end one by one until
+ * we meet with start_bh. Only move half of the xattrs to the bucket after it.
+ *
+ * first_bh holds the header that carries xh_num_buckets for the extent
+ * record and num_clusters is the size of that record. It is a BUG if the
+ * record already holds num_clusters worth of buckets.
+ *
+ * Buckets are shifted starting from the last one so that no bucket is
+ * overwritten before it has been copied. Runs in its own transaction.
+ * Returns 0 on success or a negative error.
+ *
+ * NOTE(review): xh_num_buckets is incremented even when the final
+ * ocfs2_half_xattr_bucket() call fails - confirm this is intended.
+ */
+static int ocfs2_extend_xattr_bucket(struct inode *inode,
+				     struct buffer_head *first_bh,
+				     struct buffer_head *start_bh,
+				     u32 num_clusters)
+{
+	int ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u64 start_blk = start_bh->b_blocknr, end_blk;
+	u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
+	handle_t *handle;
+	struct ocfs2_xattr_header *first_xh =
+				(struct ocfs2_xattr_header *)first_bh->b_data;
+	u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
+
+	mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
+	     "from %llu, len = %u\n", start_blk,
+	     (unsigned long long)first_bh->b_blocknr, num_clusters);
+
+	BUG_ON(bucket >= num_buckets);
+
+	end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
+
+	/*
+	 * We will touch all the buckets after the start_bh(include it).
+	 * Add one more bucket and modify the first_bh.
+	 */
+	credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, first_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto commit;
+	}
+
+	while (end_blk != start_blk) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
+					    end_blk + blk_per_bucket, 0);
+		if (ret)
+			goto commit;
+		end_blk -= blk_per_bucket;
+	}
+
+	/* Move half of the xattr in start_blk to the next bucket. */
+	ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
+				      start_blk + blk_per_bucket, NULL, 0);
+
+	le16_add_cpu(&first_xh->xh_num_buckets, 1);
+	ocfs2_journal_dirty(handle, first_bh);
+
+commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+/*
+ * Add new xattr bucket in an extent record and adjust the buckets accordingly.
+ * xb_bh is the ocfs2_xattr_block.
+ * We will move all the buckets starting from header_bh to the next place. As
+ * for this one, half num of its xattrs will be moved to the next one.
+ *
+ * We will allocate a new cluster if current cluster is full and adjust
+ * header_bh and first_bh if the insert place is moved to the new cluster.
+ *
+ * The extent record is looked up by the name hash of the first entry in
+ * header_bh. The bucket extension is skipped when
+ * ocfs2_add_new_xattr_cluster() clears the extend flag.
+ * Returns 0 on success or a negative error.
+ */
+static int ocfs2_add_new_xattr_bucket(struct inode *inode,
+				      struct buffer_head *xb_bh,
+				      struct buffer_head *header_bh)
+{
+	struct ocfs2_xattr_header *first_xh = NULL;
+	struct buffer_head *first_bh = NULL;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+	struct ocfs2_extent_list *el = &xb_root->xt_list;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int ret, num_buckets, extend = 1;
+	u64 p_blkno;
+	u32 e_cpos, num_clusters;
+
+	mlog(0, "Add new xattr bucket starting form %llu\n",
+	     (unsigned long long)header_bh->b_blocknr);
+
+	/*
+	 * Add reference for header_bh here because it may be
+	 * changed in ocfs2_add_new_xattr_cluster and we need
+	 * to free it in the end.
+	 */
+	get_bh(header_bh);
+
+	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
+				  &num_clusters, el);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_block(osb, p_blkno,
+			       &first_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
+	first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+
+	if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
+		ret = ocfs2_add_new_xattr_cluster(inode,
+						  xb_bh,
+						  &first_bh,
+						  &header_bh,
+						  &num_clusters,
+						  e_cpos,
+						  p_blkno,
+						  &extend);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (extend)
+		ret = ocfs2_extend_xattr_bucket(inode,
+						first_bh,
+						header_bh,
+						num_clusters);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(first_bh);
+	brelse(header_bh);
+	return ret;
+}
+
+/*
+ * Translate a byte offset within a bucket into a pointer into the data of
+ * the buffer head that covers it.
+ */
+static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
+					struct ocfs2_xattr_bucket *bucket,
+					int offs)
+{
+	struct super_block *sb = inode->i_sb;
+	int blk_idx = offs >> sb->s_blocksize_bits;
+	int in_blk = offs % sb->s_blocksize;
+
+	return bucket->bhs[blk_idx]->b_data + in_blk;
+}
+
+/*
+ * Handle the normal xattr set, including replace, delete and new.
+ * When the bucket is empty, "is_empty" is set and the caller can
+ * free this bucket.
+ *
+ * Three cases are handled, all directly on the buffers behind xs:
+ * - the entry exists and xi->value is set: the value is overwritten in
+ *   place when the new name+value size is no larger than the old one,
+ *   otherwise a fresh name+value area is taken below xh_free_start and
+ *   the existing entry is pointed at it;
+ * - the entry exists and xi->value is NULL: the entry is removed from the
+ *   entry array (its name/value bytes are left where they are);
+ * - the entry does not exist: a slot is opened at the position found by a
+ *   binary search on the name hash and a new name+value area is written.
+ *
+ * Note: "local" indicates the real data's locality. So we can't
+ * just judge its locality by its length.
+ */
+static void ocfs2_xattr_set_entry_normal(struct inode *inode,
+					 struct ocfs2_xattr_info *xi,
+					 struct ocfs2_xattr_search *xs,
+					 u32 name_hash,
+					 int local,
+					 int *is_empty)
+{
+	struct ocfs2_xattr_entry *last, *xe;
+	int name_len = strlen(xi->name);
+	struct ocfs2_xattr_header *xh = xs->header;
+	u16 count = le16_to_cpu(xh->xh_count), start;
+	size_t blocksize = inode->i_sb->s_blocksize;
+	char *val;
+	size_t offs, size, new_size;
+
+	last = &xh->xh_entries[count];
+	if (!xs->not_found) {
+		xe = xs->here;
+		offs = le16_to_cpu(xe->xe_name_offset);
+		if (ocfs2_xattr_is_local(xe))
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+
+		/*
+		 * If the new value will be stored outside, xi->value has been
+		 * initialized as an empty ocfs2_xattr_value_root, and the same
+		 * goes with xi->value_len, so we can set new_size safely here.
+		 * See ocfs2_xattr_set_in_bucket.
+		 */
+		new_size = OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_SIZE(xi->value_len);
+
+		le16_add_cpu(&xh->xh_name_value_len, -size);
+		if (xi->value) {
+			if (new_size > size)
+				goto set_new_name_value;
+
+			/* Now replace the old value with new one. */
+			if (local)
+				xe->xe_value_size = cpu_to_le64(xi->value_len);
+			else
+				xe->xe_value_size = 0;
+
+			val = ocfs2_xattr_bucket_get_val(inode,
+							 &xs->bucket, offs);
+			memset(val + OCFS2_XATTR_SIZE(name_len), 0,
+			       size - OCFS2_XATTR_SIZE(name_len));
+			if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
+				memcpy(val + OCFS2_XATTR_SIZE(name_len),
+				       xi->value, xi->value_len);
+
+			le16_add_cpu(&xh->xh_name_value_len, new_size);
+			ocfs2_xattr_set_local(xe, local);
+			return;
+		} else {
+			/* Remove the old entry. */
+			last -= 1;
+			memmove(xe, xe + 1,
+				(void *)last - (void *)xe);
+			memset(last, 0, sizeof(struct ocfs2_xattr_entry));
+			le16_add_cpu(&xh->xh_count, -1);
+			if (xh->xh_count == 0 && is_empty)
+				*is_empty = 1;
+			return;
+		}
+	} else {
+		/* find a new entry for insert. */
+		int low = 0, high = count - 1, tmp;
+		struct ocfs2_xattr_entry *tmp_xe;
+
+		while (low <= high) {
+			tmp = (low + high) / 2;
+			tmp_xe = &xh->xh_entries[tmp];
+
+			if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
+				low = tmp + 1;
+			else if (name_hash <
+				 le32_to_cpu(tmp_xe->xe_name_hash))
+				high = tmp - 1;
+			else
+				break;
+		}
+
+		xe = &xh->xh_entries[low];
+		if (low != count)
+			memmove(xe + 1, xe, (void *)last - (void *)xe);
+
+		le16_add_cpu(&xh->xh_count, 1);
+		memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
+		xe->xe_name_hash = cpu_to_le32(name_hash);
+		xe->xe_name_len = name_len;
+		ocfs2_xattr_set_type(xe, xi->name_index);
+	}
+
+set_new_name_value:
+	/* Insert the new name+value. */
+	size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
+
+	/*
+	 * We must make sure that the name/value pair
+	 * exists in the same block. If it would straddle a block boundary,
+	 * xh_free_start is first pulled down to that boundary.
+	 */
+	offs = le16_to_cpu(xh->xh_free_start);
+	start = offs - size;
+
+	if (start >> inode->i_sb->s_blocksize_bits !=
+	    (offs - 1) >> inode->i_sb->s_blocksize_bits) {
+		offs = offs - offs % blocksize;
+		xh->xh_free_start = cpu_to_le16(offs);
+	}
+
+	val = ocfs2_xattr_bucket_get_val(inode,
+					 &xs->bucket, offs - size);
+	xe->xe_name_offset = cpu_to_le16(offs - size);
+
+	memset(val, 0, size);
+	memcpy(val, xi->name, name_len);
+	memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
+
+	xe->xe_value_size = cpu_to_le64(xi->value_len);
+	ocfs2_xattr_set_local(xe, local);
+	xs->here = xe;
+	le16_add_cpu(&xh->xh_free_start, -size);
+	le16_add_cpu(&xh->xh_name_value_len, size);
+
+	return;
+}
+
+/*
+ * Mark dirty the bucket blocks touched by a set operation: the first block
+ * of the bucket and the block holding the name/value of xs->here.
+ *
+ * Both failures are logged, but only the status of the second
+ * ocfs2_journal_dirty() call is returned. bh_num is not used.
+ */
+static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
+					     handle_t *handle,
+					     struct ocfs2_xattr_search *xs,
+					     struct buffer_head **bhs,
+					     u16 bh_num)
+{
+	struct ocfs2_xattr_entry *entry = xs->here;
+	int status, name_off, blk_idx;
+
+	/* The first block should always be touched. */
+	status = ocfs2_journal_dirty(handle, bhs[0]);
+	if (status)
+		mlog_errno(status);
+
+	/* Find the block that carries the entry's name/value. */
+	name_off = le16_to_cpu(entry->xe_name_offset);
+	blk_idx = name_off >> inode->i_sb->s_blocksize_bits;
+
+	status = ocfs2_journal_dirty(handle, bhs[blk_idx]);
+	if (status)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * Set the xattr entry in the specified bucket.
+ * The bucket is indicated by xs->bucket and it should have the enough
+ * space for the xattr insertion.
+ *
+ * The remaining blocks of the bucket are read in if only the first one is
+ * loaded. All blocks of the bucket are journal-accessed inside a transaction
+ * started here, the entry is updated by ocfs2_xattr_set_entry_normal(), and
+ * the touched blocks are then dirtied. bucket_empty is handed to that helper
+ * as its "is_empty" output.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
+					   struct ocfs2_xattr_info *xi,
+					   struct ocfs2_xattr_search *xs,
+					   u32 name_hash,
+					   int local,
+					   int *bucket_empty)
+{
+	int i, ret;
+	handle_t *handle = NULL;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "Set xattr entry len = %d index = %d in bucket %llu\n",
+	     xi->value_len, xi->name_index,
+	     (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
+
+	if (!xs->bucket.bhs[1]) {
+		ret = ocfs2_read_blocks(osb,
+					xs->bucket.bhs[0]->b_blocknr + 1,
+					blk_per_bucket - 1, &xs->bucket.bhs[1],
+					OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, blk_per_bucket);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash,
+				     local, bucket_empty);
+
+	/*Only dirty the blocks we have touched in set xattr. */
+	ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
+						xs->bucket.bhs, blk_per_bucket);
+	if (ret)
+		mlog_errno(ret);
+out:
+	/*
+	 * handle is still NULL when we bail out before or while starting the
+	 * transaction; there is nothing to commit in that case.
+	 */
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+
+	return ret;
+}
+
+/*
+ * Record new_size as the value size of the xattr entry xe, which lives in
+ * the block xe_bh. The change is journaled in a one-credit transaction
+ * started and committed here.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int ocfs2_xattr_value_update_size(struct inode *inode,
+					 struct buffer_head *xe_bh,
+					 struct ocfs2_xattr_entry *xe,
+					 u64 new_size)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle = NULL;
+
+	/*
+	 * A failed ocfs2_start_trans() is checked with IS_ERR(), as the
+	 * other callers in this file do; a NULL test would miss it.
+	 */
+	handle = ocfs2_start_trans(osb, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xe_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	xe->xe_value_size = cpu_to_le64(new_size);
+
+	ret = ocfs2_journal_dirty(handle, xe_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+/*
+ * Truncate the specified xe_off entry in xattr bucket.
+ * bucket is indicated by header_bh and len is the new length.
+ * Both the ocfs2_xattr_value_root and the entry will be updated here.
+ *
+ * xe_off is an index into the header's entry array and the entry must not
+ * be local. The value root is located OCFS2_XATTR_SIZE(name length) bytes
+ * after the name offset and must lie within a single block of the bucket;
+ * that block is read, the value is truncated through
+ * ocfs2_xattr_value_truncate() and the entry's value size is then rewritten
+ * via ocfs2_xattr_value_update_size().
+ *
+ * Returns 0 on success or the error from the step that failed.
+ */
+static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
+					     struct buffer_head *header_bh,
+					     int xe_off,
+					     int len)
+{
+	int ret, offset;
+	u64 value_blk;
+	struct buffer_head *value_bh = NULL;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	size_t blocksize = inode->i_sb->s_blocksize;
+
+	xe = &xh->xh_entries[xe_off];
+
+	BUG_ON(!xe || ocfs2_xattr_is_local(xe));
+
+	offset = le16_to_cpu(xe->xe_name_offset) +
+		 OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+	value_blk = offset / blocksize;
+
+	/* We don't allow ocfs2_xattr_value to be stored in different block. */
+	BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
+	value_blk += header_bh->b_blocknr;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), value_blk,
+			       &value_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xv = (struct ocfs2_xattr_value_root *)
+		(value_bh->b_data + offset % blocksize);
+
+	mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
+	     xe_off, (unsigned long long)header_bh->b_blocknr, len);
+	ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	brelse(value_bh);
+	return ret;
+}
+
+/*
+ * Truncate the outside value of the entry xs->here to len bytes.
+ *
+ * The entry's index within the header at xs->base is computed and handed,
+ * together with the first block of xs->bucket, to
+ * ocfs2_xattr_bucket_value_truncate(). Returns that function's result.
+ */
+static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
+						struct ocfs2_xattr_search *xs,
+						int len)
+{
+	struct ocfs2_xattr_header *hdr = (struct ocfs2_xattr_header *)xs->base;
+	struct ocfs2_xattr_entry *entry = xs->here;
+	int status, idx;
+
+	BUG_ON(!xs->bucket.bhs[0] || !entry || ocfs2_xattr_is_local(entry));
+
+	/* Pointer difference yields the entry's position in the array. */
+	idx = entry - hdr->xh_entries;
+
+	status = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
+						   idx, len);
+	if (status)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * Store value_len bytes from val as the outside value of the entry xs->here.
+ *
+ * The entry's ocfs2_xattr_value_root is found in xs->base,
+ * OCFS2_XATTR_SIZE(name length) bytes after the entry's name offset, and
+ * passed to __ocfs2_xattr_set_value_outside(), whose result is returned.
+ */
+static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
+						struct ocfs2_xattr_search *xs,
+						char *val,
+						int value_len)
+{
+	struct ocfs2_xattr_entry *entry = xs->here;
+	struct ocfs2_xattr_value_root *root;
+	int root_off;
+
+	BUG_ON(!xs->base || !entry || ocfs2_xattr_is_local(entry));
+
+	root_off = le16_to_cpu(entry->xe_name_offset);
+	root_off += OCFS2_XATTR_SIZE(entry->xe_name_len);
+
+	root = (struct ocfs2_xattr_value_root *)(xs->base + root_off);
+
+	return __ocfs2_xattr_set_value_outside(inode, root, val, value_len);
+}
+
+/*
+ * Remove the xattr bucket described by bucket.
+ * All the buckets after it in the same xattr extent rec will be
+ * moved forward one by one.
+ *
+ * first_bh holds the header carrying xh_num_buckets for the extent record;
+ * the count is decremented by one once the buckets have been shifted.
+ * Runs in its own transaction. Returns 0 on success or a negative error.
+ */
+static int ocfs2_rm_xattr_bucket(struct inode *inode,
+				 struct buffer_head *first_bh,
+				 struct ocfs2_xattr_bucket *bucket)
+{
+	int ret = 0, credits;
+	struct ocfs2_xattr_header *xh =
+				(struct ocfs2_xattr_header *)first_bh->b_data;
+	u16 bucket_num = le16_to_cpu(xh->xh_num_buckets);
+	u64 end, start = bucket->bhs[0]->b_blocknr;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	end = first_bh->b_blocknr + (bucket_num - 1) * blk_per_bucket;
+
+	mlog(0, "rm xattr bucket %llu\n", start);
+	/*
+	 * We need to update the first xattr_header and all the buckets starting
+	 * from start in this xattr rec.
+	 *
+	 * XXX: Should we empty the old last bucket here?
+	 */
+	credits = 1 + end - start;
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, first_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+
+	while (start < end) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle,
+					    start + blk_per_bucket,
+					    start, 0);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		start += blk_per_bucket;
+	}
+
+	/* update the first_bh. */
+	xh->xh_num_buckets = cpu_to_le16(bucket_num - 1);
+	ocfs2_journal_dirty(handle, first_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+	return ret;
+}
+
+/*
+ * Remove the extent record at cpos, len clusters long and starting at block
+ * blkno, from the xattr tree rooted in root_bh.
+ *
+ * The clusters are dropped from the cache, the extent is removed from the
+ * tree inside a transaction started here, xt_clusters in the root is reduced
+ * by len and the freed range is appended to the truncate log. The truncate
+ * log inode's i_mutex is held across the transaction, and the truncate log
+ * is flushed first when ocfs2_truncate_log_needs_flush() says so.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+				  struct buffer_head *root_bh,
+				  u64 blkno,
+				  u32 cpos,
+				  u32 len)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+	struct ocfs2_extent_list *root_el = &xb->xb_attrs.xb_root.xt_list;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
+	     cpos, len, (unsigned long long)blkno);
+
+	ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len);
+
+	ret = ocfs2_lock_allocators(inode, root_bh, root_el,
+				    0, 1, NULL, &meta_ac,
+				    OCFS2_XATTR_TREE_EXTENT, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * A failed ocfs2_start_trans() is checked with IS_ERR(), as the
+	 * other callers in this file do; a NULL test would miss it.
+	 */
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_remove_extent(inode, root_bh, cpos, len, handle, meta_ac,
+				  &dealloc, OCFS2_XATTR_TREE_EXTENT, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	return ret;
+}
+
+/*
+ * Free the xattr bucket indicated by xs->bucket and if all the buckets
+ * in the clusters is free, free the clusters also.
+ *
+ * The bucket must already be empty (xh_count == 0), otherwise this is a
+ * BUG. name_hash is used to look up the extent record that contains the
+ * bucket; xi is not used here.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int ocfs2_xattr_bucket_shrink(struct inode *inode,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xs,
+				     u32 name_hash)
+{
+	int ret;
+	u32 e_cpos, num_clusters;
+	u64 p_blkno;
+	struct buffer_head *first_bh = NULL;
+	struct ocfs2_xattr_header *first_xh;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+
+	BUG_ON(xs->header->xh_count != 0);
+
+	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+				  &e_cpos, &num_clusters,
+				  &xb->xb_attrs.xb_root.xt_list);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno,
+			       &first_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_rm_xattr_bucket(inode, first_bh, &xs->bucket);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+	if (first_xh->xh_num_buckets == 0)
+		ret = ocfs2_rm_xattr_cluster(inode, xs->xattr_bh,
+					     p_blkno, e_cpos,
+					     num_clusters);
+
+out:
+	brelse(first_bh);
+	return ret;
+}
+
+/*
+ * Drop the entry xs->here from the entry array of the bucket in xs.
+ *
+ * Later entries are shifted down by one, the vacated last slot is zeroed
+ * and xh_count is decremented, all journaled in a one-credit transaction
+ * on the bucket's first block. Errors are logged but not returned.
+ */
+static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
+					 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_header *hdr = xs->bucket.xh;
+	struct ocfs2_xattr_entry *tail =
+			&hdr->xh_entries[le16_to_cpu(hdr->xh_count) - 1];
+	handle_t *handle;
+	int status;
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		return;
+	}
+
+	status = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status) {
+		mlog_errno(status);
+	} else {
+		/* Close the gap left by the removed entry. */
+		memmove(xs->here, xs->here + 1,
+			(void *)tail - (void *)xs->here);
+		memset(tail, 0, sizeof(struct ocfs2_xattr_entry));
+		le16_add_cpu(&hdr->xh_count, -1);
+
+		status = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+}
+
+/*
+ * Set the xattr name/value in the bucket specified in xs.
+ *
+ * As the new value in xi may be stored in the bucket or in an outside cluster,
+ * we divide the whole process into 4 steps:
+ * 1. Insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
+ * 2. Truncate the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
+ * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
+ * 4. If the clusters for the new outside value can't be allocated, we need
+ *    to free the xattr we allocated in set.
+ */
+static int ocfs2_xattr_set_in_bucket(struct inode *inode,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xs)
+{
+	int ret, local = 1, bucket_empty = 0;
+	size_t value_len;
+	char *val = (char *)xi->value;
+	struct ocfs2_xattr_entry *xe = xs->here;
+	u32 name_hash = ocfs2_xattr_hash_by_name(inode,
+						 xi->name_index, xi->name);
+
+	if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
+		/*
+		 * We need to truncate the xattr storage first.
+		 *
+		 * If both the old and new values are stored in
+		 * outside blocks, we only need to truncate
+		 * the storage and then set the value outside.
+		 *
+		 * If the new value should be stored within the block,
+		 * we should free all the outside blocks first and
+		 * the modification to the xattr block will be done
+		 * by the following steps.
+		 */
+		if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+			value_len = xi->value_len;
+		else
+			value_len = 0;
+
+		ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
+							   value_len);
+		if (ret)
+			goto out;
+
+		if (value_len)
+			goto set_value_outside;
+	}
+
+	value_len = xi->value_len;
+	/* So we have to handle the inside block change now. */
+	if (value_len > OCFS2_XATTR_INLINE_SIZE) {
+		/*
+		 * If the new value will be stored outside of the block,
+		 * initialize a new empty value root and insert it first.
+		 */
+		local = 0;
+		xi->value = &def_xv;
+		xi->value_len = OCFS2_XATTR_ROOT_SIZE;
+	}
+
+	ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash,
+					      local, &bucket_empty);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (value_len > OCFS2_XATTR_INLINE_SIZE) {
+		/* allocate the space now for the outside block storage. */
+		ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
+							   value_len);
+		if (ret) {
+			mlog_errno(ret);
+
+			if (xs->not_found) {
+				/*
+				 * We can't allocate enough clusters for outside
+				 * storage and we have allocated xattr already,
+				 * so need to remove it.
+				 */
+				ocfs2_xattr_bucket_remove_xs(inode, xs);
+			}
+			goto out;
+		}
+	} else {
+		if (bucket_empty)
+			ret = ocfs2_xattr_bucket_shrink(inode, xi,
+							xs, name_hash);
+		goto out;
+	}
+
+set_value_outside:
+	ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
+out:
+	return ret;
+}
+
+/* check whether the xattr bucket is filled up with the same hash value. */
+static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
+					      struct ocfs2_xattr_bucket *bucket)
+{
+	struct ocfs2_xattr_header *xh = bucket->xh;
+
+	if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
+	    xh->xh_entries[0].xe_name_hash) {
+		mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
+		     "hash = %u\n",
+		     (unsigned long long)bucket->bhs[0]->b_blocknr,
+		     le32_to_cpu(xh->xh_entries[0].xe_name_hash));
+		return -ENOSPC;
+	}
+
+	return 0;
+}
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+					     struct ocfs2_xattr_info *xi,
+					     struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_header *xh;
+	struct ocfs2_xattr_entry *xe;
+	u16 count, header_size, xh_free_start;
+	int i, free, max_free, need, old;
+	size_t value_size = 0, name_len = strlen(xi->name);
+	size_t blocksize = inode->i_sb->s_blocksize;
+	int ret, allocation = 0;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	mlog_entry("Set xattr %s in xattr index block\n", xi->name);
+
+try_again:
+	xh = xs->header;
+	count = le16_to_cpu(xh->xh_count);
+	xh_free_start = le16_to_cpu(xh->xh_free_start);
+	header_size = sizeof(struct ocfs2_xattr_header) +
+			count * sizeof(struct ocfs2_xattr_entry);
+	max_free = OCFS2_XATTR_BUCKET_SIZE -
+		le16_to_cpu(xh->xh_name_value_len) - header_size;
+
+	mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
+			"of %u which exceed block size\n",
+			(unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+			header_size);
+
+	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+		value_size = OCFS2_XATTR_ROOT_SIZE;
+	else if (xi->value)
+		value_size = OCFS2_XATTR_SIZE(xi->value_len);
+
+	if (xs->not_found)
+		need = sizeof(struct ocfs2_xattr_entry) +
+			OCFS2_XATTR_SIZE(name_len) + value_size;
+	else {
+		need = value_size + OCFS2_XATTR_SIZE(name_len);
+
+		/*
+		 * We only replace the old value if the new length is smaller
+		 * than the old one. Otherwise we will allocate new space in the
+		 * bucket to store it.
+		 */
+		xe = xs->here;
+		if (ocfs2_xattr_is_local(xe))
+			old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+
+		if (old >= value_size)
+			need = 0;
+	}
+
+	free = xh_free_start - header_size;
+	/*
+	 * We need to make sure the new name/value pair
+	 * can exist in the same block.
+	 */
+	if (xh_free_start % blocksize < need)
+		free -= xh_free_start % blocksize;
+
+	mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
+	     "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
+	     " %u\n", xs->not_found,
+	     (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+	     free, need, max_free, le16_to_cpu(xh->xh_free_start),
+	     le16_to_cpu(xh->xh_name_value_len));
+
+	if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+		if (need <= max_free &&
+		    count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+			/*
+			 * We can create the space by defragmenting. Since only
+			 * the name/value will be moved, the xe shouldn't be
+			 * changed in xs.
+			 */
+			ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			xh_free_start = le16_to_cpu(xh->xh_free_start);
+			free = xh_free_start - header_size;
+			if (xh_free_start % blocksize < need)
+				free -= xh_free_start % blocksize;
+
+			if (free >= need)
+				goto xattr_set;
+
+			mlog(0, "Can't get enough space for xattr insert by "
+			     "defragment. Need %u bytes, but we have %d, so "
+			     "allocate new bucket for it.\n", need, free);
+		}
+
+		/*
+		 * We have to add new buckets or clusters and one
+		 * allocation should leave us enough space for insert.
+		 */
+		BUG_ON(allocation);
+
+		/*
+		 * We do not allow for overlapping ranges between buckets. And
+		 * the maximum number of collisions we will allow for then is
+		 * one bucket's worth, so check it here whether we need to
+		 * add a new bucket for the insert.
+		 */
+		ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_add_new_xattr_bucket(inode,
+						 xs->xattr_bh,
+						 xs->bucket.bhs[0]);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(xs->bucket.bhs[i]);
+
+		memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+		ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
+						   xi->name_index,
+						   xi->name, xs);
+		if (ret && ret != -ENODATA)
+			goto out;
+		xs->not_found = ret;
+		allocation = 1;
+		goto try_again;
+	}
+
+xattr_set:
+	ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
+out:
+	mlog_exit(ret);
+	return ret;
+}
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 02afa87d5e6..c25c7c62a05 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -57,4 +57,12 @@ static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
 {
 	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
 }
+
+static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
+{
+	u16 len = sb->s_blocksize -
+		 offsetof(struct ocfs2_xattr_header, xh_entries);
+
+	return len / sizeof(struct ocfs2_xattr_entry);
+}
 #endif /* OCFS2_XATTR_H */
-- 
cgit v1.2.3


From a394425643e1e9c3a624d629fc8ba5633d8474c6 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 18 Aug 2008 17:38:54 +0800
Subject: ocfs2: Delete all xattr buckets during inode removal

During inode removal, we need to iterate over all the buckets, remove any
externally-stored EA values and delete the xattr buckets.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 80 insertions(+), 4 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 5e8fae94888..9ec7136b3ad 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -131,6 +131,9 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 					     struct ocfs2_xattr_info *xi,
 					     struct ocfs2_xattr_search *xs);
 
+static int ocfs2_delete_xattr_index_block(struct inode *inode,
+					  struct buffer_head *xb_bh);
+
 static inline struct xattr_handler *ocfs2_xattr_handler(int name_index)
 {
 	struct xattr_handler *handler = NULL;
@@ -1511,13 +1514,14 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
 				    struct buffer_head *blk_bh)
 {
 	struct ocfs2_xattr_block *xb;
-	struct ocfs2_xattr_header *header;
 	int ret = 0;
 
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-	header = &(xb->xb_attrs.xb_header);
-
-	ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
+		ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+	} else
+		ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
 
 	return ret;
 }
@@ -4738,3 +4742,75 @@ out:
 	mlog_exit(ret);
 	return ret;
 }
+
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+					struct ocfs2_xattr_bucket *bucket,
+					void *para)
+{
+	int ret = 0;
+	struct ocfs2_xattr_header *xh = bucket->xh;
+	u16 i;
+	struct ocfs2_xattr_entry *xe;
+
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = ocfs2_xattr_bucket_value_truncate(inode,
+							bucket->bhs[0],
+							i, 0);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_delete_xattr_index_block(struct inode *inode,
+					  struct buffer_head *xb_bh)
+{
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+	int ret = 0;
+	u32 name_hash = UINT_MAX, e_cpos, num_clusters;
+	u64 p_blkno;
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return 0;
+
+	while (name_hash > 0) {
+		ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+					  &e_cpos, &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
+						  ocfs2_delete_xattr_in_bucket,
+						  NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
+					     p_blkno, e_cpos, num_clusters);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		if (e_cpos == 0)
+			break;
+
+		name_hash = e_cpos - 1;
+	}
+
+out:
+	return ret;
+}
-- 
cgit v1.2.3


From 8154da3d2114241cf3edb108b43e2172be86d483 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Mon, 18 Aug 2008 17:11:46 +0800
Subject: ocfs2: Add incompatible flag for extended attribute

This patch adds the s_incompat flag for extended attribute support. This
helps us ensure that older versions of Ocfs2 or ocfs2-tools will not be able
to mount a volume with xattr support.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2.h    |  7 +++++++
 fs/ocfs2/ocfs2_fs.h | 19 +++++++++++++------
 fs/ocfs2/super.c    |  3 ++-
 fs/ocfs2/xattr.c    | 12 ++++++++++++
 4 files changed, 34 insertions(+), 7 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index cae0dd4b7f7..6d3c10ddf48 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -363,6 +363,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
 	return 0;
 }
 
+static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)
+		return 1;
+	return 0;
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 8d5e72f2c5c..f24ce3d3f95 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -91,7 +91,8 @@
 					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
 					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
 					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
-					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
+					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
+					 | OCFS2_FEATURE_INCOMPAT_XATTR)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
 
 /*
@@ -128,10 +129,6 @@
 /* Support for data packed into inode blocks */
 #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA	0x0040
 
-/* Support for the extended slot map */
-#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
-
-
 /*
  * Support for alternate, userspace cluster stacks.  If set, the superblock
  * field s_cluster_info contains a tag for the alternate stack in use as
@@ -143,6 +140,12 @@
  */
 #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK	0x0080
 
+/* Support for the extended slot map */
+#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
+
+/* Support for extended attributes */
+#define OCFS2_FEATURE_INCOMPAT_XATTR		0x0200
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -578,7 +581,11 @@ struct ocfs2_super_block {
 /*A0*/  struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
 						     stack.  Only valid
 						     with INCOMPAT flag. */
-/*B8*/  __le64 s_reserved2[17];		/* Fill out superblock */
+/*B8*/	__le16 s_xattr_inline_size;	/* extended attribute inline size
+					   for this fs */
+	__le16 s_reserved0;
+	__le32 s_reserved1;
+/*C0*/  __le64 s_reserved2[16];		/* Fill out superblock */
 /*140*/
 
 	/*
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3b04f5d2e89..c85e525950a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1437,7 +1437,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
 	osb->slot_num = OCFS2_INVALID_SLOT;
 
-	osb->s_xattr_inline_size = OCFS2_MIN_XATTR_INLINE_SIZE;
+	osb->s_xattr_inline_size = le16_to_cpu(
+					di->id2.i_super.s_xattr_inline_size);
 
 	osb->local_alloc_state = OCFS2_LA_UNUSED;
 	osb->local_alloc_bh = NULL;
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 9ec7136b3ad..090449f9263 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -564,6 +564,9 @@ ssize_t ocfs2_listxattr(struct dentry *dentry,
 	struct ocfs2_dinode *di = NULL;
 	struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode);
 
+	if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb)))
+		return -EOPNOTSUPP;
+
 	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
 		return ret;
 
@@ -843,6 +846,9 @@ int ocfs2_xattr_get(struct inode *inode,
 		.not_found = -ENODATA,
 	};
 
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
 	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
 		ret = -ENODATA;
 
@@ -1541,6 +1547,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 	handle_t *handle;
 	int ret;
 
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return 0;
+
 	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
 		return 0;
 
@@ -1977,6 +1986,9 @@ int ocfs2_xattr_set(struct inode *inode,
 		.not_found = -ENODATA,
 	};
 
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
 	ret = ocfs2_inode_lock(inode, &di_bh, 1);
 	if (ret < 0) {
 		mlog_errno(ret);
-- 
cgit v1.2.3


From ff1ec20ef65d51cc3466e86912cdeaac16f3aaa0 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 19 Aug 2008 10:54:29 -0700
Subject: ocfs2: fix printk format warnings

This patch fixes the following build warnings:

fs/ocfs2/xattr.c: In function 'ocfs2_half_xattr_bucket':
fs/ocfs2/xattr.c:3282: warning: format '%d' expects type 'int', but argument 7 has type 'long int'
fs/ocfs2/xattr.c:3282: warning: format '%d' expects type 'int', but argument 8 has type 'long int'
fs/ocfs2/xattr.c:3282: warning: format '%d' expects type 'int', but argument 7 has type 'long int'
fs/ocfs2/xattr.c:3282: warning: format '%d' expects type 'int', but argument 8 has type 'long int'
fs/ocfs2/xattr.c:3282: warning: format '%d' expects type 'int', but argument 7 has type 'long int'
fs/ocfs2/xattr.c:3282: warning: format '%d' expects type 'int', but argument 8 has type 'long int'
fs/ocfs2/xattr.c: In function 'ocfs2_xattr_set_entry_in_bucket':
fs/ocfs2/xattr.c:4092: warning: format '%d' expects type 'int', but argument 6 has type 'size_t'
fs/ocfs2/xattr.c:4092: warning: format '%d' expects type 'int', but argument 6 has type 'size_t'
fs/ocfs2/xattr.c:4092: warning: format '%d' expects type 'int', but argument 6 has type 'size_t'

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 090449f9263..1b349c7367a 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3264,7 +3264,8 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
 	xe = &xh->xh_entries[start];
 	len = sizeof(struct ocfs2_xattr_entry) * (count - start);
 	mlog(0, "mv xattr entry len %d from %d to %d\n", len,
-	     (char *)xe - (char *)xh, (char *)xh->xh_entries - (char *)xh);
+	     (int)((char *)xe - (char *)xh),
+	     (int)((char *)xh->xh_entries - (char *)xh));
 	memmove((char *)xh->xh_entries, (char *)xe, len);
 	xe = &xh->xh_entries[count - start];
 	len = sizeof(struct ocfs2_xattr_entry) * start;
@@ -4073,8 +4074,8 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog(0, "Set xattr entry len = %d index = %d in bucket %llu\n",
-	     xi->value_len, xi->name_index,
+	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
+	     (unsigned long)xi->value_len, xi->name_index,
 	     (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
 
 	if (!xs->bucket.bhs[1]) {
-- 
cgit v1.2.3


From 35dc0aa3c5e7391319754e0c19cdfc0a28eb5b25 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 20 Aug 2008 16:25:06 -0700
Subject: ocfs2: Prefix the extent tree operations structure.

The ocfs2_extent_tree_operations structure gains a field prefix on its
members.  The ->eo_sanity_check() operation gains a wrapper function for
completeness.  All of the extent tree operation wrappers gain a
consistent name (ocfs2_et_*()).

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 85 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 46 insertions(+), 39 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f65cb43edb7..f2e35a8f019 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -65,12 +65,13 @@
 struct ocfs2_extent_tree;
 
 struct ocfs2_extent_tree_operations {
-	void (*set_last_eb_blk) (struct ocfs2_extent_tree *et, u64 blkno);
-	u64 (*get_last_eb_blk) (struct ocfs2_extent_tree *et);
-	void (*update_clusters) (struct inode *inode,
-				 struct ocfs2_extent_tree *et,
-				 u32 new_clusters);
-	int (*sanity_check) (struct inode *inode, struct ocfs2_extent_tree *et);
+	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
+				   u64 blkno);
+	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
+	void (*eo_update_clusters)(struct inode *inode,
+				   struct ocfs2_extent_tree *et,
+				   u32 new_clusters);
+	int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
 };
 
 struct ocfs2_extent_tree {
@@ -132,10 +133,10 @@ static int ocfs2_dinode_sanity_check(struct inode *inode,
 }
 
 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
-	.set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
-	.get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
-	.update_clusters	= ocfs2_dinode_update_clusters,
-	.sanity_check		= ocfs2_dinode_sanity_check,
+	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_dinode_update_clusters,
+	.eo_sanity_check	= ocfs2_dinode_sanity_check,
 };
 
 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
@@ -172,10 +173,10 @@ static int ocfs2_xattr_value_sanity_check(struct inode *inode,
 }
 
 static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = {
-	.set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
-	.get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
-	.update_clusters	= ocfs2_xattr_value_update_clusters,
-	.sanity_check		= ocfs2_xattr_value_sanity_check,
+	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
+	.eo_sanity_check	= ocfs2_xattr_value_sanity_check,
 };
 
 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
@@ -214,10 +215,10 @@ static int ocfs2_xattr_tree_sanity_check(struct inode *inode,
 }
 
 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
-	.set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
-	.get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
-	.update_clusters	= ocfs2_xattr_tree_update_clusters,
-	.sanity_check		= ocfs2_xattr_tree_sanity_check,
+	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
+	.eo_sanity_check	= ocfs2_xattr_tree_sanity_check,
 };
 
 static struct ocfs2_extent_tree*
@@ -265,22 +266,28 @@ static void ocfs2_free_extent_tree(struct ocfs2_extent_tree *et)
 	}
 }
 
-static inline void ocfs2_set_last_eb_blk(struct ocfs2_extent_tree *et,
-					 u64 new_last_eb_blk)
+static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					    u64 new_last_eb_blk)
 {
-	et->eops->set_last_eb_blk(et, new_last_eb_blk);
+	et->eops->eo_set_last_eb_blk(et, new_last_eb_blk);
 }
 
-static inline u64 ocfs2_get_last_eb_blk(struct ocfs2_extent_tree *et)
+static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
-	return et->eops->get_last_eb_blk(et);
+	return et->eops->eo_get_last_eb_blk(et);
 }
 
-static inline void ocfs2_update_clusters(struct inode *inode,
-					 struct ocfs2_extent_tree *et,
-					 u32 clusters)
+static inline void ocfs2_et_update_clusters(struct inode *inode,
+					    struct ocfs2_extent_tree *et,
+					    u32 clusters)
+{
+	et->eops->eo_update_clusters(inode, et, clusters);
+}
+
+static inline int ocfs2_et_sanity_check(struct inode *inode,
+					struct ocfs2_extent_tree *et)
 {
-	et->eops->update_clusters(inode, et, clusters);
+	return et->eops->eo_sanity_check(inode, et);
 }
 
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
@@ -913,7 +920,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	/* fe needs a new last extent block pointer, as does the
 	 * next_leaf on the previously last-extent-block. */
-	ocfs2_set_last_eb_blk(et, new_last_eb_blk);
+	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
 
 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
@@ -1029,7 +1036,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	/* If this is our 1st tree depth shift, then last_eb_blk
 	 * becomes the allocated extent block */
 	if (root_el->l_tree_depth == cpu_to_le16(1))
-		ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
+		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
 	status = ocfs2_journal_dirty(handle, et->root_bh);
 	if (status < 0) {
@@ -2427,7 +2434,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		ocfs2_update_edge_lengths(inode, handle, left_path);
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
-		ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
+		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
 		/*
 		 * Removal of the extent in the left leaf was skipped
@@ -2688,7 +2695,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 	struct ocfs2_extent_list *el;
 
 
-	ret = et->eops->sanity_check(inode, et);
+	ret = ocfs2_et_sanity_check(inode, et);
 	if (ret)
 		goto out;
 	/*
@@ -2747,7 +2754,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		ocfs2_update_edge_lengths(inode, handle, left_path);
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
-		ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
+		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 	} else {
 		/*
 		 * 'path' is also the leftmost path which
@@ -2763,7 +2770,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		el->l_next_free_rec = 0;
 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
 
-		ocfs2_set_last_eb_blk(et, 0);
+		ocfs2_et_set_last_eb_blk(et, 0);
 	}
 
 	ocfs2_journal_dirty(handle, path_root_bh(path));
@@ -3980,8 +3987,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
 out_update_clusters:
 	if (type->ins_split == SPLIT_NONE)
-		ocfs2_update_clusters(inode, et,
-				      le16_to_cpu(insert_rec->e_leaf_clusters));
+		ocfs2_et_update_clusters(inode, et,
+					 le16_to_cpu(insert_rec->e_leaf_clusters));
 
 	ret = ocfs2_journal_dirty(handle, et->root_bh);
 	if (ret)
@@ -4229,7 +4236,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		 * may want it later.
 		 */
 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       ocfs2_get_last_eb_blk(et), &bh,
+				       ocfs2_et_get_last_eb_blk(et), &bh,
 				       OCFS2_BH_CACHED, inode);
 		if (ret) {
 			mlog_exit(ret);
@@ -4306,7 +4313,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 	 * the case that we're doing a tail append, so maybe we can
 	 * take advantage of that information somehow.
 	 */
-	if (ocfs2_get_last_eb_blk(et) ==
+	if (ocfs2_et_get_last_eb_blk(et) ==
 	    path_leaf_bh(path)->b_blocknr) {
 		/*
 		 * Ok, ocfs2_find_path() returned us the rightmost
@@ -4814,7 +4821,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 		struct ocfs2_extent_block *eb;
 
 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       ocfs2_get_last_eb_blk(et),
+				       ocfs2_et_get_last_eb_blk(et),
 				       &last_eb_bh, OCFS2_BH_CACHED, inode);
 		if (ret) {
 			mlog_exit(ret);
@@ -4981,7 +4988,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
 	depth = path->p_tree_depth;
 	if (depth > 0) {
 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       ocfs2_get_last_eb_blk(et),
+				       ocfs2_et_get_last_eb_blk(et),
 				       &last_eb_bh, OCFS2_BH_CACHED, inode);
 		if (ret < 0) {
 			mlog_errno(ret);
-- 
cgit v1.2.3


From ce1d9ea621291ed5e985d6677278c6bb20d96a40 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 20 Aug 2008 16:30:07 -0700
Subject: ocfs2: Prefix the ocfs2_extent_tree structure.

The members of the ocfs2_extent_tree structure gain a prefix of 'et_'.
All users are updated.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 118 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 61 insertions(+), 57 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f2e35a8f019..4ade2b259e6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -75,28 +75,30 @@ struct ocfs2_extent_tree_operations {
 };
 
 struct ocfs2_extent_tree {
-	enum ocfs2_extent_tree_type type;
-	struct ocfs2_extent_tree_operations *eops;
-	struct buffer_head *root_bh;
-	struct ocfs2_extent_list *root_el;
-	void *private;
-	unsigned int max_leaf_clusters;
+	enum ocfs2_extent_tree_type		et_type;
+	struct ocfs2_extent_tree_operations	*et_ops;
+	struct buffer_head			*et_root_bh;
+	struct ocfs2_extent_list		*et_root_el;
+	void					*et_private;
+	unsigned int				et_max_leaf_clusters;
 };
 
 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					 u64 blkno)
 {
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
+	struct ocfs2_dinode *di =
+		(struct ocfs2_dinode *)et->et_root_bh->b_data;
 
-	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
+	BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
 	di->i_last_eb_blk = cpu_to_le64(blkno);
 }
 
 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
+	struct ocfs2_dinode *di =
+		(struct ocfs2_dinode *)et->et_root_bh->b_data;
 
-	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
+	BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
 	return le64_to_cpu(di->i_last_eb_blk);
 }
 
@@ -105,7 +107,7 @@ static void ocfs2_dinode_update_clusters(struct inode *inode,
 					 u32 clusters)
 {
 	struct ocfs2_dinode *di =
-			(struct ocfs2_dinode *)et->root_bh->b_data;
+			(struct ocfs2_dinode *)et->et_root_bh->b_data;
 
 	le32_add_cpu(&di->i_clusters, clusters);
 	spin_lock(&OCFS2_I(inode)->ip_lock);
@@ -119,9 +121,9 @@ static int ocfs2_dinode_sanity_check(struct inode *inode,
 	int ret = 0;
 	struct ocfs2_dinode *di;
 
-	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
+	BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
 
-	di = (struct ocfs2_dinode *)et->root_bh->b_data;
+	di = (struct ocfs2_dinode *)et->et_root_bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(di)) {
 		ret = -EIO;
 		ocfs2_error(inode->i_sb,
@@ -143,7 +145,7 @@ static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					      u64 blkno)
 {
 	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *)et->private;
+		(struct ocfs2_xattr_value_root *)et->et_private;
 
 	xv->xr_last_eb_blk = cpu_to_le64(blkno);
 }
@@ -151,7 +153,7 @@ static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
 	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *) et->private;
+		(struct ocfs2_xattr_value_root *) et->et_private;
 
 	return le64_to_cpu(xv->xr_last_eb_blk);
 }
@@ -161,7 +163,7 @@ static void ocfs2_xattr_value_update_clusters(struct inode *inode,
 					      u32 clusters)
 {
 	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *)et->private;
+		(struct ocfs2_xattr_value_root *)et->et_private;
 
 	le32_add_cpu(&xv->xr_clusters, clusters);
 }
@@ -183,7 +185,7 @@ static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					     u64 blkno)
 {
 	struct ocfs2_xattr_block *xb =
-		(struct ocfs2_xattr_block *) et->root_bh->b_data;
+		(struct ocfs2_xattr_block *) et->et_root_bh->b_data;
 	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
 
 	xt->xt_last_eb_blk = cpu_to_le64(blkno);
@@ -192,7 +194,7 @@ static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
 	struct ocfs2_xattr_block *xb =
-		(struct ocfs2_xattr_block *) et->root_bh->b_data;
+		(struct ocfs2_xattr_block *) et->et_root_bh->b_data;
 	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
 
 	return le64_to_cpu(xt->xt_last_eb_blk);
@@ -203,7 +205,7 @@ static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
 					     u32 clusters)
 {
 	struct ocfs2_xattr_block *xb =
-			(struct ocfs2_xattr_block *)et->root_bh->b_data;
+			(struct ocfs2_xattr_block *)et->et_root_bh->b_data;
 
 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
 }
@@ -233,25 +235,26 @@ static struct ocfs2_extent_tree*
 	if (!et)
 		return NULL;
 
-	et->type = et_type;
+	et->et_type = et_type;
 	get_bh(bh);
-	et->root_bh = bh;
-	et->private = private;
+	et->et_root_bh = bh;
+	et->et_private = private;
 
 	if (et_type == OCFS2_DINODE_EXTENT) {
-		et->root_el = &((struct ocfs2_dinode *)bh->b_data)->id2.i_list;
-		et->eops = &ocfs2_dinode_et_ops;
+		et->et_root_el =
+			&((struct ocfs2_dinode *)bh->b_data)->id2.i_list;
+		et->et_ops = &ocfs2_dinode_et_ops;
 	} else if (et_type == OCFS2_XATTR_VALUE_EXTENT) {
 		struct ocfs2_xattr_value_root *xv =
 			(struct ocfs2_xattr_value_root *) private;
-		et->root_el = &xv->xr_list;
-		et->eops = &ocfs2_xattr_et_ops;
+		et->et_root_el = &xv->xr_list;
+		et->et_ops = &ocfs2_xattr_et_ops;
 	} else if (et_type == OCFS2_XATTR_TREE_EXTENT) {
 		struct ocfs2_xattr_block *xb =
 			(struct ocfs2_xattr_block *)bh->b_data;
-		et->root_el = &xb->xb_attrs.xb_root.xt_list;
-		et->eops = &ocfs2_xattr_tree_et_ops;
-		et->max_leaf_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+		et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
+		et->et_ops = &ocfs2_xattr_tree_et_ops;
+		et->et_max_leaf_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
 						OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
 	}
 
@@ -261,7 +264,7 @@ static struct ocfs2_extent_tree*
 static void ocfs2_free_extent_tree(struct ocfs2_extent_tree *et)
 {
 	if (et) {
-		brelse(et->root_bh);
+		brelse(et->et_root_bh);
 		kfree(et);
 	}
 }
@@ -269,25 +272,25 @@ static void ocfs2_free_extent_tree(struct ocfs2_extent_tree *et)
 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					    u64 new_last_eb_blk)
 {
-	et->eops->eo_set_last_eb_blk(et, new_last_eb_blk);
+	et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
 }
 
 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
-	return et->eops->eo_get_last_eb_blk(et);
+	return et->et_ops->eo_get_last_eb_blk(et);
 }
 
 static inline void ocfs2_et_update_clusters(struct inode *inode,
 					    struct ocfs2_extent_tree *et,
 					    u32 clusters)
 {
-	et->eops->eo_update_clusters(inode, et, clusters);
+	et->et_ops->eo_update_clusters(inode, et, clusters);
 }
 
 static inline int ocfs2_et_sanity_check(struct inode *inode,
 					struct ocfs2_extent_tree *et)
 {
-	return et->eops->eo_sanity_check(inode, et);
+	return et->et_ops->eo_sanity_check(inode, et);
 }
 
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
@@ -805,7 +808,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 		el = &eb->h_list;
 	} else
-		el = et->root_el;
+		el = et->et_root_el;
 
 	/* we never add a branch to a leaf. */
 	BUG_ON(!el->l_tree_depth);
@@ -895,7 +898,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		mlog_errno(status);
 		goto bail;
 	}
-	status = ocfs2_journal_access(handle, inode, et->root_bh,
+	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -928,7 +931,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	status = ocfs2_journal_dirty(handle, *last_eb_bh);
 	if (status < 0)
 		mlog_errno(status);
-	status = ocfs2_journal_dirty(handle, et->root_bh);
+	status = ocfs2_journal_dirty(handle, et->et_root_bh);
 	if (status < 0)
 		mlog_errno(status);
 	if (eb_bh) {
@@ -994,7 +997,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	}
 
 	eb_el = &eb->h_list;
-	root_el = et->root_el;
+	root_el = et->et_root_el;
 
 	status = ocfs2_journal_access(handle, inode, new_eb_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
@@ -1015,7 +1018,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, inode, et->root_bh,
+	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1038,7 +1041,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	if (root_el->l_tree_depth == cpu_to_le16(1))
 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
-	status = ocfs2_journal_dirty(handle, et->root_bh);
+	status = ocfs2_journal_dirty(handle, et->et_root_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1088,7 +1091,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 	*target_bh = NULL;
 
-	el = et->root_el;
+	el = et->et_root_el;
 
 	while(le16_to_cpu(el->l_tree_depth) > 1) {
 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
@@ -1140,7 +1143,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 	/* If we didn't find one and the fe doesn't have any room,
 	 * then return '1' */
-	el = et->root_el;
+	el = et->et_root_el;
 	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
 		status = 1;
 
@@ -1169,7 +1172,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
 			   struct ocfs2_alloc_context *meta_ac)
 {
 	int ret, shift;
-	struct ocfs2_extent_list *el = et->root_el;
+	struct ocfs2_extent_list *el = et->et_root_el;
 	int depth = le16_to_cpu(el->l_tree_depth);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *bh = NULL;
@@ -2765,7 +2768,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		 */
 		ocfs2_unlink_path(inode, handle, dealloc, path, 1);
 
-		el = et->root_el;
+		el = et->et_root_el;
 		el->l_tree_depth = 0;
 		el->l_next_free_rec = 0;
 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
@@ -3898,9 +3901,9 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 	struct ocfs2_path *left_path = NULL;
 	struct ocfs2_extent_list *el;
 
-	el = et->root_el;
+	el = et->et_root_el;
 
-	ret = ocfs2_journal_access(handle, inode, et->root_bh,
+	ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
@@ -3912,7 +3915,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		goto out_update_clusters;
 	}
 
-	right_path = ocfs2_new_path(et->root_bh, et->root_el);
+	right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!right_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -3962,7 +3965,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		 * ocfs2_rotate_tree_right() might have extended the
 		 * transaction without re-journaling our tree root.
 		 */
-		ret = ocfs2_journal_access(handle, inode, et->root_bh,
+		ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
@@ -3990,7 +3993,7 @@ out_update_clusters:
 		ocfs2_et_update_clusters(inode, et,
 					 le16_to_cpu(insert_rec->e_leaf_clusters));
 
-	ret = ocfs2_journal_dirty(handle, et->root_bh);
+	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
 	if (ret)
 		mlog_errno(ret);
 
@@ -4148,7 +4151,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
 		 * Caller might want us to limit the size of extents, don't
 		 * calculate contiguousness if we might exceed that limit.
 		 */
-		if (et->max_leaf_clusters && len > et->max_leaf_clusters)
+		if (et->et_max_leaf_clusters &&
+		    (len > et->et_max_leaf_clusters))
 			insert->ins_contig = CONTIG_NONE;
 	}
 }
@@ -4225,7 +4229,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 
 	insert->ins_split = SPLIT_NONE;
 
-	el = et->root_el;
+	el = et->et_root_el;
 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
 
 	if (el->l_tree_depth) {
@@ -4263,7 +4267,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		return 0;
 	}
 
-	path = ocfs2_new_path(et->root_bh, et->root_el);
+	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4404,7 +4408,7 @@ static int ocfs2_insert_extent(struct ocfs2_super *osb,
 	status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
 	if (status < 0)
 		mlog_errno(status);
-	else if (et->type == OCFS2_DINODE_EXTENT)
+	else if (et->et_type == OCFS2_DINODE_EXTENT)
 		ocfs2_extent_map_insert_rec(inode, &rec);
 
 bail:
@@ -4678,7 +4682,7 @@ leftright:
 	 */
 	rec = path_leaf_el(path)->l_recs[split_index];
 
-	rightmost_el = et->root_el;
+	rightmost_el = et->et_root_el;
 
 	depth = le16_to_cpu(rightmost_el->l_tree_depth);
 	if (depth) {
@@ -4921,7 +4925,7 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 	if (et_type == OCFS2_DINODE_EXTENT)
 		ocfs2_extent_map_trunc(inode, 0);
 
-	left_path = ocfs2_new_path(et->root_bh, et->root_el);
+	left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -5001,7 +5005,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
 		rightmost_el = path_leaf_el(path);
 
 	credits += path->p_tree_depth +
-		   ocfs2_extend_meta_needed(et->root_el);
+		   ocfs2_extend_meta_needed(et->et_root_el);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -5214,7 +5218,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 
 	ocfs2_extent_map_trunc(inode, 0);
 
-	path = ocfs2_new_path(et->root_bh, et->root_el);
+	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
-- 
cgit v1.2.3


From dc0ce61af418305afa7e0d05d86ab334e0daabf7 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 20 Aug 2008 16:48:35 -0700
Subject: ocfs2: Make ocfs2_extent_tree get/put instead of alloc.

Rather than allocating a struct ocfs2_extent_tree, just put it on the
stack.  Fill it with ocfs2_get_extent_tree() and drop it with
ocfs2_put_extent_tree().  Now the callers don't have to handle ENOMEM,
yet they still safely hold a reference on the root_bh.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 117 +++++++++++++++++--------------------------------------
 1 file changed, 36 insertions(+), 81 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4ade2b259e6..c200d332168 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -223,22 +223,17 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 	.eo_sanity_check	= ocfs2_xattr_tree_sanity_check,
 };
 
-static struct ocfs2_extent_tree*
-	 ocfs2_new_extent_tree(struct inode *inode,
-			       struct buffer_head *bh,
-			       enum ocfs2_extent_tree_type et_type,
-			       void *private)
+static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
+				  struct inode *inode,
+				  struct buffer_head *bh,
+				  enum ocfs2_extent_tree_type et_type,
+				  void *private)
 {
-	struct ocfs2_extent_tree *et;
-
-	et = kzalloc(sizeof(*et), GFP_NOFS);
-	if (!et)
-		return NULL;
-
 	et->et_type = et_type;
 	get_bh(bh);
 	et->et_root_bh = bh;
 	et->et_private = private;
+	et->et_max_leaf_clusters = 0;
 
 	if (et_type == OCFS2_DINODE_EXTENT) {
 		et->et_root_el =
@@ -257,16 +252,11 @@ static struct ocfs2_extent_tree*
 		et->et_max_leaf_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
 						OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
 	}
-
-	return et;
 }
 
-static void ocfs2_free_extent_tree(struct ocfs2_extent_tree *et)
+static void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et)
 {
-	if (et) {
-		brelse(et->et_root_bh);
-		kfree(et);
-	}
+	brelse(et->et_root_bh);
 }
 
 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
@@ -4430,22 +4420,15 @@ int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
 			       struct ocfs2_alloc_context *meta_ac)
 {
 	int status;
-	struct ocfs2_extent_tree *et = NULL;
-
-	et = ocfs2_new_extent_tree(inode, root_bh, OCFS2_DINODE_EXTENT, NULL);
-	if (!et) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto bail;
-	}
+	struct ocfs2_extent_tree et;
 
+	ocfs2_get_extent_tree(&et, inode, root_bh, OCFS2_DINODE_EXTENT,
+			      NULL);
 	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
 				     cpos, start_blk, new_clusters,
-				     flags, meta_ac, et);
+				     flags, meta_ac, &et);
+	ocfs2_put_extent_tree(&et);
 
-	if (et)
-		ocfs2_free_extent_tree(et);
-bail:
 	return status;
 }
 
@@ -4461,23 +4444,15 @@ int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
 				    void *private)
 {
 	int status;
-	struct ocfs2_extent_tree *et = NULL;
-
-	et = ocfs2_new_extent_tree(inode, root_bh,
-				   OCFS2_XATTR_VALUE_EXTENT, private);
-	if (!et) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto bail;
-	}
+	struct ocfs2_extent_tree et;
 
+	ocfs2_get_extent_tree(&et, inode, root_bh,
+			      OCFS2_XATTR_VALUE_EXTENT, private);
 	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
 				     cpos, start_blk, new_clusters,
-				     flags, meta_ac, et);
+				     flags, meta_ac, &et);
+	ocfs2_put_extent_tree(&et);
 
-	if (et)
-		ocfs2_free_extent_tree(et);
-bail:
 	return status;
 }
 
@@ -4492,23 +4467,15 @@ int ocfs2_xattr_tree_insert_extent(struct ocfs2_super *osb,
 				   struct ocfs2_alloc_context *meta_ac)
 {
 	int status;
-	struct ocfs2_extent_tree *et = NULL;
-
-	et = ocfs2_new_extent_tree(inode, root_bh, OCFS2_XATTR_TREE_EXTENT,
-				   NULL);
-	if (!et) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto bail;
-	}
+	struct ocfs2_extent_tree et;
 
+	ocfs2_get_extent_tree(&et, inode, root_bh, OCFS2_XATTR_TREE_EXTENT,
+			      NULL);
 	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
 				     cpos, start_blk, new_clusters,
-				     flags, meta_ac, et);
+				     flags, meta_ac, &et);
+	ocfs2_put_extent_tree(&et);
 
-	if (et)
-		ocfs2_free_extent_tree(et);
-bail:
 	return status;
 }
 
@@ -4897,11 +4864,13 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 	struct ocfs2_extent_rec split_rec;
 	struct ocfs2_path *left_path = NULL;
 	struct ocfs2_extent_list *el;
-	struct ocfs2_extent_tree *et = NULL;
+	struct ocfs2_extent_tree et;
 
 	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
 	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
 
+	ocfs2_get_extent_tree(&et, inode, root_bh, et_type, private);
+
 	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
 		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
 			    "that are being written to, but the feature bit "
@@ -4911,13 +4880,6 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 		goto out;
 	}
 
-	et = ocfs2_new_extent_tree(inode, root_bh, et_type, private);
-	if (!et) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto out;
-	}
-
 	/*
 	 * XXX: This should be fixed up so that we just re-insert the
 	 * next extent records.
@@ -4925,7 +4887,7 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 	if (et_type == OCFS2_DINODE_EXTENT)
 		ocfs2_extent_map_trunc(inode, 0);
 
-	left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+	left_path = ocfs2_new_path(et.et_root_bh, et.et_root_el);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4956,7 +4918,7 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
 	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
 
-	ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
+	ret = __ocfs2_mark_extent_written(inode, &et, handle, left_path,
 					  index, &split_rec, meta_ac,
 					  dealloc);
 	if (ret)
@@ -4964,8 +4926,7 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 
 out:
 	ocfs2_free_path(left_path);
-	if (et)
-		ocfs2_free_extent_tree(et);
+	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
@@ -5207,18 +5168,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 	struct ocfs2_extent_rec *rec;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_path *path = NULL;
-	struct ocfs2_extent_tree *et = NULL;
+	struct ocfs2_extent_tree et;
 
-	et = ocfs2_new_extent_tree(inode, root_bh, et_type, private);
-	if (!et) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto out;
-	}
+	ocfs2_get_extent_tree(&et, inode, root_bh, et_type, private);
 
 	ocfs2_extent_map_trunc(inode, 0);
 
-	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+	path = ocfs2_new_path(et.et_root_bh, et.et_root_el);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -5271,13 +5227,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 
 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
-					 cpos, len, et);
+					 cpos, len, &et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	} else {
-		ret = ocfs2_split_tree(inode, et, handle, path, index,
+		ret = ocfs2_split_tree(inode, &et, handle, path, index,
 				       trunc_range, meta_ac);
 		if (ret) {
 			mlog_errno(ret);
@@ -5326,7 +5282,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 		}
 
 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
-					 cpos, len, et);
+					 cpos, len, &et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -5335,8 +5291,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 
 out:
 	ocfs2_free_path(path);
-	if (et)
-		ocfs2_free_extent_tree(et);
+	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
-- 
cgit v1.2.3


From ea5efa151265a743f48e3d371992a0100d73a0eb Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 20 Aug 2008 16:57:27 -0700
Subject: ocfs2: Make 'private' into 'object' on ocfs2_extent_tree.

The 'private' pointer was a way to store off xattr values, which don't
live at a set place in the bh.  But the concept of "the object
containing the extent tree" is much more generic.  For an inode it's the
struct ocfs2_dinode, for an xattr value it's the value.  Let's save off
the 'object' at all times.  If NULL is passed to
ocfs2_get_extent_tree(), 'object' is set to bh->b_data.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 62 ++++++++++++++++++++++++++------------------------------
 1 file changed, 29 insertions(+), 33 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index c200d332168..4cefcb6a47a 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -79,15 +79,14 @@ struct ocfs2_extent_tree {
 	struct ocfs2_extent_tree_operations	*et_ops;
 	struct buffer_head			*et_root_bh;
 	struct ocfs2_extent_list		*et_root_el;
-	void					*et_private;
+	void					*et_object;
 	unsigned int				et_max_leaf_clusters;
 };
 
 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					 u64 blkno)
 {
-	struct ocfs2_dinode *di =
-		(struct ocfs2_dinode *)et->et_root_bh->b_data;
+	struct ocfs2_dinode *di = et->et_object;
 
 	BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
 	di->i_last_eb_blk = cpu_to_le64(blkno);
@@ -95,8 +94,7 @@ static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
 
 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
-	struct ocfs2_dinode *di =
-		(struct ocfs2_dinode *)et->et_root_bh->b_data;
+	struct ocfs2_dinode *di = et->et_object;
 
 	BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
 	return le64_to_cpu(di->i_last_eb_blk);
@@ -106,8 +104,7 @@ static void ocfs2_dinode_update_clusters(struct inode *inode,
 					 struct ocfs2_extent_tree *et,
 					 u32 clusters)
 {
-	struct ocfs2_dinode *di =
-			(struct ocfs2_dinode *)et->et_root_bh->b_data;
+	struct ocfs2_dinode *di = et->et_object;
 
 	le32_add_cpu(&di->i_clusters, clusters);
 	spin_lock(&OCFS2_I(inode)->ip_lock);
@@ -123,7 +120,7 @@ static int ocfs2_dinode_sanity_check(struct inode *inode,
 
 	BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
 
-	di = (struct ocfs2_dinode *)et->et_root_bh->b_data;
+	di = et->et_object;
 	if (!OCFS2_IS_VALID_DINODE(di)) {
 		ret = -EIO;
 		ocfs2_error(inode->i_sb,
@@ -145,7 +142,7 @@ static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					      u64 blkno)
 {
 	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *)et->et_private;
+		(struct ocfs2_xattr_value_root *)et->et_object;
 
 	xv->xr_last_eb_blk = cpu_to_le64(blkno);
 }
@@ -153,7 +150,7 @@ static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
 	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *) et->et_private;
+		(struct ocfs2_xattr_value_root *) et->et_object;
 
 	return le64_to_cpu(xv->xr_last_eb_blk);
 }
@@ -163,7 +160,7 @@ static void ocfs2_xattr_value_update_clusters(struct inode *inode,
 					      u32 clusters)
 {
 	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *)et->et_private;
+		(struct ocfs2_xattr_value_root *)et->et_object;
 
 	le32_add_cpu(&xv->xr_clusters, clusters);
 }
@@ -184,8 +181,7 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = {
 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					     u64 blkno)
 {
-	struct ocfs2_xattr_block *xb =
-		(struct ocfs2_xattr_block *) et->et_root_bh->b_data;
+	struct ocfs2_xattr_block *xb = et->et_object;
 	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
 
 	xt->xt_last_eb_blk = cpu_to_le64(blkno);
@@ -193,8 +189,7 @@ static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
 
 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
-	struct ocfs2_xattr_block *xb =
-		(struct ocfs2_xattr_block *) et->et_root_bh->b_data;
+	struct ocfs2_xattr_block *xb = et->et_object;
 	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
 
 	return le64_to_cpu(xt->xt_last_eb_blk);
@@ -204,8 +199,7 @@ static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
 					     struct ocfs2_extent_tree *et,
 					     u32 clusters)
 {
-	struct ocfs2_xattr_block *xb =
-			(struct ocfs2_xattr_block *)et->et_root_bh->b_data;
+	struct ocfs2_xattr_block *xb = et->et_object;
 
 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
 }
@@ -227,26 +221,28 @@ static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
 				  struct inode *inode,
 				  struct buffer_head *bh,
 				  enum ocfs2_extent_tree_type et_type,
-				  void *private)
+				  void *obj)
 {
 	et->et_type = et_type;
 	get_bh(bh);
 	et->et_root_bh = bh;
-	et->et_private = private;
 	et->et_max_leaf_clusters = 0;
+	if (!obj)
+		obj = (void *)bh->b_data;
+	et->et_object = obj;
 
 	if (et_type == OCFS2_DINODE_EXTENT) {
 		et->et_root_el =
-			&((struct ocfs2_dinode *)bh->b_data)->id2.i_list;
+			&((struct ocfs2_dinode *)obj)->id2.i_list;
 		et->et_ops = &ocfs2_dinode_et_ops;
 	} else if (et_type == OCFS2_XATTR_VALUE_EXTENT) {
 		struct ocfs2_xattr_value_root *xv =
-			(struct ocfs2_xattr_value_root *) private;
+			(struct ocfs2_xattr_value_root *)obj;
 		et->et_root_el = &xv->xr_list;
 		et->et_ops = &ocfs2_xattr_et_ops;
 	} else if (et_type == OCFS2_XATTR_TREE_EXTENT) {
 		struct ocfs2_xattr_block *xb =
-			(struct ocfs2_xattr_block *)bh->b_data;
+			(struct ocfs2_xattr_block *)obj;
 		et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
 		et->et_ops = &ocfs2_xattr_tree_et_ops;
 		et->et_max_leaf_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
@@ -593,7 +589,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *root_bh,
 			   enum ocfs2_extent_tree_type type,
-			   void *private)
+			   void *obj)
 {
 	int retval;
 	struct ocfs2_extent_list *el = NULL;
@@ -617,7 +613,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 		el = &fe->id2.i_list;
 	} else if (type == OCFS2_XATTR_VALUE_EXTENT) {
 		struct ocfs2_xattr_value_root *xv =
-			(struct ocfs2_xattr_value_root *) private;
+			(struct ocfs2_xattr_value_root *) obj;
 
 		last_eb_blk = le64_to_cpu(xv->xr_last_eb_blk);
 		el = &xv->xr_list;
@@ -4441,13 +4437,13 @@ int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
 				    u32 new_clusters,
 				    u8 flags,
 				    struct ocfs2_alloc_context *meta_ac,
-				    void *private)
+				    void *obj)
 {
 	int status;
 	struct ocfs2_extent_tree et;
 
 	ocfs2_get_extent_tree(&et, inode, root_bh,
-			      OCFS2_XATTR_VALUE_EXTENT, private);
+			      OCFS2_XATTR_VALUE_EXTENT, obj);
 	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
 				     cpos, start_blk, new_clusters,
 				     flags, meta_ac, &et);
@@ -4498,7 +4494,7 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 				struct ocfs2_alloc_context *meta_ac,
 				enum ocfs2_alloc_restarted *reason_ret,
 				enum ocfs2_extent_tree_type type,
-				void *private)
+				void *obj)
 {
 	int status = 0;
 	int free_extents;
@@ -4513,7 +4509,7 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 		flags = OCFS2_EXT_UNWRITTEN;
 
 	free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type,
-					      private);
+					      obj);
 	if (free_extents < 0) {
 		status = free_extents;
 		mlog_errno(status);
@@ -4575,7 +4571,7 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 							 inode, root_bh,
 							 *logical_offset,
 							 block, num_bits, flags,
-							 meta_ac, private);
+							 meta_ac, obj);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -4857,7 +4853,7 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 			      struct ocfs2_alloc_context *meta_ac,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
 			      enum ocfs2_extent_tree_type et_type,
-			      void *private)
+			      void *obj)
 {
 	int ret, index;
 	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
@@ -4869,7 +4865,7 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
 	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
 
-	ocfs2_get_extent_tree(&et, inode, root_bh, et_type, private);
+	ocfs2_get_extent_tree(&et, inode, root_bh, et_type, obj);
 
 	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
 		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
@@ -5161,7 +5157,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_cached_dealloc_ctxt *dealloc,
 			enum ocfs2_extent_tree_type et_type,
-			void *private)
+			void *obj)
 {
 	int ret, index;
 	u32 rec_range, trunc_range;
@@ -5170,7 +5166,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 	struct ocfs2_path *path = NULL;
 	struct ocfs2_extent_tree et;
 
-	ocfs2_get_extent_tree(&et, inode, root_bh, et_type, private);
+	ocfs2_get_extent_tree(&et, inode, root_bh, et_type, obj);
 
 	ocfs2_extent_map_trunc(inode, 0);
 
-- 
cgit v1.2.3


From 0ce1010f1a4319e02574b856d50dfdc0ed855f40 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 20 Aug 2008 17:19:50 -0700
Subject: ocfs2: Provide the eo_fill_root_el() method to
 ocfs2_extent_tree_operations.

The root_el of an ocfs2_extent_tree needs to be calculated from
et->et_object.  Make it an operation on et->et_ops.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4cefcb6a47a..fe2ddbb81f7 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -72,6 +72,10 @@ struct ocfs2_extent_tree_operations {
 				   struct ocfs2_extent_tree *et,
 				   u32 new_clusters);
 	int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
+
+	/* These are internal to ocfs2_extent_tree and don't have
+	 * accessor functions */
+	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
 };
 
 struct ocfs2_extent_tree {
@@ -83,6 +87,13 @@ struct ocfs2_extent_tree {
 	unsigned int				et_max_leaf_clusters;
 };
 
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	et->et_root_el = &di->id2.i_list;
+}
+
 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					 u64 blkno)
 {
@@ -136,8 +147,16 @@ static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
 	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
 	.eo_update_clusters	= ocfs2_dinode_update_clusters,
 	.eo_sanity_check	= ocfs2_dinode_sanity_check,
+	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
 };
 
+static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_value_root *xv = et->et_object;
+
+	et->et_root_el = &xv->xr_list;
+}
+
 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					      u64 blkno)
 {
@@ -176,8 +195,16 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = {
 	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
 	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
 	.eo_sanity_check	= ocfs2_xattr_value_sanity_check,
+	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
 };
 
+static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+
+	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
+}
+
 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					     u64 blkno)
 {
@@ -215,6 +242,7 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
 	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
 	.eo_sanity_check	= ocfs2_xattr_tree_sanity_check,
+	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
 };
 
 static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
@@ -232,22 +260,16 @@ static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
 	et->et_object = obj;
 
 	if (et_type == OCFS2_DINODE_EXTENT) {
-		et->et_root_el =
-			&((struct ocfs2_dinode *)obj)->id2.i_list;
 		et->et_ops = &ocfs2_dinode_et_ops;
 	} else if (et_type == OCFS2_XATTR_VALUE_EXTENT) {
-		struct ocfs2_xattr_value_root *xv =
-			(struct ocfs2_xattr_value_root *)obj;
-		et->et_root_el = &xv->xr_list;
 		et->et_ops = &ocfs2_xattr_et_ops;
 	} else if (et_type == OCFS2_XATTR_TREE_EXTENT) {
-		struct ocfs2_xattr_block *xb =
-			(struct ocfs2_xattr_block *)obj;
-		et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
 		et->et_ops = &ocfs2_xattr_tree_et_ops;
 		et->et_max_leaf_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
 						OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
 	}
+
+	et->et_ops->eo_fill_root_el(et);
 }
 
 static void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et)
-- 
cgit v1.2.3


From 1c25d93a4a27c90c3ae33f9e724f7b67783d68d1 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 20 Aug 2008 17:09:42 -0700
Subject: ocfs2: Use struct ocfs2_extent_tree in ocfs2_num_free_extents().

ocfs2_num_free_extents() re-implements the logic of
ocfs2_get_extent_tree().  Now that ocfs2_get_extent_tree() does not
allocate, let's use it in ocfs2_num_free_extents() to simplify the code.

The inode validation code in ocfs2_num_free_extents() is not needed.
All callers are passing in pre-validated inodes.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 30 +++++-------------------------
 1 file changed, 5 insertions(+), 25 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fe2ddbb81f7..d1aa7249deb 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -618,34 +618,13 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 	struct ocfs2_extent_block *eb;
 	struct buffer_head *eb_bh = NULL;
 	u64 last_eb_blk = 0;
+	struct ocfs2_extent_tree et;
 
 	mlog_entry_void();
 
-	if (type == OCFS2_DINODE_EXTENT) {
-		struct ocfs2_dinode *fe =
-				(struct ocfs2_dinode *)root_bh->b_data;
-		if (!OCFS2_IS_VALID_DINODE(fe)) {
-			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-			retval = -EIO;
-			goto bail;
-		}
-
-		if (fe->i_last_eb_blk)
-			last_eb_blk = le64_to_cpu(fe->i_last_eb_blk);
-		el = &fe->id2.i_list;
-	} else if (type == OCFS2_XATTR_VALUE_EXTENT) {
-		struct ocfs2_xattr_value_root *xv =
-			(struct ocfs2_xattr_value_root *) obj;
-
-		last_eb_blk = le64_to_cpu(xv->xr_last_eb_blk);
-		el = &xv->xr_list;
-	} else if (type == OCFS2_XATTR_TREE_EXTENT) {
-		struct ocfs2_xattr_block *xb =
-			(struct ocfs2_xattr_block *)root_bh->b_data;
-
-		last_eb_blk = le64_to_cpu(xb->xb_attrs.xb_root.xt_last_eb_blk);
-		el = &xb->xb_attrs.xb_root.xt_list;
-	}
+	ocfs2_get_extent_tree(&et, inode, root_bh, type, obj);
+	el = et.et_root_el;
+	last_eb_blk = ocfs2_et_get_last_eb_blk(&et);
 
 	if (last_eb_blk) {
 		retval = ocfs2_read_block(osb, last_eb_blk,
@@ -665,6 +644,7 @@ bail:
 	if (eb_bh)
 		brelse(eb_bh);
 
+	ocfs2_put_extent_tree(&et);
 	mlog_exit(retval);
 	return retval;
 }
-- 
cgit v1.2.3


From 943cced39ee45ed2db25efd25eee8ba49cf2dfc4 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 20 Aug 2008 17:31:10 -0700
Subject: ocfs2: Determine an extent tree's max_leaf_clusters in an et_op.

Provide an optional extent_tree_operation to specify the
max_leaf_clusters of an ocfs2_extent_tree.  If not provided, the value
is 0 (unlimited).

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d1aa7249deb..64f1af4e999 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -76,6 +76,8 @@ struct ocfs2_extent_tree_operations {
 	/* These are internal to ocfs2_extent_tree and don't have
 	 * accessor functions */
 	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
+	void (*eo_fill_max_leaf_clusters)(struct inode *inode,
+					  struct ocfs2_extent_tree *et);
 };
 
 struct ocfs2_extent_tree {
@@ -205,6 +207,14 @@ static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
 	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
 }
 
+static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
+						    struct ocfs2_extent_tree *et)
+{
+	et->et_max_leaf_clusters =
+		ocfs2_clusters_for_bytes(inode->i_sb,
+					 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
+}
+
 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					     u64 blkno)
 {
@@ -243,6 +253,7 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
 	.eo_sanity_check	= ocfs2_xattr_tree_sanity_check,
 	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
+	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
 };
 
 static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
@@ -254,7 +265,6 @@ static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
 	et->et_type = et_type;
 	get_bh(bh);
 	et->et_root_bh = bh;
-	et->et_max_leaf_clusters = 0;
 	if (!obj)
 		obj = (void *)bh->b_data;
 	et->et_object = obj;
@@ -265,11 +275,13 @@ static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
 		et->et_ops = &ocfs2_xattr_et_ops;
 	} else if (et_type == OCFS2_XATTR_TREE_EXTENT) {
 		et->et_ops = &ocfs2_xattr_tree_et_ops;
-		et->et_max_leaf_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
-						OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
 	}
 
 	et->et_ops->eo_fill_root_el(et);
+	if (!et->et_ops->eo_fill_max_leaf_clusters)
+		et->et_max_leaf_clusters = 0;
+	else
+		et->et_ops->eo_fill_max_leaf_clusters(inode, et);
 }
 
 static void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et)
-- 
cgit v1.2.3


From 1a09f556e5415a29cdddaf9a6ebf474194161cf3 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 20 Aug 2008 17:44:24 -0700
Subject: ocfs2: Create specific get_extent_tree functions.

A caller knows what kind of extent tree they have.  There's no reason
they have to call ocfs2_get_extent_tree() with a NULL when they could
just as easily call a specific function to their type of extent tree.

Introduce ocfs2_get_dinode_extent_tree(),
ocfs2_get_xattr_tree_extent_tree(), and
ocfs2_get_xattr_value_extent_tree().  They only take the necessary
arguments, calling into the underlying __ocfs2_get_extent_tree() to do
the real work.

__ocfs2_get_extent_tree() is the old ocfs2_get_extent_tree(), but
without needing any switch-by-type logic.

ocfs2_get_extent_tree() is now a wrapper around the specific calls.  It
exists because a couple of alloc.c functions take an et_type.  The
wrapper will go away later.

Another benefit is that ocfs2_get_xattr_value_extent_tree() can take a
struct ocfs2_xattr_value_root* instead of void*.  This gives us
typechecking where we didn't have it before.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 76 ++++++++++++++++++++++++++++++++++++++++----------------
 fs/ocfs2/alloc.h |  2 +-
 2 files changed, 56 insertions(+), 22 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 64f1af4e999..7b08180a4c6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -192,7 +192,7 @@ static int ocfs2_xattr_value_sanity_check(struct inode *inode,
 	return 0;
 }
 
-static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = {
+static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
 	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
 	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
 	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
@@ -256,27 +256,21 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
 };
 
-static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
-				  struct inode *inode,
-				  struct buffer_head *bh,
-				  enum ocfs2_extent_tree_type et_type,
-				  void *obj)
+static void __ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
+				    struct inode *inode,
+				    struct buffer_head *bh,
+				    void *obj,
+				    enum ocfs2_extent_tree_type et_type,
+				    struct ocfs2_extent_tree_operations *ops)
 {
 	et->et_type = et_type;
+	et->et_ops = ops;
 	get_bh(bh);
 	et->et_root_bh = bh;
 	if (!obj)
 		obj = (void *)bh->b_data;
 	et->et_object = obj;
 
-	if (et_type == OCFS2_DINODE_EXTENT) {
-		et->et_ops = &ocfs2_dinode_et_ops;
-	} else if (et_type == OCFS2_XATTR_VALUE_EXTENT) {
-		et->et_ops = &ocfs2_xattr_et_ops;
-	} else if (et_type == OCFS2_XATTR_TREE_EXTENT) {
-		et->et_ops = &ocfs2_xattr_tree_et_ops;
-	}
-
 	et->et_ops->eo_fill_root_el(et);
 	if (!et->et_ops->eo_fill_max_leaf_clusters)
 		et->et_max_leaf_clusters = 0;
@@ -284,6 +278,49 @@ static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
 		et->et_ops->eo_fill_max_leaf_clusters(inode, et);
 }
 
+static void ocfs2_get_dinode_extent_tree(struct ocfs2_extent_tree *et,
+					 struct inode *inode,
+					 struct buffer_head *bh)
+{
+	__ocfs2_get_extent_tree(et, inode, bh, NULL, OCFS2_DINODE_EXTENT,
+				&ocfs2_dinode_et_ops);
+}
+
+static void ocfs2_get_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+					     struct inode *inode,
+					     struct buffer_head *bh)
+{
+	__ocfs2_get_extent_tree(et, inode, bh, NULL,
+				OCFS2_XATTR_TREE_EXTENT,
+				&ocfs2_xattr_tree_et_ops);
+}
+
+static void ocfs2_get_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+					     struct inode *inode,
+					     struct buffer_head *bh,
+					     struct ocfs2_xattr_value_root *xv)
+{
+	__ocfs2_get_extent_tree(et, inode, bh, xv,
+				OCFS2_XATTR_VALUE_EXTENT,
+				&ocfs2_xattr_value_et_ops);
+}
+
+static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
+				  struct inode *inode,
+				  struct buffer_head *bh,
+				  enum ocfs2_extent_tree_type et_type,
+				  void *obj)
+{
+	if (et_type == OCFS2_DINODE_EXTENT)
+		ocfs2_get_dinode_extent_tree(et, inode, bh);
+	else if (et_type == OCFS2_XATTR_VALUE_EXTENT)
+		ocfs2_get_xattr_tree_extent_tree(et, inode, bh);
+	else if (et_type == OCFS2_XATTR_TREE_EXTENT)
+		ocfs2_get_xattr_value_extent_tree(et, inode, bh, obj);
+	else
+		BUG();
+}
+
 static void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et)
 {
 	brelse(et->et_root_bh);
@@ -4432,8 +4469,7 @@ int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
 	int status;
 	struct ocfs2_extent_tree et;
 
-	ocfs2_get_extent_tree(&et, inode, root_bh, OCFS2_DINODE_EXTENT,
-			      NULL);
+	ocfs2_get_dinode_extent_tree(&et, inode, root_bh);
 	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
 				     cpos, start_blk, new_clusters,
 				     flags, meta_ac, &et);
@@ -4451,13 +4487,12 @@ int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
 				    u32 new_clusters,
 				    u8 flags,
 				    struct ocfs2_alloc_context *meta_ac,
-				    void *obj)
+				    struct ocfs2_xattr_value_root *xv)
 {
 	int status;
 	struct ocfs2_extent_tree et;
 
-	ocfs2_get_extent_tree(&et, inode, root_bh,
-			      OCFS2_XATTR_VALUE_EXTENT, obj);
+	ocfs2_get_xattr_value_extent_tree(&et, inode, root_bh, xv);
 	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
 				     cpos, start_blk, new_clusters,
 				     flags, meta_ac, &et);
@@ -4479,8 +4514,7 @@ int ocfs2_xattr_tree_insert_extent(struct ocfs2_super *osb,
 	int status;
 	struct ocfs2_extent_tree et;
 
-	ocfs2_get_extent_tree(&et, inode, root_bh, OCFS2_XATTR_TREE_EXTENT,
-			      NULL);
+	ocfs2_get_xattr_tree_extent_tree(&et, inode, root_bh);
 	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
 				     cpos, start_blk, new_clusters,
 				     flags, meta_ac, &et);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 23c695ddaa5..5cc9a83cf1a 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -56,7 +56,7 @@ int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
 				    u32 new_clusters,
 				    u8 flags,
 				    struct ocfs2_alloc_context *meta_ac,
-				    void *private);
+				    struct ocfs2_xattr_value_root *xv);
 int ocfs2_xattr_tree_insert_extent(struct ocfs2_super *osb,
 				   handle_t *handle,
 				   struct inode *inode,
-- 
cgit v1.2.3


From 1e61ee79e2a96f62c007486677319814ce621c3c Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 20 Aug 2008 18:32:45 -0700
Subject: ocfs2: Add an insertion check to ocfs2_extent_tree_operations.

A couple of places check an extent_tree for a valid inode.  We move
those checks out into a new eo_insert_check() operation.  It can be
called from ocfs2_insert_extent() and elsewhere.

We also have the wrapper calls ocfs2_et_insert_check() and
ocfs2_et_sanity_check() ignore NULL ops.  That way we don't have to
provide useless operations for xattr types.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 69 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 44 insertions(+), 25 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 7b08180a4c6..ce54730e18f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -71,6 +71,9 @@ struct ocfs2_extent_tree_operations {
 	void (*eo_update_clusters)(struct inode *inode,
 				   struct ocfs2_extent_tree *et,
 				   u32 new_clusters);
+	int (*eo_insert_check)(struct inode *inode,
+			       struct ocfs2_extent_tree *et,
+			       struct ocfs2_extent_rec *rec);
 	int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
 
 	/* These are internal to ocfs2_extent_tree and don't have
@@ -125,6 +128,25 @@ static void ocfs2_dinode_update_clusters(struct inode *inode,
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 }
 
+static int ocfs2_dinode_insert_check(struct inode *inode,
+				     struct ocfs2_extent_tree *et,
+				     struct ocfs2_extent_rec *rec)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
+			(OCFS2_I(inode)->ip_clusters != rec->e_cpos),
+			"Device %s, asking for sparse allocation: inode %llu, "
+			"cpos %u, clusters %u\n",
+			osb->dev_str,
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			rec->e_cpos,
+			OCFS2_I(inode)->ip_clusters);
+
+	return 0;
+}
+
 static int ocfs2_dinode_sanity_check(struct inode *inode,
 				     struct ocfs2_extent_tree *et)
 {
@@ -148,6 +170,7 @@ static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
 	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
 	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
 	.eo_update_clusters	= ocfs2_dinode_update_clusters,
+	.eo_insert_check	= ocfs2_dinode_insert_check,
 	.eo_sanity_check	= ocfs2_dinode_sanity_check,
 	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
 };
@@ -186,17 +209,10 @@ static void ocfs2_xattr_value_update_clusters(struct inode *inode,
 	le32_add_cpu(&xv->xr_clusters, clusters);
 }
 
-static int ocfs2_xattr_value_sanity_check(struct inode *inode,
-					  struct ocfs2_extent_tree *et)
-{
-	return 0;
-}
-
 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
 	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
 	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
 	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
-	.eo_sanity_check	= ocfs2_xattr_value_sanity_check,
 	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
 };
 
@@ -241,17 +257,10 @@ static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
 }
 
-static int ocfs2_xattr_tree_sanity_check(struct inode *inode,
-					 struct ocfs2_extent_tree *et)
-{
-	return 0;
-}
-
 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
 	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
 	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
-	.eo_sanity_check	= ocfs2_xattr_tree_sanity_check,
 	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
 	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
 };
@@ -344,10 +353,25 @@ static inline void ocfs2_et_update_clusters(struct inode *inode,
 	et->et_ops->eo_update_clusters(inode, et, clusters);
 }
 
+static inline int ocfs2_et_insert_check(struct inode *inode,
+					struct ocfs2_extent_tree *et,
+					struct ocfs2_extent_rec *rec)
+{
+	int ret = 0;
+
+	if (et->et_ops->eo_insert_check)
+		ret = et->et_ops->eo_insert_check(inode, et, rec);
+	return ret;
+}
+
 static inline int ocfs2_et_sanity_check(struct inode *inode,
 					struct ocfs2_extent_tree *et)
 {
-	return et->et_ops->eo_sanity_check(inode, et);
+	int ret = 0;
+
+	if (et->et_ops->eo_sanity_check)
+		ret = et->et_ops->eo_sanity_check(inode, et);
+	return ret;
 }
 
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
@@ -4399,24 +4423,19 @@ static int ocfs2_insert_extent(struct ocfs2_super *osb,
 	struct ocfs2_insert_type insert = {0, };
 	struct ocfs2_extent_rec rec;
 
-	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
-
 	mlog(0, "add %u clusters at position %u to inode %llu\n",
 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
-			(OCFS2_I(inode)->ip_clusters != cpos),
-			"Device %s, asking for sparse allocation: inode %llu, "
-			"cpos %u, clusters %u\n",
-			osb->dev_str,
-			(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
-			OCFS2_I(inode)->ip_clusters);
-
 	memset(&rec, 0, sizeof(rec));
 	rec.e_cpos = cpu_to_le32(cpos);
 	rec.e_blkno = cpu_to_le64(start_blk);
 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
 	rec.e_flags = flags;
+	status = ocfs2_et_insert_check(inode, et, &rec);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
 
 	status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
 					  &free_records, &insert);
-- 
cgit v1.2.3


From f99b9b7ccf6a691f653cec45f36bfdd1e94769c7 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 20 Aug 2008 19:36:33 -0700
Subject: ocfs2: Make ocfs2_extent_tree the first-class representation of a
 tree.

We now have three different kinds of extent trees in ocfs2: inode data
(dinode), extended attributes (xattr_tree), and extended attribute
values (xattr_value).  There is a nice abstraction for them,
ocfs2_extent_tree, but it is hidden in alloc.c.  All the calling
functions have to pick amongst a varied API and pass in type bits and
often extraneous pointers.

A better way is to make ocfs2_extent_tree a first-class object.
Every caller converts its object to an ocfs2_extent_tree via the
ocfs2_get_*_extent_tree() calls, then uses the ocfs2_extent_tree for all
tree calls to alloc.c.

This simplifies a lot of callers and makes them more readable.  It also
provides an easy way to add additional extent tree types, as they only
need to be defined in alloc.c with an ocfs2_get_<new>_extent_tree()
function.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c    | 300 ++++++++++++++++------------------------------------
 fs/ocfs2/alloc.h    | 111 ++++++++++---------
 fs/ocfs2/aops.c     |  16 +--
 fs/ocfs2/dir.c      |  20 ++--
 fs/ocfs2/file.c     |  36 ++++---
 fs/ocfs2/suballoc.c |  12 +--
 fs/ocfs2/suballoc.h |   6 +-
 fs/ocfs2/xattr.c    |  71 +++++++------
 8 files changed, 240 insertions(+), 332 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ce54730e18f..786a8298262 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,20 +49,6 @@
 
 #include "buffer_head_io.h"
 
-/*
- * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
- * the b-tree operations in ocfs2. Now all the b-tree operations are not
- * limited to ocfs2_dinode only. Any data which need to allocate clusters
- * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
- * and operation.
- *
- * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
- * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
- * functions.
- * ocfs2_extent_tree_operations abstract the normal operations we do for
- * the root of extent b-tree.
- */
-struct ocfs2_extent_tree;
 
 struct ocfs2_extent_tree_operations {
 	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
@@ -83,28 +69,38 @@ struct ocfs2_extent_tree_operations {
 					  struct ocfs2_extent_tree *et);
 };
 
-struct ocfs2_extent_tree {
-	enum ocfs2_extent_tree_type		et_type;
-	struct ocfs2_extent_tree_operations	*et_ops;
-	struct buffer_head			*et_root_bh;
-	struct ocfs2_extent_list		*et_root_el;
-	void					*et_object;
-	unsigned int				et_max_leaf_clusters;
-};
 
-static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
-{
-	struct ocfs2_dinode *di = et->et_object;
-
-	et->et_root_el = &di->id2.i_list;
-}
+/*
+ * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
+ * in the methods.
+ */
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					 u64 blkno);
+static void ocfs2_dinode_update_clusters(struct inode *inode,
+					 struct ocfs2_extent_tree *et,
+					 u32 clusters);
+static int ocfs2_dinode_insert_check(struct inode *inode,
+				     struct ocfs2_extent_tree *et,
+				     struct ocfs2_extent_rec *rec);
+static int ocfs2_dinode_sanity_check(struct inode *inode,
+				     struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
+static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_dinode_update_clusters,
+	.eo_insert_check	= ocfs2_dinode_insert_check,
+	.eo_sanity_check	= ocfs2_dinode_sanity_check,
+	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
+};
 
 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					 u64 blkno)
 {
 	struct ocfs2_dinode *di = et->et_object;
 
-	BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
+	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
 	di->i_last_eb_blk = cpu_to_le64(blkno);
 }
 
@@ -112,7 +108,7 @@ static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
 	struct ocfs2_dinode *di = et->et_object;
 
-	BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
+	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
 	return le64_to_cpu(di->i_last_eb_blk);
 }
 
@@ -153,7 +149,7 @@ static int ocfs2_dinode_sanity_check(struct inode *inode,
 	int ret = 0;
 	struct ocfs2_dinode *di;
 
-	BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
+	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
 
 	di = et->et_object;
 	if (!OCFS2_IS_VALID_DINODE(di)) {
@@ -166,14 +162,13 @@ static int ocfs2_dinode_sanity_check(struct inode *inode,
 	return ret;
 }
 
-static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
-	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
-	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
-	.eo_update_clusters	= ocfs2_dinode_update_clusters,
-	.eo_insert_check	= ocfs2_dinode_insert_check,
-	.eo_sanity_check	= ocfs2_dinode_sanity_check,
-	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
-};
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	et->et_root_el = &di->id2.i_list;
+}
+
 
 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
 {
@@ -269,10 +264,8 @@ static void __ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
 				    struct inode *inode,
 				    struct buffer_head *bh,
 				    void *obj,
-				    enum ocfs2_extent_tree_type et_type,
 				    struct ocfs2_extent_tree_operations *ops)
 {
-	et->et_type = et_type;
 	et->et_ops = ops;
 	get_bh(bh);
 	et->et_root_bh = bh;
@@ -287,50 +280,31 @@ static void __ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
 		et->et_ops->eo_fill_max_leaf_clusters(inode, et);
 }
 
-static void ocfs2_get_dinode_extent_tree(struct ocfs2_extent_tree *et,
-					 struct inode *inode,
-					 struct buffer_head *bh)
+void ocfs2_get_dinode_extent_tree(struct ocfs2_extent_tree *et,
+				  struct inode *inode,
+				  struct buffer_head *bh)
 {
-	__ocfs2_get_extent_tree(et, inode, bh, NULL, OCFS2_DINODE_EXTENT,
-				&ocfs2_dinode_et_ops);
+	__ocfs2_get_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
 }
 
-static void ocfs2_get_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
-					     struct inode *inode,
-					     struct buffer_head *bh)
+void ocfs2_get_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+				      struct inode *inode,
+				      struct buffer_head *bh)
 {
 	__ocfs2_get_extent_tree(et, inode, bh, NULL,
-				OCFS2_XATTR_TREE_EXTENT,
 				&ocfs2_xattr_tree_et_ops);
 }
 
-static void ocfs2_get_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
-					     struct inode *inode,
-					     struct buffer_head *bh,
-					     struct ocfs2_xattr_value_root *xv)
+void ocfs2_get_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+				       struct inode *inode,
+				       struct buffer_head *bh,
+				       struct ocfs2_xattr_value_root *xv)
 {
 	__ocfs2_get_extent_tree(et, inode, bh, xv,
-				OCFS2_XATTR_VALUE_EXTENT,
 				&ocfs2_xattr_value_et_ops);
 }
 
-static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
-				  struct inode *inode,
-				  struct buffer_head *bh,
-				  enum ocfs2_extent_tree_type et_type,
-				  void *obj)
-{
-	if (et_type == OCFS2_DINODE_EXTENT)
-		ocfs2_get_dinode_extent_tree(et, inode, bh);
-	else if (et_type == OCFS2_XATTR_VALUE_EXTENT)
-		ocfs2_get_xattr_tree_extent_tree(et, inode, bh);
-	else if (et_type == OCFS2_XATTR_TREE_EXTENT)
-		ocfs2_get_xattr_value_extent_tree(et, inode, bh, obj);
-	else
-		BUG();
-}
-
-static void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et)
+void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et)
 {
 	brelse(et->et_root_bh);
 }
@@ -682,22 +656,18 @@ struct ocfs2_merge_ctxt {
  */
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct buffer_head *root_bh,
-			   enum ocfs2_extent_tree_type type,
-			   void *obj)
+			   struct ocfs2_extent_tree *et)
 {
 	int retval;
 	struct ocfs2_extent_list *el = NULL;
 	struct ocfs2_extent_block *eb;
 	struct buffer_head *eb_bh = NULL;
 	u64 last_eb_blk = 0;
-	struct ocfs2_extent_tree et;
 
 	mlog_entry_void();
 
-	ocfs2_get_extent_tree(&et, inode, root_bh, type, obj);
-	el = et.et_root_el;
-	last_eb_blk = ocfs2_et_get_last_eb_blk(&et);
+	el = et->et_root_el;
+	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
 
 	if (last_eb_blk) {
 		retval = ocfs2_read_block(osb, last_eb_blk,
@@ -717,7 +687,6 @@ bail:
 	if (eb_bh)
 		brelse(eb_bh);
 
-	ocfs2_put_extent_tree(&et);
 	mlog_exit(retval);
 	return retval;
 }
@@ -4406,16 +4375,15 @@ out:
  *
  * The caller needs to update fe->i_clusters
  */
-static int ocfs2_insert_extent(struct ocfs2_super *osb,
-			       handle_t *handle,
-			       struct inode *inode,
-			       struct buffer_head *root_bh,
-			       u32 cpos,
-			       u64 start_blk,
-			       u32 new_clusters,
-			       u8 flags,
-			       struct ocfs2_alloc_context *meta_ac,
-			       struct ocfs2_extent_tree *et)
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			handle_t *handle,
+			struct inode *inode,
+			struct ocfs2_extent_tree *et,
+			u32 cpos,
+			u64 start_blk,
+			u32 new_clusters,
+			u8 flags,
+			struct ocfs2_alloc_context *meta_ac)
 {
 	int status;
 	int uninitialized_var(free_records);
@@ -4464,7 +4432,7 @@ static int ocfs2_insert_extent(struct ocfs2_super *osb,
 	status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
 	if (status < 0)
 		mlog_errno(status);
-	else if (et->et_type == OCFS2_DINODE_EXTENT)
+	else if (et->et_ops == &ocfs2_dinode_et_ops)
 		ocfs2_extent_map_insert_rec(inode, &rec);
 
 bail:
@@ -4475,77 +4443,10 @@ bail:
 	return status;
 }
 
-int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
-			       handle_t *handle,
-			       struct inode *inode,
-			       struct buffer_head *root_bh,
-			       u32 cpos,
-			       u64 start_blk,
-			       u32 new_clusters,
-			       u8 flags,
-			       struct ocfs2_alloc_context *meta_ac)
-{
-	int status;
-	struct ocfs2_extent_tree et;
-
-	ocfs2_get_dinode_extent_tree(&et, inode, root_bh);
-	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
-				     cpos, start_blk, new_clusters,
-				     flags, meta_ac, &et);
-	ocfs2_put_extent_tree(&et);
-
-	return status;
-}
-
-int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
-				    handle_t *handle,
-				    struct inode *inode,
-				    struct buffer_head *root_bh,
-				    u32 cpos,
-				    u64 start_blk,
-				    u32 new_clusters,
-				    u8 flags,
-				    struct ocfs2_alloc_context *meta_ac,
-				    struct ocfs2_xattr_value_root *xv)
-{
-	int status;
-	struct ocfs2_extent_tree et;
-
-	ocfs2_get_xattr_value_extent_tree(&et, inode, root_bh, xv);
-	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
-				     cpos, start_blk, new_clusters,
-				     flags, meta_ac, &et);
-	ocfs2_put_extent_tree(&et);
-
-	return status;
-}
-
-int ocfs2_xattr_tree_insert_extent(struct ocfs2_super *osb,
-				   handle_t *handle,
-				   struct inode *inode,
-				   struct buffer_head *root_bh,
-				   u32 cpos,
-				   u64 start_blk,
-				   u32 new_clusters,
-				   u8 flags,
-				   struct ocfs2_alloc_context *meta_ac)
-{
-	int status;
-	struct ocfs2_extent_tree et;
-
-	ocfs2_get_xattr_tree_extent_tree(&et, inode, root_bh);
-	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
-				     cpos, start_blk, new_clusters,
-				     flags, meta_ac, &et);
-	ocfs2_put_extent_tree(&et);
-
-	return status;
-}
-
 /*
  * Allcate and add clusters into the extent b-tree.
  * The new clusters(clusters_to_add) will be inserted at logical_offset.
- * The extent b-tree's root is root_el and it should be in root_bh, and
+ * The extent b-tree's root is specified by et, and
  * it is not limited to the file storage. Any extent tree can use this
  * function if it implements the proper ocfs2_extent_tree.
  */
@@ -4554,14 +4455,11 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 				u32 *logical_offset,
 				u32 clusters_to_add,
 				int mark_unwritten,
-				struct buffer_head *root_bh,
-				struct ocfs2_extent_list *root_el,
+				struct ocfs2_extent_tree *et,
 				handle_t *handle,
 				struct ocfs2_alloc_context *data_ac,
 				struct ocfs2_alloc_context *meta_ac,
-				enum ocfs2_alloc_restarted *reason_ret,
-				enum ocfs2_extent_tree_type type,
-				void *obj)
+				enum ocfs2_alloc_restarted *reason_ret)
 {
 	int status = 0;
 	int free_extents;
@@ -4575,8 +4473,7 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 	if (mark_unwritten)
 		flags = OCFS2_EXT_UNWRITTEN;
 
-	free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type,
-					      obj);
+	free_extents = ocfs2_num_free_extents(osb, inode, et);
 	if (free_extents < 0) {
 		status = free_extents;
 		mlog_errno(status);
@@ -4595,7 +4492,7 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 		goto leave;
 	} else if ((!free_extents)
 		   && (ocfs2_alloc_context_bits_left(meta_ac)
-		       < ocfs2_extend_meta_needed(root_el))) {
+		       < ocfs2_extend_meta_needed(et->et_root_el))) {
 		mlog(0, "filesystem is really fragmented...\n");
 		status = -EAGAIN;
 		reason = RESTART_META;
@@ -4613,7 +4510,7 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 	BUG_ON(num_bits > clusters_to_add);
 
 	/* reserve our write early -- insert_extent may update the inode */
-	status = ocfs2_journal_access(handle, inode, root_bh,
+	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -4623,28 +4520,15 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
-	if (type == OCFS2_DINODE_EXTENT)
-		status = ocfs2_dinode_insert_extent(osb, handle, inode, root_bh,
-						    *logical_offset, block,
-						    num_bits, flags, meta_ac);
-	else if (type == OCFS2_XATTR_TREE_EXTENT)
-		status = ocfs2_xattr_tree_insert_extent(osb, handle,
-							inode, root_bh,
-							*logical_offset,
-							block, num_bits, flags,
-							meta_ac);
-	else
-		status = ocfs2_xattr_value_insert_extent(osb, handle,
-							 inode, root_bh,
-							 *logical_offset,
-							 block, num_bits, flags,
-							 meta_ac, obj);
+	status = ocfs2_insert_extent(osb, handle, inode, et,
+				     *logical_offset, block,
+				     num_bits, flags, meta_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	status = ocfs2_journal_dirty(handle, root_bh);
+	status = ocfs2_journal_dirty(handle, et->et_root_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -4915,25 +4799,21 @@ out:
  *
  * The caller is responsible for passing down meta_ac if we'll need it.
  */
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
+int ocfs2_mark_extent_written(struct inode *inode,
+			      struct ocfs2_extent_tree *et,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
-			      struct ocfs2_cached_dealloc_ctxt *dealloc,
-			      enum ocfs2_extent_tree_type et_type,
-			      void *obj)
+			      struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret, index;
 	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
 	struct ocfs2_extent_rec split_rec;
 	struct ocfs2_path *left_path = NULL;
 	struct ocfs2_extent_list *el;
-	struct ocfs2_extent_tree et;
 
 	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
 	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
 
-	ocfs2_get_extent_tree(&et, inode, root_bh, et_type, obj);
-
 	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
 		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
 			    "that are being written to, but the feature bit "
@@ -4946,11 +4826,14 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 	/*
 	 * XXX: This should be fixed up so that we just re-insert the
 	 * next extent records.
+	 *
+	 * XXX: This is a hack on the extent tree, maybe it should be
+	 * an op?
 	 */
-	if (et_type == OCFS2_DINODE_EXTENT)
+	if (et->et_ops == &ocfs2_dinode_et_ops)
 		ocfs2_extent_map_trunc(inode, 0);
 
-	left_path = ocfs2_new_path(et.et_root_bh, et.et_root_el);
+	left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4981,7 +4864,7 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
 	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
 
-	ret = __ocfs2_mark_extent_written(inode, &et, handle, left_path,
+	ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
 					  index, &split_rec, meta_ac,
 					  dealloc);
 	if (ret)
@@ -4989,7 +4872,6 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 
 out:
 	ocfs2_free_path(left_path);
-	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
@@ -5219,25 +5101,21 @@ out:
 	return ret;
 }
 
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
+int ocfs2_remove_extent(struct inode *inode,
+			struct ocfs2_extent_tree *et,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
-			struct ocfs2_cached_dealloc_ctxt *dealloc,
-			enum ocfs2_extent_tree_type et_type,
-			void *obj)
+			struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret, index;
 	u32 rec_range, trunc_range;
 	struct ocfs2_extent_rec *rec;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_path *path = NULL;
-	struct ocfs2_extent_tree et;
-
-	ocfs2_get_extent_tree(&et, inode, root_bh, et_type, obj);
 
 	ocfs2_extent_map_trunc(inode, 0);
 
-	path = ocfs2_new_path(et.et_root_bh, et.et_root_el);
+	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -5290,13 +5168,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 
 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
-					 cpos, len, &et);
+					 cpos, len, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	} else {
-		ret = ocfs2_split_tree(inode, &et, handle, path, index,
+		ret = ocfs2_split_tree(inode, et, handle, path, index,
 				       trunc_range, meta_ac);
 		if (ret) {
 			mlog_errno(ret);
@@ -5345,7 +5223,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 		}
 
 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
-					 cpos, len, &et);
+					 cpos, len, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -5354,7 +5232,6 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 
 out:
 	ocfs2_free_path(path);
-	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
@@ -6773,6 +6650,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct page **pages = NULL;
 	loff_t end = osb->s_clustersize;
+	struct ocfs2_extent_tree et;
 
 	has_data = i_size_read(inode) ? 1 : 0;
 
@@ -6872,8 +6750,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		 * this proves to be false, we could always re-build
 		 * the in-inode data from our pages.
 		 */
-		ret = ocfs2_dinode_insert_extent(osb, handle, inode, di_bh,
-						 0, block, 1, 0, NULL);
+		ocfs2_get_dinode_extent_tree(&et, inode, di_bh);
+		ret = ocfs2_insert_extent(osb, handle, inode, &et,
+					  0, block, 1, 0, NULL);
+		ocfs2_put_extent_tree(&et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 5cc9a83cf1a..35ad07f9610 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -26,46 +26,66 @@
 #ifndef OCFS2_ALLOC_H
 #define OCFS2_ALLOC_H
 
-enum ocfs2_extent_tree_type {
-	OCFS2_DINODE_EXTENT = 0,
-	OCFS2_XATTR_VALUE_EXTENT,
-	OCFS2_XATTR_TREE_EXTENT,
-};
 
 /*
  * For xattr tree leaf, we limit the leaf byte size to be 64K.
  */
 #define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
 
+/*
+ * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
+ * the b-tree operations in ocfs2. Now all the b-tree operations are not
+ * limited to ocfs2_dinode only. Any data which need to allocate clusters
+ * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
+ * and operation.
+ *
+ * ocfs2_extent_tree becomes the first-class object for extent tree
+ * manipulation.  Callers of the alloc.c code need to fill it via one of
+ * the ocfs2_get_*_extent_tree() operations below.
+ *
+ * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
+ * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
+ * functions.
+ * ocfs2_extent_tree_operations abstract the normal operations we do for
+ * the root of extent b-tree.
+ */
+struct ocfs2_extent_tree_operations;
+struct ocfs2_extent_tree {
+	struct ocfs2_extent_tree_operations	*et_ops;
+	struct buffer_head			*et_root_bh;
+	struct ocfs2_extent_list		*et_root_el;
+	void					*et_object;
+	unsigned int				et_max_leaf_clusters;
+};
+
+/*
+ * ocfs2_*_get_extent_tree() will fill an ocfs2_extent_tree from the
+ * specified object buffer.  The bh is referenced until
+ * ocfs2_put_extent_tree().
+ */
+void ocfs2_get_dinode_extent_tree(struct ocfs2_extent_tree *et,
+				  struct inode *inode,
+				  struct buffer_head *bh);
+void ocfs2_get_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+				      struct inode *inode,
+				      struct buffer_head *bh);
+void ocfs2_get_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+				       struct inode *inode,
+				       struct buffer_head *bh,
+				       struct ocfs2_xattr_value_root *xv);
+void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et);
+
 struct ocfs2_alloc_context;
-int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
-			       handle_t *handle,
-			       struct inode *inode,
-			       struct buffer_head *root_bh,
-			       u32 cpos,
-			       u64 start_blk,
-			       u32 new_clusters,
-			       u8 flags,
-			       struct ocfs2_alloc_context *meta_ac);
-int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
-				    handle_t *handle,
-				    struct inode *inode,
-				    struct buffer_head *root_bh,
-				    u32 cpos,
-				    u64 start_blk,
-				    u32 new_clusters,
-				    u8 flags,
-				    struct ocfs2_alloc_context *meta_ac,
-				    struct ocfs2_xattr_value_root *xv);
-int ocfs2_xattr_tree_insert_extent(struct ocfs2_super *osb,
-				   handle_t *handle,
-				   struct inode *inode,
-				   struct buffer_head *root_bh,
-				   u32 cpos,
-				   u64 start_blk,
-				   u32 new_clusters,
-				   u8 flags,
-				   struct ocfs2_alloc_context *meta_ac);
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			handle_t *handle,
+			struct inode *inode,
+			struct ocfs2_extent_tree *et,
+			u32 cpos,
+			u64 start_blk,
+			u32 new_clusters,
+			u8 flags,
+			struct ocfs2_alloc_context *meta_ac);
+
 enum ocfs2_alloc_restarted {
 	RESTART_NONE = 0,
 	RESTART_TRANS,
@@ -76,32 +96,25 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 				u32 *logical_offset,
 				u32 clusters_to_add,
 				int mark_unwritten,
-				struct buffer_head *root_bh,
-				struct ocfs2_extent_list *root_el,
+				struct ocfs2_extent_tree *et,
 				handle_t *handle,
 				struct ocfs2_alloc_context *data_ac,
 				struct ocfs2_alloc_context *meta_ac,
-				enum ocfs2_alloc_restarted *reason_ret,
-				enum ocfs2_extent_tree_type type,
-				void *private);
+				enum ocfs2_alloc_restarted *reason_ret);
 struct ocfs2_cached_dealloc_ctxt;
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
+int ocfs2_mark_extent_written(struct inode *inode,
+			      struct ocfs2_extent_tree *et,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
-			      struct ocfs2_cached_dealloc_ctxt *dealloc,
-			      enum ocfs2_extent_tree_type et_type,
-			      void *private);
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_extent(struct inode *inode,
+			struct ocfs2_extent_tree *et,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
-			struct ocfs2_cached_dealloc_ctxt *dealloc,
-			enum ocfs2_extent_tree_type et_type,
-			void *private);
+			struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct buffer_head *root_bh,
-			   enum ocfs2_extent_tree_type et_type,
-			   void *private);
+			   struct ocfs2_extent_tree *et);
 
 /*
  * how many new metadata chunks would an allocation need at maximum?
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 530b1ff599c..ed937fa9e4e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1242,6 +1242,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 	int ret, i, new, should_zero = 0;
 	u64 v_blkno, p_blkno;
 	struct inode *inode = mapping->host;
+	struct ocfs2_extent_tree et;
 
 	new = phys == 0 ? 1 : 0;
 	if (new || unwritten)
@@ -1276,10 +1277,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 			goto out;
 		}
 	} else if (unwritten) {
-		ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
+		ocfs2_get_dinode_extent_tree(&et, inode, wc->w_di_bh);
+		ret = ocfs2_mark_extent_written(inode, &et,
 						wc->w_handle, cpos, 1, phys,
-						meta_ac, &wc->w_dealloc,
-						OCFS2_DINODE_EXTENT, NULL);
+						meta_ac, &wc->w_dealloc);
+		ocfs2_put_extent_tree(&et);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -1666,6 +1668,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle;
+	struct ocfs2_extent_tree et;
 
 	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
 	if (ret) {
@@ -1719,10 +1722,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		     (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
 		     clusters_to_alloc, extents_to_split);
 
-		ret = ocfs2_lock_allocators(inode, wc->w_di_bh, &di->id2.i_list,
+		ocfs2_get_dinode_extent_tree(&et, inode, wc->w_di_bh);
+		ret = ocfs2_lock_allocators(inode, &et,
 					    clusters_to_alloc, extents_to_split,
-					    &data_ac, &meta_ac,
-					    OCFS2_DINODE_EXTENT, NULL);
+					    &data_ac, &meta_ac);
+		ocfs2_put_extent_tree(&et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 5426a02c12b..2cdc5539034 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1192,6 +1192,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	struct buffer_head *dirdata_bh = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	handle_t *handle;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_get_dinode_extent_tree(&et, dir, di_bh);
 
 	alloc = ocfs2_clusters_for_bytes(sb, bytes);
 
@@ -1305,8 +1308,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * This should never fail as our extent list is empty and all
 	 * related blocks have been journaled already.
 	 */
-	ret = ocfs2_dinode_insert_extent(osb, handle, dir, di_bh, 0, blkno,
-					 len, 0, NULL);
+	ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
+				  0, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1337,8 +1340,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		}
 		blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
 
-		ret = ocfs2_dinode_insert_extent(osb, handle, dir, di_bh, 1,
-						 blkno, len, 0, NULL);
+		ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
+					  blkno, len, 0, NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_commit;
@@ -1360,6 +1363,7 @@ out:
 
 	brelse(dirdata_bh);
 
+	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
@@ -1437,6 +1441,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_dir_entry * de;
 	struct super_block *sb = osb->sb;
+	struct ocfs2_extent_tree et;
 
 	mlog_entry_void();
 
@@ -1480,10 +1485,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	spin_lock(&OCFS2_I(dir)->ip_lock);
 	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
-		num_free_extents = ocfs2_num_free_extents(osb, dir,
-							  parent_fe_bh,
-							  OCFS2_DINODE_EXTENT,
-							  NULL);
+		ocfs2_get_dinode_extent_tree(&et, dir, parent_fe_bh);
+		num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
+		ocfs2_put_extent_tree(&et);
 		if (num_free_extents < 0) {
 			status = num_free_extents;
 			mlog_errno(status);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f4273c2c209..ca3d38addbb 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -509,14 +509,17 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
 			 struct ocfs2_alloc_context *meta_ac,
 			 enum ocfs2_alloc_restarted *reason_ret)
 {
-	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	struct ocfs2_extent_list *el = &fe->id2.i_list;
+	int ret;
+	struct ocfs2_extent_tree et;
 
-	return ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
+	ocfs2_get_dinode_extent_tree(&et, inode, fe_bh);
+	ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
 					   clusters_to_add, mark_unwritten,
-					   fe_bh, el, handle,
-					   data_ac, meta_ac, reason_ret,
-					   OCFS2_DINODE_EXTENT, NULL);
+					   &et, handle,
+					   data_ac, meta_ac, reason_ret);
+	ocfs2_put_extent_tree(&et);
+
+	return ret;
 }
 
 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
@@ -533,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_extent_tree et;
 
 	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 
@@ -564,9 +568,10 @@ restart_all:
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
 	     clusters_to_add);
-	status = ocfs2_lock_allocators(inode, bh, &fe->id2.i_list,
-				       clusters_to_add, 0, &data_ac,
-				       &meta_ac, OCFS2_DINODE_EXTENT, NULL);
+	ocfs2_get_dinode_extent_tree(&et, inode, bh);
+	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+				       &data_ac, &meta_ac);
+	ocfs2_put_extent_tree(&et);
 	if (status) {
 		mlog_errno(status);
 		goto leave;
@@ -1236,11 +1241,13 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	handle_t *handle;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_get_dinode_extent_tree(&et, inode, di_bh);
 
-	ret = ocfs2_lock_allocators(inode, di_bh, &di->id2.i_list,
-				    0, 1, NULL, &meta_ac,
-				    OCFS2_DINODE_EXTENT, NULL);
+	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
 	if (ret) {
+		ocfs2_put_extent_tree(&et);
 		mlog_errno(ret);
 		return ret;
 	}
@@ -1269,8 +1276,8 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
-				  dealloc, OCFS2_DINODE_EXTENT, NULL);
+	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+				  dealloc);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1297,6 +1304,7 @@ out:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
+	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index f1871ca8381..8d3947e94a2 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1914,12 +1914,11 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
  * File systems which don't support holes call this from
  * ocfs2_extend_allocation().
  */
-int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *root_bh,
-			  struct ocfs2_extent_list *root_el,
+int ocfs2_lock_allocators(struct inode *inode,
+			  struct ocfs2_extent_tree *et,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac,
-			  enum ocfs2_extent_tree_type type, void *private)
+			  struct ocfs2_alloc_context **meta_ac)
 {
 	int ret = 0, num_free_extents;
 	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
@@ -1931,8 +1930,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *root_bh,
 
 	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
 
-	num_free_extents = ocfs2_num_free_extents(osb, inode, root_bh,
-						  type, private);
+	num_free_extents = ocfs2_num_free_extents(osb, inode, et);
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
 		mlog_errno(ret);
@@ -1954,7 +1952,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *root_bh,
 	 */
 	if (!num_free_extents ||
 	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
-		ret = ocfs2_reserve_new_metadata(osb, root_el, meta_ac);
+		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 028fd633b44..dd0963695ed 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -165,10 +165,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 int ocfs2_check_group_descriptor(struct super_block *sb,
 				 struct ocfs2_dinode *di,
 				 struct ocfs2_group_desc *gd);
-int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *root_bh,
-			  struct ocfs2_extent_list *root_el,
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac,
-			  enum ocfs2_extent_tree_type type, void *private);
+			  struct ocfs2_alloc_context **meta_ac);
 #endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 1b349c7367a..9c3d4dc3e2e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -206,22 +206,24 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_extent_list *root_el = &xv->xr_list;
 	u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
+	struct ocfs2_extent_tree et;
 
 	mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
 
+	ocfs2_get_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
+
 restart_all:
 
-	status = ocfs2_lock_allocators(inode, xattr_bh, root_el,
-				       clusters_to_add, 0, &data_ac,
-				       &meta_ac, OCFS2_XATTR_VALUE_EXTENT, xv);
+	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+				       &data_ac, &meta_ac);
 	if (status) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	credits = ocfs2_calc_extend_credits(osb->sb, root_el, clusters_to_add);
+	credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
+					    clusters_to_add);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
@@ -244,14 +246,11 @@ restarted_transaction:
 					     &logical_start,
 					     clusters_to_add,
 					     0,
-					     xattr_bh,
-					     root_el,
+					     &et,
 					     handle,
 					     data_ac,
 					     meta_ac,
-					     &why,
-					     OCFS2_XATTR_VALUE_EXTENT,
-					     xv);
+					     &why);
 	if ((status < 0) && (status != -EAGAIN)) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -276,7 +275,7 @@ restarted_transaction:
 			mlog(0, "restarting transaction.\n");
 			/* TODO: This can be more intelligent. */
 			credits = ocfs2_calc_extend_credits(osb->sb,
-							    root_el,
+							    et.et_root_el,
 							    clusters_to_add);
 			status = ocfs2_extend_trans(handle, credits);
 			if (status < 0) {
@@ -308,6 +307,7 @@ leave:
 		goto restart_all;
 	}
 
+	ocfs2_put_extent_tree(&et);
 	return status;
 }
 
@@ -323,11 +323,13 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 	struct inode *tl_inode = osb->osb_tl_inode;
 	handle_t *handle;
 	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_get_xattr_value_extent_tree(&et, inode, root_bh, xv);
 
-	ret = ocfs2_lock_allocators(inode, root_bh, &xv->xr_list,
-				    0, 1, NULL, &meta_ac,
-				    OCFS2_XATTR_VALUE_EXTENT, xv);
+	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
 	if (ret) {
+		ocfs2_put_extent_tree(&et);
 		mlog_errno(ret);
 		return ret;
 	}
@@ -356,8 +358,8 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = ocfs2_remove_extent(inode, root_bh, cpos, len, handle, meta_ac,
-				  dealloc, OCFS2_XATTR_VALUE_EXTENT, xv);
+	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+				  dealloc);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -383,6 +385,7 @@ out:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
+	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
@@ -3622,26 +3625,24 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_xattr_block *xb =
-			(struct ocfs2_xattr_block *)root_bh->b_data;
-	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
-	struct ocfs2_extent_list *root_el = &xb_root->xt_list;
-	enum ocfs2_extent_tree_type type = OCFS2_XATTR_TREE_EXTENT;
+	struct ocfs2_extent_tree et;
 
 	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
 	     "previous xattr blkno = %llu\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     prev_cpos, prev_blkno);
 
-	ret = ocfs2_lock_allocators(inode, root_bh, root_el,
-				    clusters_to_add, 0, &data_ac,
-				    &meta_ac, type, NULL);
+	ocfs2_get_xattr_tree_extent_tree(&et, inode, root_bh);
+
+	ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+				    &data_ac, &meta_ac);
 	if (ret) {
 		mlog_errno(ret);
 		goto leave;
 	}
 
-	credits = ocfs2_calc_extend_credits(osb->sb, root_el, clusters_to_add);
+	credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
+					    clusters_to_add);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -3705,9 +3706,8 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 
 	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
 	     num_bits, block, v_start);
-	ret = ocfs2_xattr_tree_insert_extent(osb, handle, inode, root_bh,
-					     v_start, block, num_bits,
-					     0, meta_ac);
+	ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
+				  num_bits, 0, meta_ac);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto leave;
@@ -3727,6 +3727,7 @@ leave:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
+	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
@@ -4331,9 +4332,11 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	handle_t *handle;
 	struct ocfs2_xattr_block *xb =
 			(struct ocfs2_xattr_block *)root_bh->b_data;
-	struct ocfs2_extent_list *root_el = &xb->xb_attrs.xb_root.xt_list;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_get_xattr_tree_extent_tree(&et, inode, root_bh);
 
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
@@ -4342,10 +4345,9 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 
 	ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len);
 
-	ret = ocfs2_lock_allocators(inode, root_bh, root_el,
-				    0, 1, NULL, &meta_ac,
-				    OCFS2_XATTR_TREE_EXTENT, NULL);
+	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
 	if (ret) {
+		ocfs2_put_extent_tree(&et);
 		mlog_errno(ret);
 		return ret;
 	}
@@ -4374,8 +4376,8 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = ocfs2_remove_extent(inode, root_bh, cpos, len, handle, meta_ac,
-				  &dealloc, OCFS2_XATTR_TREE_EXTENT, NULL);
+	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+				  &dealloc);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -4405,6 +4407,7 @@ out:
 
 	ocfs2_run_deallocs(osb, &dealloc);
 
+	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 1625f8ac151743e452ec062c2989669c508ffa48 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 21 Aug 2008 17:11:10 -0700
Subject: ocfs2: Comment struct ocfs2_extent_tree_operations.

struct ocfs2_extent_tree_operations provides methods for the different
on-disk btrees in ocfs2.  Describing what those methods do is probably a
good idea.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 786a8298262..06b9bd73d6d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,21 +50,62 @@
 #include "buffer_head_io.h"
 
 
+/*
+ * Operations for a specific extent tree type.
+ *
+ * To implement an on-disk btree (extent tree) type in ocfs2, add
+ * an ocfs2_extent_tree_operations structure and the matching
+ * ocfs2_get_<thingy>_extent_tree() function.  That's pretty much it
+ * for the allocation portion of the extent tree.
+ */
 struct ocfs2_extent_tree_operations {
+	/*
+	 * last_eb_blk is the block number of the right most leaf extent
+	 * block.  Most on-disk structures containing an extent tree store
+	 * this value for fast access.  The ->eo_set_last_eb_blk() and
+	 * ->eo_get_last_eb_blk() operations access this value.  They are
+	 *  both required.
+	 */
 	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
 				   u64 blkno);
 	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
+
+	/*
+	 * The on-disk structure usually keeps track of how many total
+	 * clusters are stored in this extent tree.  This function updates
+	 * that value.  new_clusters is the delta, and must be
+	 * added to the total.  Required.
+	 */
 	void (*eo_update_clusters)(struct inode *inode,
 				   struct ocfs2_extent_tree *et,
 				   u32 new_clusters);
+
+	/*
+	 * If ->eo_insert_check() exists, it is called before rec is
+	 * inserted into the extent tree.  It is optional.
+	 */
 	int (*eo_insert_check)(struct inode *inode,
 			       struct ocfs2_extent_tree *et,
 			       struct ocfs2_extent_rec *rec);
 	int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
 
-	/* These are internal to ocfs2_extent_tree and don't have
-	 * accessor functions */
+	/*
+	 * --------------------------------------------------------------
+	 * The remaining are internal to ocfs2_extent_tree and don't have
+	 * accessor functions
+	 */
+
+	/*
+	 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
+	 * It is required.
+	 */
 	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
+
+	/*
+	 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
+	 * it exists.  If it does not, et->et_max_leaf_clusters is set
+	 * to 0 (unlimited).  Optional.
+	 */
 	void (*eo_fill_max_leaf_clusters)(struct inode *inode,
 					  struct ocfs2_extent_tree *et);
 };
-- 
cgit v1.2.3


From 8d6220d6a74a33552cf877bcea25503d7f6a59e6 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 22 Aug 2008 12:46:09 -0700
Subject: ocfs2: Change ocfs2_get_*_extent_tree() to ocfs2_init_*_extent_tree()

The original get/put_extent_tree() functions held a reference on
et_root_bh.  However, every single caller already has a safe reference,
making the get/put cycle irrelevant.

We change ocfs2_get_*_extent_tree() to ocfs2_init_*_extent_tree().  It
no longer gets a reference on et_root_bh.  ocfs2_put_extent_tree() is
removed.  Callers now have a simpler init+use pattern.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 49 +++++++++++++++++++++----------------------------
 fs/ocfs2/alloc.h | 26 ++++++++++++--------------
 fs/ocfs2/aops.c  |  6 ++----
 fs/ocfs2/dir.c   |  6 ++----
 fs/ocfs2/file.c  | 10 +++-------
 fs/ocfs2/xattr.c | 14 ++++----------
 6 files changed, 44 insertions(+), 67 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 06b9bd73d6d..47201b67dbf 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -55,7 +55,7 @@
  *
  * To implement an on-disk btree (extent tree) type in ocfs2, add
  * an ocfs2_extent_tree_operations structure and the matching
- * ocfs2_get_<thingy>_extent_tree() function.  That's pretty much it
+ * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
  * for the allocation portion of the extent tree.
  */
 struct ocfs2_extent_tree_operations {
@@ -301,14 +301,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
 };
 
-static void __ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
-				    struct inode *inode,
-				    struct buffer_head *bh,
-				    void *obj,
-				    struct ocfs2_extent_tree_operations *ops)
+static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
+				     struct inode *inode,
+				     struct buffer_head *bh,
+				     void *obj,
+				     struct ocfs2_extent_tree_operations *ops)
 {
 	et->et_ops = ops;
-	get_bh(bh);
 	et->et_root_bh = bh;
 	if (!obj)
 		obj = (void *)bh->b_data;
@@ -321,33 +320,28 @@ static void __ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
 		et->et_ops->eo_fill_max_leaf_clusters(inode, et);
 }
 
-void ocfs2_get_dinode_extent_tree(struct ocfs2_extent_tree *et,
-				  struct inode *inode,
-				  struct buffer_head *bh)
-{
-	__ocfs2_get_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
-}
-
-void ocfs2_get_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
-				      struct inode *inode,
-				      struct buffer_head *bh)
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+				   struct inode *inode,
+				   struct buffer_head *bh)
 {
-	__ocfs2_get_extent_tree(et, inode, bh, NULL,
-				&ocfs2_xattr_tree_et_ops);
+	__ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
 }
 
-void ocfs2_get_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
 				       struct inode *inode,
-				       struct buffer_head *bh,
-				       struct ocfs2_xattr_value_root *xv)
+				       struct buffer_head *bh)
 {
-	__ocfs2_get_extent_tree(et, inode, bh, xv,
-				&ocfs2_xattr_value_et_ops);
+	__ocfs2_init_extent_tree(et, inode, bh, NULL,
+				 &ocfs2_xattr_tree_et_ops);
 }
 
-void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et)
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+					struct inode *inode,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_value_root *xv)
 {
-	brelse(et->et_root_bh);
+	__ocfs2_init_extent_tree(et, inode, bh, xv,
+				 &ocfs2_xattr_value_et_ops);
 }
 
 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
@@ -6791,10 +6785,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		 * this proves to be false, we could always re-build
 		 * the in-inode data from our pages.
 		 */
-		ocfs2_get_dinode_extent_tree(&et, inode, di_bh);
+		ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
 		ret = ocfs2_insert_extent(osb, handle, inode, &et,
 					  0, block, 1, 0, NULL);
-		ocfs2_put_extent_tree(&et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 35ad07f9610..70257c84cfb 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -41,7 +41,7 @@
  *
  * ocfs2_extent_tree becomes the first-class object for extent tree
  * manipulation.  Callers of the alloc.c code need to fill it via one of
- * the ocfs2_get_*_extent_tree() operations below.
+ * the ocfs2_init_*_extent_tree() operations below.
  *
  * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
  * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
@@ -59,21 +59,19 @@ struct ocfs2_extent_tree {
 };
 
 /*
- * ocfs2_*_get_extent_tree() will fill an ocfs2_extent_tree from the
- * specified object buffer.  The bh is referenced until
- * ocfs2_put_extent_tree().
+ * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the
+ * specified object buffer.
  */
-void ocfs2_get_dinode_extent_tree(struct ocfs2_extent_tree *et,
-				  struct inode *inode,
-				  struct buffer_head *bh);
-void ocfs2_get_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
-				      struct inode *inode,
-				      struct buffer_head *bh);
-void ocfs2_get_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+				   struct inode *inode,
+				   struct buffer_head *bh);
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
 				       struct inode *inode,
-				       struct buffer_head *bh,
-				       struct ocfs2_xattr_value_root *xv);
-void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et);
+				       struct buffer_head *bh);
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+					struct inode *inode,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_value_root *xv);
 
 struct ocfs2_alloc_context;
 int ocfs2_insert_extent(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ed937fa9e4e..259775eedb8 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1277,11 +1277,10 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 			goto out;
 		}
 	} else if (unwritten) {
-		ocfs2_get_dinode_extent_tree(&et, inode, wc->w_di_bh);
+		ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
 		ret = ocfs2_mark_extent_written(inode, &et,
 						wc->w_handle, cpos, 1, phys,
 						meta_ac, &wc->w_dealloc);
-		ocfs2_put_extent_tree(&et);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -1722,11 +1721,10 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		     (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
 		     clusters_to_alloc, extents_to_split);
 
-		ocfs2_get_dinode_extent_tree(&et, inode, wc->w_di_bh);
+		ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
 		ret = ocfs2_lock_allocators(inode, &et,
 					    clusters_to_alloc, extents_to_split,
 					    &data_ac, &meta_ac);
-		ocfs2_put_extent_tree(&et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 2cdc5539034..167e6c96277 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1194,7 +1194,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	handle_t *handle;
 	struct ocfs2_extent_tree et;
 
-	ocfs2_get_dinode_extent_tree(&et, dir, di_bh);
+	ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
 
 	alloc = ocfs2_clusters_for_bytes(sb, bytes);
 
@@ -1363,7 +1363,6 @@ out:
 
 	brelse(dirdata_bh);
 
-	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
@@ -1485,9 +1484,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	spin_lock(&OCFS2_I(dir)->ip_lock);
 	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
-		ocfs2_get_dinode_extent_tree(&et, dir, parent_fe_bh);
+		ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
 		num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
-		ocfs2_put_extent_tree(&et);
 		if (num_free_extents < 0) {
 			status = num_free_extents;
 			mlog_errno(status);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ca3d38addbb..441c6a94059 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -512,12 +512,11 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
 	int ret;
 	struct ocfs2_extent_tree et;
 
-	ocfs2_get_dinode_extent_tree(&et, inode, fe_bh);
+	ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
 	ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
 					   clusters_to_add, mark_unwritten,
 					   &et, handle,
 					   data_ac, meta_ac, reason_ret);
-	ocfs2_put_extent_tree(&et);
 
 	return ret;
 }
@@ -568,10 +567,9 @@ restart_all:
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
 	     clusters_to_add);
-	ocfs2_get_dinode_extent_tree(&et, inode, bh);
+	ocfs2_init_dinode_extent_tree(&et, inode, bh);
 	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
 				       &data_ac, &meta_ac);
-	ocfs2_put_extent_tree(&et);
 	if (status) {
 		mlog_errno(status);
 		goto leave;
@@ -1243,11 +1241,10 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_extent_tree et;
 
-	ocfs2_get_dinode_extent_tree(&et, inode, di_bh);
+	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
 
 	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
 	if (ret) {
-		ocfs2_put_extent_tree(&et);
 		mlog_errno(ret);
 		return ret;
 	}
@@ -1304,7 +1301,6 @@ out:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
-	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 9c3d4dc3e2e..1a4de3dc2ba 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -211,7 +211,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 
 	mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
 
-	ocfs2_get_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
+	ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
 
 restart_all:
 
@@ -307,7 +307,6 @@ leave:
 		goto restart_all;
 	}
 
-	ocfs2_put_extent_tree(&et);
 	return status;
 }
 
@@ -325,11 +324,10 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_extent_tree et;
 
-	ocfs2_get_xattr_value_extent_tree(&et, inode, root_bh, xv);
+	ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
 
 	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
 	if (ret) {
-		ocfs2_put_extent_tree(&et);
 		mlog_errno(ret);
 		return ret;
 	}
@@ -385,7 +383,6 @@ out:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
-	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
@@ -3632,7 +3629,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     prev_cpos, prev_blkno);
 
-	ocfs2_get_xattr_tree_extent_tree(&et, inode, root_bh);
+	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
 
 	ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
 				    &data_ac, &meta_ac);
@@ -3727,7 +3724,6 @@ leave:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
-	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
@@ -4336,7 +4332,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 	struct ocfs2_extent_tree et;
 
-	ocfs2_get_xattr_tree_extent_tree(&et, inode, root_bh);
+	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
 
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
@@ -4347,7 +4343,6 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 
 	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
 	if (ret) {
-		ocfs2_put_extent_tree(&et);
 		mlog_errno(ret);
 		return ret;
 	}
@@ -4407,7 +4402,6 @@ out:
 
 	ocfs2_run_deallocs(osb, &dealloc);
 
-	ocfs2_put_extent_tree(&et);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 28b8ca0b7f70b1b048d03dc0b9d87f58619e9791 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 1 Sep 2008 08:45:18 +0800
Subject: ocfs2: bug-fix for journal extend in xattr.

In ocfs2_extend_trans, when we can't extend the current
transaction, it will commit the current transaction and restart
a new one. So if the credits we previously allocated haven't been
used (the block isn't dirtied before our extend), we will not
have enough credits for any future operation (this will cause jbd
to complain and bug out). So check for this and re-extend it.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 1a4de3dc2ba..38e3e5e216b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1336,8 +1336,9 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	}
 
 	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
-		/*set extended attribue in external blcok*/
+		/* set extended attribute in external block. */
 		ret = ocfs2_extend_trans(handle,
+					 OCFS2_INODE_UPDATE_CREDITS +
 					 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
 		if (ret) {
 			mlog_errno(ret);
@@ -3701,6 +3702,18 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		}
 	}
 
+	if (handle->h_buffer_credits < credits) {
+		/*
+		 * The journal has been restarted before, and don't
+		 * have enough space for the insertion, so extend it
+		 * here.
+		 */
+		ret = ocfs2_extend_trans(handle, credits);
+		if (ret) {
+			mlog_errno(ret);
+			goto leave;
+		}
+	}
 	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
 	     num_bits, block, v_start);
 	ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
-- 
cgit v1.2.3


From 08413899db89d8d636c2a2d4ba5c356ab587d7ef Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 29 Aug 2008 09:00:19 +0800
Subject: ocfs2: Resolve deadlock in ocfs2_xattr_free_block.

In ocfs2_xattr_free_block, we take a cluster lock on xb_alloc_inode while we
have a transaction open. This will deadlock the downconvert thread, so fix
it.

We can clean up how xattr blocks are removed while here - this patch also
moves the mechanism of releasing an xattr block (including the value, the
xattr tree and the xattr block itself) into this function.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 152 ++++++++++++++++++++++++++++++-------------------------
 1 file changed, 82 insertions(+), 70 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 38e3e5e216b..b2e25a828e3 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1427,51 +1427,6 @@ out:
 
 }
 
-static int ocfs2_xattr_free_block(handle_t *handle,
-				  struct ocfs2_super *osb,
-				  struct ocfs2_xattr_block *xb)
-{
-	struct inode *xb_alloc_inode;
-	struct buffer_head *xb_alloc_bh = NULL;
-	u64 blk = le64_to_cpu(xb->xb_blkno);
-	u16 bit = le16_to_cpu(xb->xb_suballoc_bit);
-	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
-	int ret = 0;
-
-	xb_alloc_inode = ocfs2_get_system_file_inode(osb,
-				EXTENT_ALLOC_SYSTEM_INODE,
-				le16_to_cpu(xb->xb_suballoc_slot));
-	if (!xb_alloc_inode) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto out;
-	}
-	mutex_lock(&xb_alloc_inode->i_mutex);
-
-	ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_mutex;
-	}
-	ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_unlock;
-	}
-	ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh,
-				       bit, bg_blkno, 1);
-	if (ret < 0)
-		mlog_errno(ret);
-out_unlock:
-	ocfs2_inode_unlock(xb_alloc_inode, 1);
-	brelse(xb_alloc_bh);
-out_mutex:
-	mutex_unlock(&xb_alloc_inode->i_mutex);
-	iput(xb_alloc_inode);
-out:
-	return ret;
-}
-
 static int ocfs2_remove_value_outside(struct inode*inode,
 				      struct buffer_head *bh,
 				      struct ocfs2_xattr_header *header)
@@ -1533,6 +1488,84 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
 	return ret;
 }
 
+static int ocfs2_xattr_free_block(struct inode *inode,
+				  u64 block)
+{
+	struct inode *xb_alloc_inode;
+	struct buffer_head *xb_alloc_bh = NULL;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle;
+	int ret = 0;
+	u64 blk, bg_blkno;
+	u16 bit;
+
+	ret = ocfs2_read_block(osb, block, &blk_bh,
+			       OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*Verify the signature of xattr block*/
+	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = ocfs2_xattr_block_remove(inode, blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	blk = le64_to_cpu(xb->xb_blkno);
+	bit = le16_to_cpu(xb->xb_suballoc_bit);
+	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+	xb_alloc_inode = ocfs2_get_system_file_inode(osb,
+				EXTENT_ALLOC_SYSTEM_INODE,
+				le16_to_cpu(xb->xb_suballoc_slot));
+	if (!xb_alloc_inode) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	mutex_lock(&xb_alloc_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh,
+				       bit, bg_blkno, 1);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(osb, handle);
+out_unlock:
+	ocfs2_inode_unlock(xb_alloc_inode, 1);
+	brelse(xb_alloc_bh);
+out_mutex:
+	mutex_unlock(&xb_alloc_inode->i_mutex);
+	iput(xb_alloc_inode);
+out:
+	brelse(blk_bh);
+	return ret;
+}
+
 /*
  * ocfs2_xattr_remove()
  *
@@ -1540,9 +1573,6 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
  */
 int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 {
-	struct ocfs2_xattr_block *xb;
-	struct buffer_head *blk_bh = NULL;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	handle_t *handle;
@@ -1561,22 +1591,10 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 			goto out;
 		}
 	}
-	if (di->i_xattr_loc) {
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_xattr_loc),
-				       &blk_bh, OCFS2_BH_CACHED, inode);
-		if (ret < 0) {
-			mlog_errno(ret);
-			return ret;
-		}
-		/*Verify the signature of xattr block*/
-		if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-			   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-			ret = -EFAULT;
-			goto out;
-		}
 
-		ret = ocfs2_xattr_block_remove(inode, blk_bh);
+	if (di->i_xattr_loc) {
+		ret = ocfs2_xattr_free_block(inode,
+					     le64_to_cpu(di->i_xattr_loc));
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -1597,11 +1615,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 		goto out_commit;
 	}
 
-	if (di->i_xattr_loc) {
-		xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-		ocfs2_xattr_free_block(handle, osb, xb);
-		di->i_xattr_loc = cpu_to_le64(0);
-	}
+	di->i_xattr_loc = 0;
 
 	spin_lock(&oi->ip_lock);
 	oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
@@ -1614,8 +1628,6 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
-	brelse(blk_bh);
-
 	return ret;
 }
 
-- 
cgit v1.2.3


From 1187c968852e3c668f3b9376083851f81f6eee22 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 3 Sep 2008 20:03:39 -0700
Subject: ocfs2: Limit inode allocation to 32bits.

ocfs2 inode numbers are block numbers.  For any filesystem with less
than 2^32 blocks, this is not a problem.  However, when ocfs2 starts
using JBD2, it will be able to support filesystems with more than 2^32
blocks.  This would result in inode numbers higher than 2^32.

The problem is that stat(2) can't handle those numbers on 32bit
machines.  The simple solution is to have ocfs2 allocate all inodes
below that boundary.

The suballoc code is changed to honor an optional block limit.  Only the
inode suballocator sets that limit - all other allocations stay unlimited.

The biggest trick is to grow the inode suballocator beneath that limit.
There's no point in allocating block groups that are above the limit,
then rejecting their elements later on.  We want to prevent the inode
allocator from ever having block groups above the limit.  This involves
a little gyration with the local alloc code.  If the local alloc window
is above the limit, it signals the caller to try the global bitmap but
does not disable the local alloc file (which can be used for other
allocations).

[ Minor cleanup - removed an ML_NOTICE comment. --Mark ]

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/localalloc.c | 55 ++++++++++++++++++++++++++++++++++
 fs/ocfs2/suballoc.c   | 83 +++++++++++++++++++++++++++++++++++++++++----------
 fs/ocfs2/suballoc.h   | 11 ++++---
 3 files changed, 130 insertions(+), 19 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index b889f10d809..02227c39251 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -570,6 +570,46 @@ out:
 	return status;
 }
 
+/* Check to see if the local alloc window is within ac->ac_max_block */
+static int ocfs2_local_alloc_in_range(struct inode *inode,
+				      struct ocfs2_alloc_context *ac,
+				      u32 bits_wanted)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_local_alloc *la;
+	int start;
+	u64 block_off;
+
+	if (!ac->ac_max_block)
+		return 1;
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+	if (start == -1) {
+		mlog_errno(-ENOSPC);
+		return 0;
+	}
+
+	/*
+	 * Converting (bm_off + start + bits_wanted) to blocks gives us
+	 * the blkno just past our actual allocation.  This is perfect
+	 * to compare with ac_max_block.
+	 */
+	block_off = ocfs2_clusters_to_blocks(inode->i_sb,
+					     le32_to_cpu(la->la_bm_off) +
+					     start + bits_wanted);
+	mlog(0, "Checking %llu against %llu\n",
+	     (unsigned long long)block_off,
+	     (unsigned long long)ac->ac_max_block);
+	if (block_off > ac->ac_max_block)
+		return 0;
+
+	return 1;
+}
+
 /*
  * make sure we've got at least bits_wanted contiguous bits in the
  * local alloc. You lose them when you drop i_mutex.
@@ -658,6 +698,21 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 			goto bail;
 	}
 
+	if (ac->ac_max_block)
+		mlog(0, "Calling in_range for max block %llu\n",
+		     (unsigned long long)ac->ac_max_block);
+
+	if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
+					bits_wanted)) {
+		/*
+		 * The window is outside ac->ac_max_block.
+		 * This errno tells the caller to keep localalloc enabled
+		 * but to get the allocation from the main bitmap.
+		 */
+		status = -EFBIG;
+		goto bail;
+	}
+
 	ac->ac_inode = local_alloc_inode;
 	/* We should never use localalloc from another slot */
 	ac->ac_alloc_slot = osb->slot_num;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8d3947e94a2..213bdca16fe 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle,
 				  struct ocfs2_chain_list *cl);
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 				   struct inode *alloc_inode,
-				   struct buffer_head *bh);
+				   struct buffer_head *bh,
+				   u64 max_block);
 
 static int ocfs2_cluster_group_search(struct inode *inode,
 				      struct buffer_head *group_bh,
 				      u32 bits_wanted, u32 min_bits,
+				      u64 max_block,
 				      u16 *bit_off, u16 *bits_found);
 static int ocfs2_block_group_search(struct inode *inode,
 				    struct buffer_head *group_bh,
 				    u32 bits_wanted, u32 min_bits,
+				    u64 max_block,
 				    u16 *bit_off, u16 *bits_found);
 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 				     struct ocfs2_alloc_context *ac,
@@ -110,6 +113,9 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
 						u64 data_blkno,
 						u64 *bg_blkno,
 						u16 *bg_bit_off);
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+					     u32 bits_wanted, u64 max_block,
+					     struct ocfs2_alloc_context **ac);
 
 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 {
@@ -276,7 +282,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
  */
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 				   struct inode *alloc_inode,
-				   struct buffer_head *bh)
+				   struct buffer_head *bh,
+				   u64 max_block)
 {
 	int status, credits;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -294,9 +301,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 	mlog_entry_void();
 
 	cl = &fe->id2.i_chain;
-	status = ocfs2_reserve_clusters(osb,
-					le16_to_cpu(cl->cl_cpg),
-					&ac);
+	status = ocfs2_reserve_clusters_with_limit(osb,
+						   le16_to_cpu(cl->cl_cpg),
+						   max_block, &ac);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -469,7 +476,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
+		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
+						 ac->ac_max_block);
 		if (status < 0) {
 			if (status != -ENOSPC)
 				mlog_errno(status);
@@ -590,6 +598,13 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 
 	(*ac)->ac_group_search = ocfs2_block_group_search;
 
+	/*
+	 * stat(2) can't handle i_ino > 32bits, so we tell the
+	 * lower levels not to allocate us a block group past that
+	 * limit.
+	 */
+	(*ac)->ac_max_block = (u32)~0U;
+
 	/*
 	 * slot is set when we successfully steal inode from other nodes.
 	 * It is reset in 3 places:
@@ -670,9 +685,9 @@ bail:
 /* Callers don't need to care which bitmap (local alloc or main) to
  * use so we figure it out for them, but unfortunately this clutters
  * things a bit. */
-int ocfs2_reserve_clusters(struct ocfs2_super *osb,
-			   u32 bits_wanted,
-			   struct ocfs2_alloc_context **ac)
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+					     u32 bits_wanted, u64 max_block,
+					     struct ocfs2_alloc_context **ac)
 {
 	int status;
 
@@ -686,13 +701,18 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
 	}
 
 	(*ac)->ac_bits_wanted = bits_wanted;
+	(*ac)->ac_max_block = max_block;
 
 	status = -ENOSPC;
 	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
 		status = ocfs2_reserve_local_alloc_bits(osb,
 							bits_wanted,
 							*ac);
-		if ((status < 0) && (status != -ENOSPC)) {
+		if (status == -EFBIG) {
+			/* The local alloc window is outside ac_max_block.
+			 * use the main bitmap. */
+			status = -ENOSPC;
+		} else if ((status < 0) && (status != -ENOSPC)) {
 			mlog_errno(status);
 			goto bail;
 		}
@@ -718,6 +738,13 @@ bail:
 	return status;
 }
 
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac)
+{
+	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
+}
+
 /*
  * More or less lifted from ext3. I'll leave their description below:
  *
@@ -1000,10 +1027,12 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
 static int ocfs2_cluster_group_search(struct inode *inode,
 				      struct buffer_head *group_bh,
 				      u32 bits_wanted, u32 min_bits,
+				      u64 max_block,
 				      u16 *bit_off, u16 *bits_found)
 {
 	int search = -ENOSPC;
 	int ret;
+	u64 blkoff;
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	u16 tmp_off, tmp_found;
@@ -1038,6 +1067,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 		if (ret)
 			return ret;
 
+		if (max_block) {
+			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
+							  gd_cluster_off +
+							  tmp_off + tmp_found);
+			mlog(0, "Checking %llu against %llu\n",
+			     (unsigned long long)blkoff,
+			     (unsigned long long)max_block);
+			if (blkoff > max_block)
+				return -ENOSPC;
+		}
+
 		/* ocfs2_block_group_find_clear_bits() might
 		 * return success, but we still want to return
 		 * -ENOSPC unless it found the minimum number
@@ -1061,19 +1101,31 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 static int ocfs2_block_group_search(struct inode *inode,
 				    struct buffer_head *group_bh,
 				    u32 bits_wanted, u32 min_bits,
+				    u64 max_block,
 				    u16 *bit_off, u16 *bits_found)
 {
 	int ret = -ENOSPC;
+	u64 blkoff;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
 
 	BUG_ON(min_bits != 1);
 	BUG_ON(ocfs2_is_cluster_bitmap(inode));
 
-	if (bg->bg_free_bits_count)
+	if (bg->bg_free_bits_count) {
 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
 							group_bh, bits_wanted,
 							le16_to_cpu(bg->bg_bits),
 							bit_off, bits_found);
+		if (!ret && max_block) {
+			blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
+				*bits_found;
+			mlog(0, "Checking %llu against %llu\n",
+			     (unsigned long long)blkoff,
+			     (unsigned long long)max_block);
+			if (blkoff > max_block)
+				ret = -ENOSPC;
+		}
+	}
 
 	return ret;
 }
@@ -1138,7 +1190,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	}
 
 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
-				  bit_off, &found);
+				  ac->ac_max_block, bit_off, &found);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
 			mlog_errno(ret);
@@ -1210,11 +1262,12 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	status = -ENOSPC;
 	/* for now, the chain search is a bit simplistic. We just use
 	 * the 1st group with any empty bits. */
-	while ((status = ac->ac_group_search(alloc_inode, group_bh, bits_wanted,
-					     min_bits, bit_off, &tmp_bits)) == -ENOSPC) {
+	while ((status = ac->ac_group_search(alloc_inode, group_bh,
+					     bits_wanted, min_bits,
+					     ac->ac_max_block, bit_off,
+					     &tmp_bits)) == -ENOSPC) {
 		if (!bg->bg_next_group)
 			break;
-
 		if (prev_group_bh) {
 			brelse(prev_group_bh);
 			prev_group_bh = NULL;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index dd0963695ed..4df159d8f45 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -28,10 +28,11 @@
 
 typedef int (group_search_t)(struct inode *,
 			     struct buffer_head *,
-			     u32,
-			     u32,
-			     u16 *,
-			     u16 *);
+			     u32,			/* bits_wanted */
+			     u32,			/* min_bits */
+			     u64,			/* max_block */
+			     u16 *,			/* *bit_off */
+			     u16 *);			/* *bits_found */
 
 struct ocfs2_alloc_context {
 	struct inode *ac_inode;    /* which bitmap are we allocating from? */
@@ -51,6 +52,8 @@ struct ocfs2_alloc_context {
 	group_search_t *ac_group_search;
 
 	u64    ac_last_group;
+	u64    ac_max_block;  /* Highest block number to allocate. 0 is
+				 is the same as ~0 - unlimited */
 };
 
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
-- 
cgit v1.2.3


From 12462f1d9f0b96389497438dc2730c6f7410be82 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 3 Sep 2008 20:03:40 -0700
Subject: ocfs2: Add the 'inode64' mount option.

Now that ocfs2 limits inode numbers to 32bits, add a mount option to
disable the limit.  This parallels XFS.  64bit systems can handle the
larger inode numbers.

[ Added description of inode64 mount option in ocfs2.txt. --Mark ]

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2.h    |  1 +
 fs/ocfs2/suballoc.c |  5 +++--
 fs/ocfs2/super.c    | 17 +++++++++++++++++
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6d3c10ddf48..78ae4f87e6b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -189,6 +189,7 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
 	OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
 	OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
+	OCFS2_MOUNT_INODE64 = 1 << 7,	/* Allow inode numbers > 2^32 */
 };
 
 #define OCFS2_OSB_SOFT_RO	0x0001
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 213bdca16fe..d7a6f928c31 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -601,9 +601,10 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 	/*
 	 * stat(2) can't handle i_ino > 32bits, so we tell the
 	 * lower levels not to allocate us a block group past that
-	 * limit.
+	 * limit.  The 'inode64' mount option avoids this behavior.
 	 */
-	(*ac)->ac_max_block = (u32)~0U;
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
+		(*ac)->ac_max_block = (u32)~0U;
 
 	/*
 	 * slot is set when we successfully steal inode from other nodes.
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c85e525950a..1a51c8c53be 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -157,6 +157,7 @@ enum {
 	Opt_stack,
 	Opt_user_xattr,
 	Opt_nouser_xattr,
+	Opt_inode64,
 	Opt_err,
 };
 
@@ -178,6 +179,7 @@ static const match_table_t tokens = {
 	{Opt_stack, "cluster_stack=%s"},
 	{Opt_user_xattr, "user_xattr"},
 	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_inode64, "inode64"},
 	{Opt_err, NULL}
 };
 
@@ -411,6 +413,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 		goto out;
 	}
 
+	/* Probably don't want this on remount; it might
+	 * mess with other nodes */
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
+	    (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
+		ret = -EINVAL;
+		mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
+		goto out;
+	}
+
 	/* We're going to/from readonly mode. */
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
 		/* Lock here so the check of HARD_RO and the potential
@@ -930,6 +941,9 @@ static int ocfs2_parse_options(struct super_block *sb,
 			       OCFS2_STACK_LABEL_LEN);
 			mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
 			break;
+		case Opt_inode64:
+			mopt->mount_opt |= OCFS2_MOUNT_INODE64;
+			break;
 		default:
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
@@ -994,6 +1008,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
 			   osb->osb_cluster_stack);
 
+	if (opts & OCFS2_MOUNT_INODE64)
+		seq_printf(s, ",inode64");
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2b4e30fbde425828b17f0e9c8f8e3fd3ecb2bc75 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 3 Sep 2008 20:03:41 -0700
Subject: ocfs2: Switch over to JBD2.

ocfs2 wants JBD2 for many reasons, not the least of which is that JBD is
limiting our maximum filesystem size.

It's a pretty trivial change.  Most functions are just renamed.  The
only functional change is moving to Jan's inode-based ordered data mode.
It's better, too.

Because JBD2 reads and writes JBD journals, this is compatible with any
existing filesystem.  It can even interact with JBD-based ocfs2 as long
as the journal is formatted for JBD.

We provide a compatibility option so that paranoid people can still use
JBD for the time being.  This will go away shortly.

[ Moved call of ocfs2_begin_ordered_truncate() from ocfs2_delete_inode() to
  ocfs2_truncate_for_delete(). --Mark ]

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c            | 28 ++++++----------
 fs/ocfs2/aops.c             | 21 +++++++++---
 fs/ocfs2/file.c             | 14 +++++---
 fs/ocfs2/inode.c            |  5 +++
 fs/ocfs2/inode.h            |  1 +
 fs/ocfs2/journal.c          | 72 ++++++++++++++++++++-------------------
 fs/ocfs2/journal.h          | 25 ++++++++++++--
 fs/ocfs2/ocfs2.h            |  7 +++-
 fs/ocfs2/ocfs2_jbd_compat.h | 82 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/super.c            | 10 +++---
 fs/ocfs2/uptodate.c         |  6 +++-
 11 files changed, 202 insertions(+), 69 deletions(-)
 create mode 100644 fs/ocfs2/ocfs2_jbd_compat.h

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 47201b67dbf..ebfe36ab2d5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6421,20 +6421,13 @@ bail:
 	return status;
 }
 
-static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
 {
 	set_buffer_uptodate(bh);
 	mark_buffer_dirty(bh);
 	return 0;
 }
 
-static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
-{
-	set_buffer_uptodate(bh);
-	mark_buffer_dirty(bh);
-	return ocfs2_journal_dirty_data(handle, bh);
-}
-
 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
 				     unsigned int from, unsigned int to,
 				     struct page *page, int zero, u64 *phys)
@@ -6453,17 +6446,18 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
 	 * here if they aren't - ocfs2_map_page_blocks()
 	 * might've skipped some
 	 */
-	if (ocfs2_should_order_data(inode)) {
-		ret = walk_page_buffers(handle,
-					page_buffers(page),
-					from, to, &partial,
-					ocfs2_ordered_zero_func);
-		if (ret < 0)
-			mlog_errno(ret);
-	} else {
+	ret = walk_page_buffers(handle, page_buffers(page),
+				from, to, &partial,
+				ocfs2_zero_func);
+	if (ret < 0)
+		mlog_errno(ret);
+	else if (ocfs2_should_order_data(inode)) {
+		ret = ocfs2_jbd2_file_inode(handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 		ret = walk_page_buffers(handle, page_buffers(page),
 					from, to, &partial,
-					ocfs2_writeback_zero_func);
+					ocfs2_journal_dirty_data);
+#endif
 		if (ret < 0)
 			mlog_errno(ret);
 	}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 259775eedb8..de179054a74 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -485,11 +485,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 	}
 
 	if (ocfs2_should_order_data(inode)) {
+		ret = ocfs2_jbd2_file_inode(handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 		ret = walk_page_buffers(handle,
 					page_buffers(page),
 					from, to, NULL,
 					ocfs2_journal_dirty_data);
-		if (ret < 0) 
+#endif
+		if (ret < 0)
 			mlog_errno(ret);
 	}
 out:
@@ -669,7 +672,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
 {
 	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
 
-	journal_invalidatepage(journal, page, offset);
+	jbd2_journal_invalidatepage(journal, page, offset);
 }
 
 static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -678,7 +681,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
 
 	if (!page_has_buffers(page))
 		return 0;
-	return journal_try_to_free_buffers(journal, page, wait);
+	return jbd2_journal_try_to_free_buffers(journal, page, wait);
 }
 
 static ssize_t ocfs2_direct_IO(int rw,
@@ -1074,11 +1077,15 @@ static void ocfs2_write_failure(struct inode *inode,
 		tmppage = wc->w_pages[i];
 
 		if (page_has_buffers(tmppage)) {
-			if (ocfs2_should_order_data(inode))
+			if (ocfs2_should_order_data(inode)) {
+				ocfs2_jbd2_file_inode(wc->w_handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 				walk_page_buffers(wc->w_handle,
 						  page_buffers(tmppage),
 						  from, to, NULL,
 						  ocfs2_journal_dirty_data);
+#endif
+			}
 
 			block_commit_write(tmppage, from, to);
 		}
@@ -1917,11 +1924,15 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 		}
 
 		if (page_has_buffers(tmppage)) {
-			if (ocfs2_should_order_data(inode))
+			if (ocfs2_should_order_data(inode)) {
+				ocfs2_jbd2_file_inode(wc->w_handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 				walk_page_buffers(wc->w_handle,
 						  page_buffers(tmppage),
 						  from, to, NULL,
 						  ocfs2_journal_dirty_data);
+#endif
+			}
 			block_commit_write(tmppage, from, to);
 		}
 	}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 441c6a94059..c95318bc00c 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -185,7 +185,7 @@ static int ocfs2_sync_file(struct file *file,
 		goto bail;
 
 	journal = osb->journal->j_journal;
-	err = journal_force_commit(journal);
+	err = jbd2_journal_force_commit(journal);
 
 bail:
 	mlog_exit(err);
@@ -941,9 +941,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 			goto bail_unlock;
 		}
 
-		if (i_size_read(inode) > attr->ia_size)
+		if (i_size_read(inode) > attr->ia_size) {
+			if (ocfs2_should_order_data(inode)) {
+				status = ocfs2_begin_ordered_truncate(inode,
+								      attr->ia_size);
+				if (status)
+					goto bail_unlock;
+			}
 			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
-		else
+		} else
 			status = ocfs2_extend_file(inode, bh, attr->ia_size);
 		if (status < 0) {
 			if (status != -ENOSPC)
@@ -1888,7 +1894,7 @@ out_dio:
 		 */
 		if (old_size != i_size_read(inode) ||
 		    old_clusters != OCFS2_I(inode)->ip_clusters) {
-			ret = journal_force_commit(osb->journal->j_journal);
+			ret = jbd2_journal_force_commit(osb->journal->j_journal);
 			if (ret < 0)
 				written = ret;
 		}
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4738dd25bb9..9d92c859ac9 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -534,6 +534,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 	 * data and fast symlinks.
 	 */
 	if (fe->i_clusters) {
+		if (ocfs2_should_order_data(inode))
+			ocfs2_begin_ordered_truncate(inode, 0);
+
 		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 		if (IS_ERR(handle)) {
 			status = PTR_ERR(handle);
@@ -1100,6 +1103,8 @@ void ocfs2_clear_inode(struct inode *inode)
 	oi->ip_last_trans = 0;
 	oi->ip_dir_start_lookup = 0;
 	oi->ip_blkno = 0ULL;
+	jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
+				       &oi->ip_jinode);
 
 bail:
 	mlog_exit_void();
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 499bc62e758..f66e4340f17 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -71,6 +71,7 @@ struct ocfs2_inode_info
 	struct ocfs2_extent_map		ip_extent_map;
 
 	struct inode			vfs_inode;
+	struct jbd2_inode		ip_jinode;
 };
 
 /*
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index c47bc2a809c..373d94366a4 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 		goto finally;
 	}
 
-	journal_lock_updates(journal->j_journal);
-	status = journal_flush(journal->j_journal);
-	journal_unlock_updates(journal->j_journal);
+	jbd2_journal_lock_updates(journal->j_journal);
+	status = jbd2_journal_flush(journal->j_journal);
+	jbd2_journal_unlock_updates(journal->j_journal);
 	if (status < 0) {
 		up_write(&journal->j_trans_barrier);
 		mlog_errno(status);
@@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 
 	down_read(&osb->journal->j_trans_barrier);
 
-	handle = journal_start(journal, max_buffs);
+	handle = jbd2_journal_start(journal, max_buffs);
 	if (IS_ERR(handle)) {
 		up_read(&osb->journal->j_trans_barrier);
 
@@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
 
 	BUG_ON(!handle);
 
-	ret = journal_stop(handle);
+	ret = jbd2_journal_stop(handle);
 	if (ret < 0)
 		mlog_errno(ret);
 
@@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
  * transaction. extend_trans will either extend the current handle by
  * nblocks, or commit it and start a new one with nblocks credits.
  *
- * This might call journal_restart() which will commit dirty buffers
+ * This might call jbd2_journal_restart() which will commit dirty buffers
  * and then restart the transaction. Before calling
  * ocfs2_extend_trans(), any changed blocks should have been
  * dirtied. After calling it, all blocks which need to be changed must
@@ -332,7 +332,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 #ifdef CONFIG_OCFS2_DEBUG_FS
 	status = 1;
 #else
-	status = journal_extend(handle, nblocks);
+	status = jbd2_journal_extend(handle, nblocks);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 #endif
 
 	if (status > 0) {
-		mlog(0, "journal_extend failed, trying journal_restart\n");
-		status = journal_restart(handle, nblocks);
+		mlog(0,
+		     "jbd2_journal_extend failed, trying "
+		     "jbd2_journal_restart\n");
+		status = jbd2_journal_restart(handle, nblocks);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle,
 	switch (type) {
 	case OCFS2_JOURNAL_ACCESS_CREATE:
 	case OCFS2_JOURNAL_ACCESS_WRITE:
-		status = journal_get_write_access(handle, bh);
+		status = jbd2_journal_get_write_access(handle, bh);
 		break;
 
 	case OCFS2_JOURNAL_ACCESS_UNDO:
-		status = journal_get_undo_access(handle, bh);
+		status = jbd2_journal_get_undo_access(handle, bh);
 		break;
 
 	default:
@@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle,
 	mlog_entry("(bh->b_blocknr=%llu)\n",
 		   (unsigned long long)bh->b_blocknr);
 
-	status = journal_dirty_metadata(handle, bh);
+	status = jbd2_journal_dirty_metadata(handle, bh);
 	if (status < 0)
 		mlog(ML_ERROR, "Could not dirty metadata buffer. "
 		     "(bh->b_blocknr=%llu)\n",
@@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle,
 	return status;
 }
 
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 int ocfs2_journal_dirty_data(handle_t *handle,
 			     struct buffer_head *bh)
 {
@@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
 
 	return err;
 }
+#endif
 
-#define OCFS2_DEFAULT_COMMIT_INTERVAL 	(HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
+#define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
 
 void ocfs2_set_journal_params(struct ocfs2_super *osb)
 {
@@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
 	spin_lock(&journal->j_state_lock);
 	journal->j_commit_interval = commit_interval;
 	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
-		journal->j_flags |= JFS_BARRIER;
+		journal->j_flags |= JBD2_BARRIER;
 	else
-		journal->j_flags &= ~JFS_BARRIER;
+		journal->j_flags &= ~JBD2_BARRIER;
 	spin_unlock(&journal->j_state_lock);
 }
 
@@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
 	mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
 
 	/* call the kernels journal init function now */
-	j_journal = journal_init_inode(inode);
+	j_journal = jbd2_journal_init_inode(inode);
 	if (j_journal == NULL) {
 		mlog(ML_ERROR, "Linux journal layer error\n");
 		status = -EINVAL;
 		goto done;
 	}
 
-	mlog(0, "Returned from journal_init_inode\n");
+	mlog(0, "Returned from jbd2_journal_init_inode\n");
 	mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
 
 	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
@@ -639,7 +643,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 	if (journal->j_state != OCFS2_JOURNAL_LOADED)
 		goto done;
 
-	/* need to inc inode use count as journal_destroy will iput. */
+	/* need to inc inode use count - jbd2_journal_destroy will iput. */
 	if (!igrab(inode))
 		BUG();
 
@@ -668,9 +672,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
 
 	if (ocfs2_mount_local(osb)) {
-		journal_lock_updates(journal->j_journal);
-		status = journal_flush(journal->j_journal);
-		journal_unlock_updates(journal->j_journal);
+		jbd2_journal_lock_updates(journal->j_journal);
+		status = jbd2_journal_flush(journal->j_journal);
+		jbd2_journal_unlock_updates(journal->j_journal);
 		if (status < 0)
 			mlog_errno(status);
 	}
@@ -686,7 +690,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 	}
 
 	/* Shutdown the kernel journal system */
-	journal_destroy(journal->j_journal);
+	jbd2_journal_destroy(journal->j_journal);
 
 	OCFS2_I(inode)->ip_open_count--;
 
@@ -711,15 +715,15 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
 {
 	int olderr;
 
-	olderr = journal_errno(journal);
+	olderr = jbd2_journal_errno(journal);
 	if (olderr) {
 		mlog(ML_ERROR, "File system error %d recorded in "
 		     "journal %u.\n", olderr, slot);
 		mlog(ML_ERROR, "File system on device %s needs checking.\n",
 		     sb->s_id);
 
-		journal_ack_err(journal);
-		journal_clear_err(journal);
+		jbd2_journal_ack_err(journal);
+		jbd2_journal_clear_err(journal);
 	}
 }
 
@@ -734,7 +738,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 
 	osb = journal->j_osb;
 
-	status = journal_load(journal->j_journal);
+	status = jbd2_journal_load(journal->j_journal);
 	if (status < 0) {
 		mlog(ML_ERROR, "Failed to load journal!\n");
 		goto done;
@@ -778,7 +782,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
 
 	BUG_ON(!journal);
 
-	status = journal_wipe(journal->j_journal, full);
+	status = jbd2_journal_wipe(journal->j_journal, full);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1229,19 +1233,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	}
 
 	mlog(0, "calling journal_init_inode\n");
-	journal = journal_init_inode(inode);
+	journal = jbd2_journal_init_inode(inode);
 	if (journal == NULL) {
 		mlog(ML_ERROR, "Linux journal layer error\n");
 		status = -EIO;
 		goto done;
 	}
 
-	status = journal_load(journal);
+	status = jbd2_journal_load(journal);
 	if (status < 0) {
 		mlog_errno(status);
 		if (!igrab(inode))
 			BUG();
-		journal_destroy(journal);
+		jbd2_journal_destroy(journal);
 		goto done;
 	}
 
@@ -1249,9 +1253,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 
 	/* wipe the journal */
 	mlog(0, "flushing the journal.\n");
-	journal_lock_updates(journal);
-	status = journal_flush(journal);
-	journal_unlock_updates(journal);
+	jbd2_journal_lock_updates(journal);
+	status = jbd2_journal_flush(journal);
+	jbd2_journal_unlock_updates(journal);
 	if (status < 0)
 		mlog_errno(status);
 
@@ -1272,7 +1276,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	if (!igrab(inode))
 		BUG();
 
-	journal_destroy(journal);
+	jbd2_journal_destroy(journal);
 
 done:
 	/* drop the lock on this nodes journal */
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 08d1add1487..d4d14e9a3ce 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,7 +27,12 @@
 #define OCFS2_JOURNAL_H
 
 #include <linux/fs.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+# include "ocfs2_jbd_compat.h"
+#endif
 
 enum ocfs2_journal_state {
 	OCFS2_JOURNAL_FREE = 0,
@@ -215,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
  *                          buffer. Will have to call ocfs2_journal_dirty once
  *                          we've actually dirtied it. Type is one of . or .
  *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
- *  ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
- *                             the current handle commits.
+ *  ocfs2_jbd2_file_inode  - Mark an inode so that its data goes out before
+ *                           the current handle commits.
  */
 
 /* You must always start_trans with a number of buffs > 0, but it's
@@ -268,8 +273,10 @@ int                  ocfs2_journal_access(handle_t *handle,
  */
 int                  ocfs2_journal_dirty(handle_t *handle,
 					 struct buffer_head *bh);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 int                  ocfs2_journal_dirty_data(handle_t *handle,
 					      struct buffer_head *bh);
+#endif
 
 /*
  *  Credit Macros:
@@ -430,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 	return credits;
 }
 
+static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+	return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+}
+
+static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
+					       loff_t new_size)
+{
+	return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
+						   new_size);
+}
+
 #endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 78ae4f87e6b..a21a465490c 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -34,7 +34,12 @@
 #include <linux/workqueue.h>
 #include <linux/kref.h>
 #include <linux/mutex.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+# include "ocfs2_jbd_compat.h"
+#endif
 
 /* For union ocfs2_dlm_lksb */
 #include "stackglue.h"
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
new file mode 100644
index 00000000000..b91c78f8f55
--- /dev/null
+++ b/fs/ocfs2/ocfs2_jbd_compat.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_jbd_compat.h
+ *
+ * Compatibility defines for JBD.
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_JBD_COMPAT_H
+#define OCFS2_JBD_COMPAT_H
+
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# error Should not have been included
+#endif
+
+struct jbd2_inode {
+	unsigned int dummy;
+};
+
+#define JBD2_BARRIER			JFS_BARRIER
+#define JBD2_DEFAULT_MAX_COMMIT_AGE	JBD_DEFAULT_MAX_COMMIT_AGE
+
+#define jbd2_journal_ack_err			journal_ack_err
+#define jbd2_journal_clear_err			journal_clear_err
+#define jbd2_journal_destroy			journal_destroy
+#define jbd2_journal_dirty_metadata		journal_dirty_metadata
+#define jbd2_journal_errno			journal_errno
+#define jbd2_journal_extend			journal_extend
+#define jbd2_journal_flush			journal_flush
+#define jbd2_journal_force_commit		journal_force_commit
+#define jbd2_journal_get_write_access		journal_get_write_access
+#define jbd2_journal_get_undo_access		journal_get_undo_access
+#define jbd2_journal_init_inode			journal_init_inode
+#define jbd2_journal_invalidatepage		journal_invalidatepage
+#define jbd2_journal_load			journal_load
+#define jbd2_journal_lock_updates		journal_lock_updates
+#define jbd2_journal_restart			journal_restart
+#define jbd2_journal_start			journal_start
+#define jbd2_journal_start_commit		journal_start_commit
+#define jbd2_journal_stop			journal_stop
+#define jbd2_journal_try_to_free_buffers	journal_try_to_free_buffers
+#define jbd2_journal_unlock_updates		journal_unlock_updates
+#define jbd2_journal_wipe			journal_wipe
+#define jbd2_log_wait_commit			log_wait_commit
+
+static inline int jbd2_journal_file_inode(handle_t *handle,
+					  struct jbd2_inode *inode)
+{
+	return 0;
+}
+
+static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+						      loff_t new_size)
+{
+	return 0;
+}
+
+static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
+					       struct inode *inode)
+{
+	return;
+}
+
+static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
+						  struct jbd2_inode *jinode)
+{
+	return;
+}
+
+
+#endif  /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 1a51c8c53be..8b4c5c67dcd 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -212,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
 		ocfs2_schedule_truncate_log_flush(osb, 0);
 	}
 
-	if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
+	if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
+				      &target)) {
 		if (wait)
-			log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
-					target);
+			jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
+					     target);
 	}
 	return 0;
 }
@@ -332,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
 	if (!oi)
 		return NULL;
 
+	jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
 	return &oi->vfs_inode;
 }
 
@@ -896,7 +898,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 			if (option < 0)
 				return 0;
 			if (option == 0)
-				option = JBD_DEFAULT_MAX_COMMIT_AGE;
+				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
 			mopt->commit_interval = HZ * option;
 			break;
 		case Opt_localalloc:
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index e26459e7d55..52351402089 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,7 +53,11 @@
 #include <linux/highmem.h>
 #include <linux/buffer_head.h>
 #include <linux/rbtree.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+#endif
 
 #define MLOG_MASK_PREFIX ML_UPTODATE
 
-- 
cgit v1.2.3


From b0f73cfc36ed62decdd3f78e943bbfd00ee80e49 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Fri, 5 Sep 2008 11:29:14 -0700
Subject: ocfs2: Add xattr mount option in ocfs2_show_options()

Patch adds check for [no]user_xattr in ocfs2_show_options() that completes
the list of all mount options.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/super.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8b4c5c67dcd..d2027cec8f3 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1010,6 +1010,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
 			   osb->osb_cluster_stack);
 
+	if (opts & OCFS2_MOUNT_NOUSERXATTR)
+		seq_printf(s, ",nouser_xattr");
+	else
+		seq_printf(s, ",user_xattr");
+
 	if (opts & OCFS2_MOUNT_INODE64)
 		seq_printf(s, ",inode64");
 
-- 
cgit v1.2.3


From 06b240d8af21ddee4cfec3b0f02b81d9f168a98a Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 19 Sep 2008 22:16:34 +0800
Subject: ocfs2/xattr.c: Fix a bug when inserting xattr.

During the process of xattr insertion, we use binary search
to find the right place and "low" is set to it. But when
there is one xattr which has the same name hash as the inserted
one, low is the wrong value. So set it to the right position.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index b2e25a828e3..b1f2a164e7d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4003,8 +4003,10 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 			else if (name_hash <
 				 le32_to_cpu(tmp_xe->xe_name_hash))
 				high = tmp - 1;
-			else
+			else {
+				low = tmp;
 				break;
+			}
 		}
 
 		xe = &xh->xh_entries[low];
-- 
cgit v1.2.3


From 5a09561199e7f8d3feaaa01c39372050e140b775 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 19 Sep 2008 22:17:41 +0800
Subject: ocfs2: Add empty bucket support in xattr.

As Mark mentioned, it may be time-consuming when we remove the
empty xattr bucket, so this patch tries to let empty buckets exist
in xattr operations. The modifications include:
1. Remove the function of bucket and extent record deletion during
   xattr delete.
2. In xattr set:
   1) Don't clean the last entry so that if the bucket is empty,
      the hash value of the bucket is the hash value of the entry
      which is deleted last.
   2) During insert, if we meet with an empty bucket, just use the
      1st entry.
3. In binary search of xattr bucket, use the bucket hash value (which
   is stored in the 1st xattr entry) to find the right place.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 197 ++++++++++++-------------------------------------------
 1 file changed, 43 insertions(+), 154 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index b1f2a164e7d..64700c3fc24 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2301,9 +2301,12 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 
 		/*
 		 * Check whether the hash of the last entry in our
-		 * bucket is larger than the search one.
+		 * bucket is larger than the search one. for an empty
+		 * bucket, the last one is also the first one.
 		 */
-		xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
+		if (xh->xh_count)
+			xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
+
 		last_hash = le32_to_cpu(xe->xe_name_hash);
 
 		/* record lower_bh which may be the insert place. */
@@ -2450,7 +2453,8 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 		if (i == 0)
 			num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
 
-		mlog(0, "iterating xattr bucket %llu\n", blkno);
+		mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno,
+		     le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
 		if (func) {
 			ret = func(inode, &bucket, para);
 			if (ret) {
@@ -3915,8 +3919,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
 
 /*
  * Handle the normal xattr set, including replace, delete and new.
- * When the bucket is empty, "is_empty" is set and the caller can
- * free this bucket.
  *
  * Note: "local" indicates the real data's locality. So we can't
  * just its bucket locality by its length.
@@ -3925,8 +3927,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 					 struct ocfs2_xattr_info *xi,
 					 struct ocfs2_xattr_search *xs,
 					 u32 name_hash,
-					 int local,
-					 int *is_empty)
+					 int local)
 {
 	struct ocfs2_xattr_entry *last, *xe;
 	int name_len = strlen(xi->name);
@@ -3979,14 +3980,23 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 			ocfs2_xattr_set_local(xe, local);
 			return;
 		} else {
-			/* Remove the old entry. */
+			/*
+			 * Remove the old entry if there is more than one.
+			 * We don't remove the last entry so that we can
+			 * use it to indicate the hash value of the empty
+			 * bucket.
+			 */
 			last -= 1;
-			memmove(xe, xe + 1,
-				(void *)last - (void *)xe);
-			memset(last, 0, sizeof(struct ocfs2_xattr_entry));
 			le16_add_cpu(&xh->xh_count, -1);
-			if (xh->xh_count == 0 && is_empty)
-				*is_empty = 1;
+			if (xh->xh_count) {
+				memmove(xe, xe + 1,
+					(void *)last - (void *)xe);
+				memset(last, 0,
+				       sizeof(struct ocfs2_xattr_entry));
+			} else
+				xh->xh_free_start =
+					cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+
 			return;
 		}
 	} else {
@@ -3994,7 +4004,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 		int low = 0, high = count - 1, tmp;
 		struct ocfs2_xattr_entry *tmp_xe;
 
-		while (low <= high) {
+		while (low <= high && count) {
 			tmp = (low + high) / 2;
 			tmp_xe = &xh->xh_entries[tmp];
 
@@ -4090,8 +4100,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 					   struct ocfs2_xattr_info *xi,
 					   struct ocfs2_xattr_search *xs,
 					   u32 name_hash,
-					   int local,
-					   int *bucket_empty)
+					   int local)
 {
 	int i, ret;
 	handle_t *handle = NULL;
@@ -4130,8 +4139,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 		}
 	}
 
-	ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash,
-				     local, bucket_empty);
+	ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
 
 	/*Only dirty the blocks we have touched in set xattr. */
 	ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
@@ -4280,69 +4288,6 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
 	return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
 }
 
-/*
- * Remove the xattr bucket pointed by bucket_bh.
- * All the buckets after it in the same xattr extent rec will be
- * move forward one by one.
- */
-static int ocfs2_rm_xattr_bucket(struct inode *inode,
-				 struct buffer_head *first_bh,
-				 struct ocfs2_xattr_bucket *bucket)
-{
-	int ret = 0, credits;
-	struct ocfs2_xattr_header *xh =
-				(struct ocfs2_xattr_header *)first_bh->b_data;
-	u16 bucket_num = le16_to_cpu(xh->xh_num_buckets);
-	u64 end, start = bucket->bhs[0]->b_blocknr;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	handle_t *handle;
-	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-
-	end = first_bh->b_blocknr + (bucket_num - 1) * blk_per_bucket;
-
-	mlog(0, "rm xattr bucket %llu\n", start);
-	/*
-	 * We need to update the first xattr_header and all the buckets starting
-	 * from start in this xattr rec.
-	 *
-	 * XXX: Should we empty the old last bucket here?
-	 */
-	credits = 1 + end - start;
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		return ret;
-	}
-
-	ret = ocfs2_journal_access(handle, inode, first_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-
-	while (start < end) {
-		ret = ocfs2_cp_xattr_bucket(inode, handle,
-					    start + blk_per_bucket,
-					    start, 0);
-		if (ret) {
-			mlog_errno(ret);
-			goto out_commit;
-		}
-		start += blk_per_bucket;
-	}
-
-	/* update the first_bh. */
-	xh->xh_num_buckets = cpu_to_le16(bucket_num - 1);
-	ocfs2_journal_dirty(handle, first_bh);
-
-out_commit:
-	ocfs2_commit_trans(osb, handle);
-	return ret;
-}
-
 static int ocfs2_rm_xattr_cluster(struct inode *inode,
 				  struct buffer_head *root_bh,
 				  u64 blkno,
@@ -4432,57 +4377,6 @@ out:
 	return ret;
 }
 
-/*
- * Free the xattr bucket indicated by xs->bucket and if all the buckets
- * in the clusters is free, free the clusters also.
- */
-static int ocfs2_xattr_bucket_shrink(struct inode *inode,
-				     struct ocfs2_xattr_info *xi,
-				     struct ocfs2_xattr_search *xs,
-				     u32 name_hash)
-{
-	int ret;
-	u32 e_cpos, num_clusters;
-	u64 p_blkno;
-	struct buffer_head *first_bh = NULL;
-	struct ocfs2_xattr_header *first_xh;
-	struct ocfs2_xattr_block *xb =
-			(struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
-
-	BUG_ON(xs->header->xh_count != 0);
-
-	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
-				  &e_cpos, &num_clusters,
-				  &xb->xb_attrs.xb_root.xt_list);
-	if (ret) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno,
-			       &first_bh, OCFS2_BH_CACHED, inode);
-	if (ret) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	ret = ocfs2_rm_xattr_bucket(inode, first_bh, &xs->bucket);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
-	if (first_xh->xh_num_buckets == 0)
-		ret = ocfs2_rm_xattr_cluster(inode, xs->xattr_bh,
-					     p_blkno, e_cpos,
-					     num_clusters);
-
-out:
-	brelse(first_bh);
-	return ret;
-}
-
 static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
 					 struct ocfs2_xattr_search *xs)
 {
@@ -4534,7 +4428,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 				     struct ocfs2_xattr_info *xi,
 				     struct ocfs2_xattr_search *xs)
 {
-	int ret, local = 1, bucket_empty = 0;
+	int ret, local = 1;
 	size_t value_len;
 	char *val = (char *)xi->value;
 	struct ocfs2_xattr_entry *xe = xs->here;
@@ -4580,34 +4474,29 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 		xi->value_len = OCFS2_XATTR_ROOT_SIZE;
 	}
 
-	ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash,
-					      local, &bucket_empty);
+	ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	if (value_len > OCFS2_XATTR_INLINE_SIZE) {
-		/* allocate the space now for the outside block storage. */
-		ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-							   value_len);
-		if (ret) {
-			mlog_errno(ret);
+	if (value_len <= OCFS2_XATTR_INLINE_SIZE)
+		goto out;
 
-			if (xs->not_found) {
-				/*
-				 * We can't allocate enough clusters for outside
-				 * storage and we have allocated xattr already,
-				 * so need to remove it.
-				 */
-				ocfs2_xattr_bucket_remove_xs(inode, xs);
-			}
-			goto out;
+	/* allocate the space now for the outside block storage. */
+	ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
+						   value_len);
+	if (ret) {
+		mlog_errno(ret);
+
+		if (xs->not_found) {
+			/*
+			 * We can't allocate enough clusters for outside
+			 * storage and we have allocated xattr already,
+			 * so need to remove it.
+			 */
+			ocfs2_xattr_bucket_remove_xs(inode, xs);
 		}
-	} else {
-		if (bucket_empty)
-			ret = ocfs2_xattr_bucket_shrink(inode, xi,
-							xs, name_hash);
 		goto out;
 	}
 
-- 
cgit v1.2.3


From 009d37502a7b9fc89741e66b4454afca4edc1c26 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Mon, 6 Oct 2008 16:16:08 -0700
Subject: ocfs2: Remove pointless !!

ocfs2_stack_supports_plocks() doesn't need this to properly return a zero or
one value.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/stackglue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 7150f5dce95..68b668b0e60 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -290,7 +290,7 @@ EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
 
 int ocfs2_stack_supports_plocks(void)
 {
-	return !!(active_stack && active_stack->sp_ops->plock);
+	return active_stack && active_stack->sp_ops->plock;
 }
 EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
 
-- 
cgit v1.2.3


From 4cc8124584610fbe087ea2bed29ca52d2d0aa84a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 7 Oct 2008 11:02:04 -0700
Subject: ocfs2: make la_debug_mutex static

It can also be moved into ocfs2_la_debug_read().

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/localalloc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 02227c39251..b1c634d676a 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -76,8 +76,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 
 #ifdef CONFIG_OCFS2_FS_STATS
 
-DEFINE_MUTEX(la_debug_mutex);
-
 static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
 {
 	file->private_data = inode->i_private;
@@ -89,6 +87,7 @@ static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
 static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
 				   size_t count, loff_t *ppos)
 {
+	static DEFINE_MUTEX(la_debug_mutex);
 	struct ocfs2_super *osb = file->private_data;
 	int written, ret;
 	char *buf = osb->local_alloc_debug_buf;
-- 
cgit v1.2.3


From fd8351f83d413b41da956109cf429c15881886e2 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 7 Oct 2008 12:50:46 -0700
Subject: ocfs2: use smaller counters in ocfs2_remove_xattr_clusters_from_cache

i and b_len don't really need to be u64's. Xattr extent lengths should be
limited by the VFS, and then the size of our on-disk length field.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/uptodate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 52351402089..187b99ff036 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -562,7 +562,7 @@ void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
 					    sector_t block,
 					    u32 c_len)
 {
-	u64 i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
+	unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
 
 	for (i = 0; i < b_len; i++, block++)
 		ocfs2_remove_block_from_cache(inode, block);
-- 
cgit v1.2.3


From a81cb88b64a479b78c6dd5666678d50171865db8 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 7 Oct 2008 14:25:16 -0700
Subject: ocfs2: Don't check for NULL before brelse()

This is pointless as brelse() already does the check.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c      | 33 +++++++-------------
 fs/ocfs2/aops.c       |  3 +-
 fs/ocfs2/dir.c        | 24 +++++----------
 fs/ocfs2/file.c       |  9 ++----
 fs/ocfs2/inode.c      |  7 ++---
 fs/ocfs2/ioctl.c      |  3 +-
 fs/ocfs2/journal.c    |  9 ++----
 fs/ocfs2/localalloc.c | 15 ++++------
 fs/ocfs2/namei.c      | 83 ++++++++++++++++-----------------------------------
 fs/ocfs2/suballoc.c   | 29 +++++++-----------
 fs/ocfs2/super.c      |  3 +-
 fs/ocfs2/symlink.c    |  3 +-
 12 files changed, 74 insertions(+), 147 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ebfe36ab2d5..052c4cf7db9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -719,8 +719,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 
 	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
 bail:
-	if (eb_bh)
-		brelse(eb_bh);
+	brelse(eb_bh);
 
 	mlog_exit(retval);
 	return retval;
@@ -806,8 +805,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
 bail:
 	if (status < 0) {
 		for(i = 0; i < wanted; i++) {
-			if (bhs[i])
-				brelse(bhs[i]);
+			brelse(bhs[i]);
 			bhs[i] = NULL;
 		}
 	}
@@ -1017,8 +1015,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 bail:
 	if (new_eb_bhs) {
 		for (i = 0; i < new_blocks; i++)
-			if (new_eb_bhs[i])
-				brelse(new_eb_bhs[i]);
+			brelse(new_eb_bhs[i]);
 		kfree(new_eb_bhs);
 	}
 
@@ -1116,8 +1113,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	new_eb_bh = NULL;
 	status = 0;
 bail:
-	if (new_eb_bh)
-		brelse(new_eb_bh);
+	brelse(new_eb_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1177,10 +1173,8 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		if (bh) {
-			brelse(bh);
-			bh = NULL;
-		}
+		brelse(bh);
+		bh = NULL;
 
 		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
 					  inode);
@@ -1199,8 +1193,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 		if (le16_to_cpu(el->l_next_free_rec) <
 		    le16_to_cpu(el->l_count)) {
-			if (lowest_bh)
-				brelse(lowest_bh);
+			brelse(lowest_bh);
 			lowest_bh = bh;
 			get_bh(lowest_bh);
 		}
@@ -1214,8 +1207,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 	*target_bh = lowest_bh;
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
@@ -4471,8 +4463,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 		ocfs2_extent_map_insert_rec(inode, &rec);
 
 bail:
-	if (last_eb_bh)
-		brelse(last_eb_bh);
+	brelse(last_eb_bh);
 
 	mlog_exit(status);
 	return status;
@@ -5677,8 +5668,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 bail:
 	if (tl_inode)
 		iput(tl_inode);
-	if (tl_bh)
-		brelse(tl_bh);
+	brelse(tl_bh);
 
 	if (status < 0 && (*tl_copy)) {
 		kfree(*tl_copy);
@@ -7115,8 +7105,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
 		mlog(ML_NOTICE,
 		     "Truncate completion has non-empty dealloc context\n");
 
-	if (tc->tc_last_eb_bh)
-		brelse(tc->tc_last_eb_bh);
+	brelse(tc->tc_last_eb_bh);
 
 	kfree(tc);
 }
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index de179054a74..98e16fb49e4 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -128,8 +128,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 	err = 0;
 
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(err);
 	return err;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 167e6c96277..3614651dcdb 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -716,8 +716,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 			for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
 			     i > 0; i--) {
 				tmp = ocfs2_bread(inode, ++blk, &err, 1);
-				if (tmp)
-					brelse(tmp);
+				brelse(tmp);
 			}
 			last_ra_blk = blk;
 			ra_sectors = 8;
@@ -899,10 +898,8 @@ int ocfs2_find_files_on_disk(const char *name,
 leave:
 	if (status < 0) {
 		*dirent = NULL;
-		if (*dirent_bh) {
-			brelse(*dirent_bh);
-			*dirent_bh = NULL;
-		}
+		brelse(*dirent_bh);
+		*dirent_bh = NULL;
 	}
 
 	mlog_exit(status);
@@ -951,8 +948,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
 
 	ret = 0;
 bail:
-	if (dirent_bh)
-		brelse(dirent_bh);
+	brelse(dirent_bh);
 
 	mlog_exit(ret);
 	return ret;
@@ -1127,8 +1123,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
 	status = 0;
 bail:
-	if (new_bh)
-		brelse(new_bh);
+	brelse(new_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1574,8 +1569,7 @@ bail:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
-	if (new_bh)
-		brelse(new_bh);
+	brelse(new_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1702,8 +1696,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 
 	status = 0;
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
@@ -1762,7 +1755,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 	*ret_de_bh = bh;
 	bh = NULL;
 out:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 	return ret;
 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c95318bc00c..408d5a66591 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -671,10 +671,8 @@ leave:
 		restart_func = 0;
 		goto restart_all;
 	}
-	if (bh) {
-		brelse(bh);
-		bh = NULL;
-	}
+	brelse(bh);
+	bh = NULL;
 
 	mlog_exit(status);
 	return status;
@@ -991,8 +989,7 @@ bail_unlock_rw:
 	if (size_change)
 		ocfs2_rw_unlock(inode, 1);
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 9d92c859ac9..05ad1186a16 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1174,10 +1174,9 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
 	return bh;
 
 fail:
-	if (bh) {
-		brelse(bh);
-		bh = NULL;
-	}
+	brelse(bh);
+	bh = NULL;
+
 	*err = -EIO;
 	return NULL;
 }
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7b142f0ce99..9fcd36dcc9a 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -102,8 +102,7 @@ bail_unlock:
 bail:
 	mutex_unlock(&inode->i_mutex);
 
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 373d94366a4..562ba652593 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -554,8 +554,7 @@ done:
 	if (status < 0) {
 		if (inode_lock)
 			ocfs2_inode_unlock(inode, 1);
-		if (bh != NULL)
-			brelse(bh);
+		brelse(bh);
 		if (inode) {
 			OCFS2_I(inode)->ip_open_count--;
 			iput(inode);
@@ -869,8 +868,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
 
 bail:
 	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
-		if (bhs[i])
-			brelse(bhs[i]);
+		brelse(bhs[i]);
 	mlog_exit(status);
 	return status;
 }
@@ -1286,8 +1284,7 @@ done:
 	if (inode)
 		iput(inode);
 
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index b1c634d676a..1c4f0645fb3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -294,8 +294,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
 bail:
 	if (status < 0)
-		if (alloc_bh)
-			brelse(alloc_bh);
+		brelse(alloc_bh);
 	if (inode)
 		iput(inode);
 
@@ -411,8 +410,7 @@ out_commit:
 	ocfs2_commit_trans(osb, handle);
 
 out_unlock:
-	if (main_bm_bh)
-		brelse(main_bm_bh);
+	brelse(main_bm_bh);
 
 	ocfs2_inode_unlock(main_bm_inode, 1);
 
@@ -488,8 +486,7 @@ bail:
 		*alloc_copy = NULL;
 	}
 
-	if (alloc_bh)
-		brelse(alloc_bh);
+	brelse(alloc_bh);
 
 	if (inode) {
 		mutex_unlock(&inode->i_mutex);
@@ -557,8 +554,7 @@ out_unlock:
 out_mutex:
 	mutex_unlock(&main_bm_inode->i_mutex);
 
-	if (main_bm_bh)
-		brelse(main_bm_bh);
+	brelse(main_bm_bh);
 
 	iput(main_bm_inode);
 
@@ -1281,8 +1277,7 @@ bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
-	if (main_bm_bh)
-		brelse(main_bm_bh);
+	brelse(main_bm_bh);
 
 	if (main_bm_inode)
 		iput(main_bm_inode);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 76d1d131430..7d0dd5c95eb 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -328,14 +328,9 @@ leave:
 	if (status == -ENOSPC)
 		mlog(0, "Disk is full\n");
 
-	if (new_fe_bh)
-		brelse(new_fe_bh);
-
-	if (de_bh)
-		brelse(de_bh);
-
-	if (parent_fe_bh)
-		brelse(parent_fe_bh);
+	brelse(new_fe_bh);
+	brelse(de_bh);
+	brelse(parent_fe_bh);
 
 	if ((status < 0) && inode)
 		iput(inode);
@@ -648,12 +643,9 @@ out_unlock_inode:
 out:
 	ocfs2_inode_unlock(dir, 1);
 
-	if (de_bh)
-		brelse(de_bh);
-	if (fe_bh)
-		brelse(fe_bh);
-	if (parent_fe_bh)
-		brelse(parent_fe_bh);
+	brelse(de_bh);
+	brelse(fe_bh);
+	brelse(parent_fe_bh);
 
 	mlog_exit(err);
 
@@ -852,17 +844,10 @@ leave:
 		iput(orphan_dir);
 	}
 
-	if (fe_bh)
-		brelse(fe_bh);
-
-	if (dirent_bh)
-		brelse(dirent_bh);
-
-	if (parent_node_bh)
-		brelse(parent_node_bh);
-
-	if (orphan_entry_bh)
-		brelse(orphan_entry_bh);
+	brelse(fe_bh);
+	brelse(dirent_bh);
+	brelse(parent_node_bh);
+	brelse(orphan_entry_bh);
 
 	mlog_exit(status);
 
@@ -1373,24 +1358,15 @@ bail:
 
 	if (new_inode)
 		iput(new_inode);
-	if (newfe_bh)
-		brelse(newfe_bh);
-	if (old_inode_bh)
-		brelse(old_inode_bh);
-	if (old_dir_bh)
-		brelse(old_dir_bh);
-	if (new_dir_bh)
-		brelse(new_dir_bh);
-	if (new_de_bh)
-		brelse(new_de_bh);
-	if (old_de_bh)
-		brelse(old_de_bh);
-	if (old_inode_de_bh)
-		brelse(old_inode_de_bh);
-	if (orphan_entry_bh)
-		brelse(orphan_entry_bh);
-	if (insert_entry_bh)
-		brelse(insert_entry_bh);
+	brelse(newfe_bh);
+	brelse(old_inode_bh);
+	brelse(old_dir_bh);
+	brelse(new_dir_bh);
+	brelse(new_de_bh);
+	brelse(old_de_bh);
+	brelse(old_inode_de_bh);
+	brelse(orphan_entry_bh);
+	brelse(insert_entry_bh);
 
 	mlog_exit(status);
 
@@ -1493,8 +1469,7 @@ bail:
 
 	if (bhs) {
 		for(i = 0; i < blocks; i++)
-			if (bhs[i])
-				brelse(bhs[i]);
+			brelse(bhs[i]);
 		kfree(bhs);
 	}
 
@@ -1660,12 +1635,9 @@ bail:
 
 	ocfs2_inode_unlock(dir, 1);
 
-	if (new_fe_bh)
-		brelse(new_fe_bh);
-	if (parent_fe_bh)
-		brelse(parent_fe_bh);
-	if (de_bh)
-		brelse(de_bh);
+	brelse(new_fe_bh);
+	brelse(parent_fe_bh);
+	brelse(de_bh);
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
 	if (data_ac)
@@ -1760,8 +1732,7 @@ leave:
 		iput(orphan_dir_inode);
 	}
 
-	if (orphan_dir_bh)
-		brelse(orphan_dir_bh);
+	brelse(orphan_dir_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1830,8 +1801,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
 
 leave:
-	if (orphan_dir_bh)
-		brelse(orphan_dir_bh);
+	brelse(orphan_dir_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1899,8 +1869,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 	}
 
 leave:
-	if (target_de_bh)
-		brelse(target_de_bh);
+	brelse(target_de_bh);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d7a6f928c31..08d8844a3c2 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -130,10 +130,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 		iput(inode);
 		ac->ac_inode = NULL;
 	}
-	if (ac->ac_bh) {
-		brelse(ac->ac_bh);
-		ac->ac_bh = NULL;
-	}
+	brelse(ac->ac_bh);
+	ac->ac_bh = NULL;
 }
 
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -401,8 +399,7 @@ bail:
 	if (ac)
 		ocfs2_free_alloc_context(ac);
 
-	if (bg_bh)
-		brelse(bg_bh);
+	brelse(bg_bh);
 
 	mlog_exit(status);
 	return status;
@@ -494,8 +491,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 	get_bh(bh);
 	ac->ac_bh = bh;
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
@@ -1269,10 +1265,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 					     &tmp_bits)) == -ENOSPC) {
 		if (!bg->bg_next_group)
 			break;
-		if (prev_group_bh) {
-			brelse(prev_group_bh);
-			prev_group_bh = NULL;
-		}
+
+		brelse(prev_group_bh);
+		prev_group_bh = NULL;
+
 		next_group = le64_to_cpu(bg->bg_next_group);
 		prev_group_bh = group_bh;
 		group_bh = NULL;
@@ -1367,10 +1363,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	*bg_blkno = le64_to_cpu(bg->bg_blkno);
 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
 bail:
-	if (group_bh)
-		brelse(group_bh);
-	if (prev_group_bh)
-		brelse(prev_group_bh);
+	brelse(group_bh);
+	brelse(prev_group_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1844,8 +1838,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 	}
 
 bail:
-	if (group_bh)
-		brelse(group_bh);
+	brelse(group_bh);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index d2027cec8f3..304b63ac78c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -762,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	return status;
 
 read_super_error:
-	if (bh != NULL)
-		brelse(bh);
+	brelse(bh);
 
 	if (inode)
 		iput(inode);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 8c5879c7f84..c6c94b55774 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -158,8 +158,7 @@ bail:
 		kunmap(page);
 		page_cache_release(page);
 	}
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	return ERR_PTR(status);
 }
-- 
cgit v1.2.3


From 40daa16a3441abe822bfcc748150116a77aee2ea Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 7 Oct 2008 14:31:42 -0700
Subject: ocfs2: Uninline ocfs2_xattr_name_hash()

This is too big to be inlined.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 64700c3fc24..e21a1a8b425 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -144,11 +144,11 @@ static inline struct xattr_handler *ocfs2_xattr_handler(int name_index)
 	return handler;
 }
 
-static inline u32 ocfs2_xattr_name_hash(struct inode *inode,
-					char *prefix,
-					int prefix_len,
-					char *name,
-					int name_len)
+static u32 ocfs2_xattr_name_hash(struct inode *inode,
+				 char *prefix,
+				 int prefix_len,
+				 char *name,
+				 int name_len)
 {
 	/* Get hash value of uuid from super block */
 	u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash;
-- 
cgit v1.2.3


From 99219aea68b5bff4f182858372b43181ad3bdb34 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 7 Oct 2008 14:52:59 -0700
Subject: ocfs2: Move trusted and user attribute support into xattr.c

Per Christoph Hellwig's suggestion - don't split these up. It's not like we
gained much by having the two tiny files around.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/Makefile        |   4 +-
 fs/ocfs2/xattr.c         | 110 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/xattr_trusted.c |  82 -----------------------------------
 fs/ocfs2/xattr_user.c    |  94 ----------------------------------------
 4 files changed, 111 insertions(+), 179 deletions(-)
 delete mode 100644 fs/ocfs2/xattr_trusted.c
 delete mode 100644 fs/ocfs2/xattr_user.c

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 21323da4085..589dcdfdfe3 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -35,9 +35,7 @@ ocfs2-objs := \
 	sysfile.o 		\
 	uptodate.o		\
 	ver.o			\
-	xattr.o			\
-	xattr_user.o		\
-	xattr_trusted.o
+	xattr.o
 
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e21a1a8b425..0f556b00235 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -37,6 +37,9 @@
 #include <linux/writeback.h>
 #include <linux/falloc.h>
 #include <linux/sort.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
 
 #define MLOG_MASK_PREFIX ML_XATTR
 #include <cluster/masklog.h>
@@ -4740,3 +4743,110 @@ static int ocfs2_delete_xattr_index_block(struct inode *inode,
 out:
 	return ret;
 }
+
+/*
+ * 'trusted' attributes support
+ */
+
+#define XATTR_TRUSTED_PREFIX "trusted."
+
+static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
+				       size_t list_size, const char *name,
+				       size_t name_len)
+{
+	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
+				   void *buffer, size_t size)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
+			       buffer, size);
+}
+
+static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
+				   const void *value, size_t size, int flags)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
+			       size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= ocfs2_xattr_trusted_list,
+	.get	= ocfs2_xattr_trusted_get,
+	.set	= ocfs2_xattr_trusted_set,
+};
+
+
+/*
+ * 'user' attributes support
+ */
+
+#define XATTR_USER_PREFIX "user."
+
+static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
+				    size_t list_size, const char *name,
+				    size_t name_len)
+{
+	const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
+	const size_t total_len = prefix_len + name_len + 1;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_USER_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
+				void *buffer, size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return -EOPNOTSUPP;
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
+			       buffer, size);
+}
+
+static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
+				const void *value, size_t size, int flags)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return -EOPNOTSUPP;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
+			       size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= ocfs2_xattr_user_list,
+	.get	= ocfs2_xattr_user_get,
+	.set	= ocfs2_xattr_user_set,
+};
diff --git a/fs/ocfs2/xattr_trusted.c b/fs/ocfs2/xattr_trusted.c
deleted file mode 100644
index 4c589c447aa..00000000000
--- a/fs/ocfs2/xattr_trusted.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * xattr_trusted.c
- *
- * Copyright (C) 2008 Oracle.  All rights reserved.
- *
- * CREDITS:
- * Lots of code in this file is taken from ext3.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#define MLOG_MASK_PREFIX ML_INODE
-#include <cluster/masklog.h>
-
-#include "ocfs2.h"
-#include "alloc.h"
-#include "dlmglue.h"
-#include "file.h"
-#include "ocfs2_fs.h"
-#include "xattr.h"
-
-#define XATTR_TRUSTED_PREFIX "trusted."
-
-static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
-				       size_t list_size, const char *name,
-				       size_t name_len)
-{
-	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
-	const size_t total_len = prefix_len + name_len + 1;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
-		memcpy(list + prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
-}
-
-static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
-				   void *buffer, size_t size)
-{
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
-			       buffer, size);
-}
-
-static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
-				   const void *value, size_t size, int flags)
-{
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-
-	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
-			       size, flags);
-}
-
-struct xattr_handler ocfs2_xattr_trusted_handler = {
-	.prefix	= XATTR_TRUSTED_PREFIX,
-	.list	= ocfs2_xattr_trusted_list,
-	.get	= ocfs2_xattr_trusted_get,
-	.set	= ocfs2_xattr_trusted_set,
-};
diff --git a/fs/ocfs2/xattr_user.c b/fs/ocfs2/xattr_user.c
deleted file mode 100644
index 93ba7163778..00000000000
--- a/fs/ocfs2/xattr_user.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * xattr_user.c
- *
- * Copyright (C) 2008 Oracle.  All rights reserved.
- *
- * CREDITS:
- * Lots of code in this file is taken from ext3.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#define MLOG_MASK_PREFIX ML_INODE
-#include <cluster/masklog.h>
-
-#include "ocfs2.h"
-#include "alloc.h"
-#include "dlmglue.h"
-#include "file.h"
-#include "ocfs2_fs.h"
-#include "xattr.h"
-
-#define XATTR_USER_PREFIX "user."
-
-static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
-				    size_t list_size, const char *name,
-				    size_t name_len)
-{
-	const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
-	const size_t total_len = prefix_len + name_len + 1;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
-	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
-		return 0;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_USER_PREFIX, prefix_len);
-		memcpy(list + prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
-}
-
-static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
-				void *buffer, size_t size)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
-		return -EOPNOTSUPP;
-	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
-			       buffer, size);
-}
-
-static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
-				const void *value, size_t size, int flags)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
-		return -EOPNOTSUPP;
-
-	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
-			       size, flags);
-}
-
-struct xattr_handler ocfs2_xattr_user_handler = {
-	.prefix	= XATTR_USER_PREFIX,
-	.list	= ocfs2_xattr_user_list,
-	.get	= ocfs2_xattr_user_get,
-	.set	= ocfs2_xattr_user_set,
-};
-- 
cgit v1.2.3


From 2057e5c6780d86939a199031cdbafb81e6f88aac Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 9 Oct 2008 23:06:13 +0800
Subject: ocfs2: Calculate EA hash only by its suffix.

Following Christoph Hellwig's advice, calculate the hash value of an EA
from its name suffix only; the namespace prefix is no longer hashed.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 35 +++++------------------------------
 1 file changed, 5 insertions(+), 30 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 0f556b00235..092a1231898 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -148,21 +148,13 @@ static inline struct xattr_handler *ocfs2_xattr_handler(int name_index)
 }
 
 static u32 ocfs2_xattr_name_hash(struct inode *inode,
-				 char *prefix,
-				 int prefix_len,
-				 char *name,
+				 const char *name,
 				 int name_len)
 {
 	/* Get hash value of uuid from super block */
 	u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash;
 	int i;
 
-	/* hash extended attribute prefix */
-	for (i = 0; i < prefix_len; i++) {
-		hash = (hash << OCFS2_HASH_SHIFT) ^
-		       (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^
-		       *prefix++;
-	}
 	/* hash extended attribute name */
 	for (i = 0; i < name_len; i++) {
 		hash = (hash << OCFS2_HASH_SHIFT) ^
@@ -183,14 +175,9 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
 				   struct ocfs2_xattr_entry *entry)
 {
 	u32 hash = 0;
-	struct xattr_handler *handler =
-			ocfs2_xattr_handler(ocfs2_xattr_get_type(entry));
-	char *prefix = handler->prefix;
 	char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
-	int prefix_len = strlen(handler->prefix);
 
-	hash = ocfs2_xattr_name_hash(inode, prefix, prefix_len, name,
-				     entry->xe_name_len);
+	hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
 	entry->xe_name_hash = cpu_to_le32(hash);
 
 	return;
@@ -2093,18 +2080,6 @@ cleanup:
 	return ret;
 }
 
-static inline u32 ocfs2_xattr_hash_by_name(struct inode *inode,
-					   int name_index,
-					   const char *suffix_name)
-{
-	struct xattr_handler *handler = ocfs2_xattr_handler(name_index);
-	char *prefix = handler->prefix;
-	int prefix_len = strlen(handler->prefix);
-
-	return ocfs2_xattr_name_hash(inode, prefix, prefix_len,
-				     (char *)suffix_name, strlen(suffix_name));
-}
-
 /*
  * Find the xattr extent rec which may contains name_hash.
  * e_cpos will be the first name hash of the xattr rec.
@@ -2395,7 +2370,7 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
 	struct ocfs2_extent_list *el = &xb_root->xt_list;
 	u64 p_blkno = 0;
 	u32 first_hash, num_clusters = 0;
-	u32 name_hash = ocfs2_xattr_hash_by_name(inode, name_index, name);
+	u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
 
 	if (le16_to_cpu(el->l_next_free_rec) == 0)
 		return -ENODATA;
@@ -4435,8 +4410,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 	size_t value_len;
 	char *val = (char *)xi->value;
 	struct ocfs2_xattr_entry *xe = xs->here;
-	u32 name_hash = ocfs2_xattr_hash_by_name(inode,
-						 xi->name_index, xi->name);
+	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
+					      strlen(xi->name));
 
 	if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
 		/*
-- 
cgit v1.2.3


From 936b8834366ec05f2a6993f73afd8348cac9718e Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 9 Oct 2008 23:06:14 +0800
Subject: ocfs2: Refactor xattr list and remove ocfs2_xattr_handler().

Following Christoph Hellwig's advice, we really don't need a ->list
method to handle listing a single xattr; a map from index to xattr
prefix is enough. I also refactored the old list method, using
fs/xfs/linux-2.6/xfs_xattr.c and the xattr list method in btrfs
as references.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 95 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 60 insertions(+), 35 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 092a1231898..8f522f2f84a 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -137,14 +137,14 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 static int ocfs2_delete_xattr_index_block(struct inode *inode,
 					  struct buffer_head *xb_bh);
 
-static inline struct xattr_handler *ocfs2_xattr_handler(int name_index)
+static inline const char *ocfs2_xattr_prefix(int name_index)
 {
 	struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
 		handler = ocfs2_xattr_handler_map[name_index];
 
-	return handler;
+	return handler ? handler->prefix : NULL;
 }
 
 static u32 ocfs2_xattr_name_hash(struct inode *inode,
@@ -452,33 +452,56 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
 	return ret;
 }
 
+static int ocfs2_xattr_list_entry(char *buffer, size_t size,
+				  size_t *result, const char *prefix,
+				  const char *name, int name_len)
+{
+	char *p = buffer + *result;
+	int prefix_len = strlen(prefix);
+	int total_len = prefix_len + name_len + 1;
+
+	*result += total_len;
+
+	/* we are just looking for how big our buffer needs to be */
+	if (!size)
+		return 0;
+
+	if (*result > size)
+		return -ERANGE;
+
+	memcpy(p, prefix, prefix_len);
+	memcpy(p + prefix_len, name, name_len);
+	p[prefix_len + name_len] = '\0';
+
+	return 0;
+}
+
 static int ocfs2_xattr_list_entries(struct inode *inode,
 				    struct ocfs2_xattr_header *header,
 				    char *buffer, size_t buffer_size)
 {
-	size_t rest = buffer_size;
-	int i;
+	size_t result = 0;
+	int i, type, ret;
+	const char *prefix, *name;
 
 	for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
 		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
-		struct xattr_handler *handler =
-			ocfs2_xattr_handler(ocfs2_xattr_get_type(entry));
-
-		if (handler) {
-			size_t size = handler->list(inode, buffer, rest,
-					((char *)header +
-					le16_to_cpu(entry->xe_name_offset)),
-					entry->xe_name_len);
-			if (buffer) {
-				if (size > rest)
-					return -ERANGE;
-				buffer += size;
-			}
-			rest -= size;
+		type = ocfs2_xattr_get_type(entry);
+		prefix = ocfs2_xattr_prefix(type);
+
+		if (prefix) {
+			name = (const char *)header +
+				le16_to_cpu(entry->xe_name_offset);
+
+			ret = ocfs2_xattr_list_entry(buffer, buffer_size,
+						     &result, prefix, name,
+						     entry->xe_name_len);
+			if (ret)
+				return ret;
 		}
 	}
 
-	return buffer_size - rest;
+	return result;
 }
 
 static int ocfs2_xattr_ibody_list(struct inode *inode,
@@ -2456,6 +2479,7 @@ out:
 struct ocfs2_xattr_tree_list {
 	char *buffer;
 	size_t buffer_size;
+	size_t result;
 };
 
 static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
@@ -2481,17 +2505,17 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
 				   struct ocfs2_xattr_bucket *bucket,
 				   void *para)
 {
-	int ret = 0;
+	int ret = 0, type;
 	struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
-	size_t size;
 	int i, block_off, new_offset;
+	const char *prefix, *name;
 
 	for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
 		struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
-		struct xattr_handler *handler =
-			ocfs2_xattr_handler(ocfs2_xattr_get_type(entry));
+		type = ocfs2_xattr_get_type(entry);
+		prefix = ocfs2_xattr_prefix(type);
 
-		if (handler) {
+		if (prefix) {
 			ret = ocfs2_xattr_bucket_get_name_value(inode,
 								bucket->xh,
 								i,
@@ -2499,16 +2523,16 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
 								&new_offset);
 			if (ret)
 				break;
-			size = handler->list(inode, xl->buffer, xl->buffer_size,
-					     bucket->bhs[block_off]->b_data +
-					     new_offset,
-					     entry->xe_name_len);
-			if (xl->buffer) {
-				if (size > xl->buffer_size)
-					return -ERANGE;
-				xl->buffer += size;
-			}
-			xl->buffer_size -= size;
+
+			name = (const char *)bucket->bhs[block_off]->b_data +
+				new_offset;
+			ret = ocfs2_xattr_list_entry(xl->buffer,
+						     xl->buffer_size,
+						     &xl->result,
+						     prefix, name,
+						     entry->xe_name_len);
+			if (ret)
+				break;
 		}
 	}
 
@@ -2527,6 +2551,7 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
 	struct ocfs2_xattr_tree_list xl = {
 		.buffer = buffer,
 		.buffer_size = buffer_size,
+		.result = 0,
 	};
 
 	if (le16_to_cpu(el->l_next_free_rec) == 0)
@@ -2554,7 +2579,7 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
 		name_hash = e_cpos - 1;
 	}
 
-	ret = buffer_size - xl.buffer_size;
+	ret = xl.result;
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From da1e90985a0e767e44397c9db0937e236033fa58 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 9 Oct 2008 17:20:29 -0700
Subject: ocfs2: Separate out sync reads from ocfs2_read_blocks()

The ocfs2_read_blocks() function currently handles sync reads, cached
reads, and sometimes cached reads.  We're going to add some
functionality to it, so first we should simplify it.  The uncached,
synchronous reads are much easier to handle as a separate function, so we
introduce ocfs2_read_blocks_sync().

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/buffer_head_io.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/buffer_head_io.h |  2 ++
 fs/ocfs2/inode.c          |  7 ++--
 fs/ocfs2/journal.c        |  5 ++-
 fs/ocfs2/resize.c         |  8 ++---
 5 files changed, 96 insertions(+), 10 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f136639f5b4..ca4ab7ce85b 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 	/* remove from dirty list before I/O. */
 	clear_buffer_dirty(bh);
 
-	get_bh(bh); /* for end_buffer_write_sync() */                   
+	get_bh(bh); /* for end_buffer_write_sync() */
 	bh->b_end_io = end_buffer_write_sync;
 	submit_bh(WRITE, bh);
 
@@ -88,6 +88,88 @@ out:
 	return ret;
 }
 
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+			   unsigned int nr, struct buffer_head *bhs[])
+{
+	int status = 0;
+	unsigned int i;
+	struct buffer_head *bh;
+
+	if (!nr) {
+		mlog(ML_BH_IO, "No buffers will be read!\n");
+		goto bail;
+	}
+
+	for (i = 0 ; i < nr ; i++) {
+		if (bhs[i] == NULL) {
+			bhs[i] = sb_getblk(osb->sb, block++);
+			if (bhs[i] == NULL) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		bh = bhs[i];
+
+		if (buffer_jbd(bh)) {
+			mlog(ML_ERROR,
+			     "trying to sync read a jbd "
+			     "managed bh (blocknr = %llu), skipping\n",
+			     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		if (buffer_dirty(bh)) {
+			/* This should probably be a BUG, or
+			 * at least return an error. */
+			mlog(ML_ERROR,
+			     "trying to sync read a dirty "
+			     "buffer! (blocknr = %llu), skipping\n",
+			     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		lock_buffer(bh);
+		if (buffer_jbd(bh)) {
+			mlog(ML_ERROR,
+			     "block %llu had the JBD bit set "
+			     "while I was in lock_buffer!",
+			     (unsigned long long)bh->b_blocknr);
+			BUG();
+		}
+
+		clear_buffer_uptodate(bh);
+		get_bh(bh); /* for end_buffer_read_sync() */
+		bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ, bh);
+	}
+
+	for (i = nr; i > 0; i--) {
+		bh = bhs[i - 1];
+
+		if (buffer_jbd(bh)) {
+			mlog(ML_ERROR,
+			     "the journal got the buffer while it was "
+			     "locked for io! (blocknr = %llu)\n",
+			     (unsigned long long)bh->b_blocknr);
+			BUG();
+		}
+
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			/* Status won't be cleared from here on out,
+			 * so we can safely record this and loop back
+			 * to cleanup the other buffers. */
+			status = -EIO;
+			put_bh(bh);
+			bhs[i - 1] = NULL;
+		}
+	}
+
+bail:
+	return status;
+}
+
 int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 		      struct buffer_head *bhs[], int flags,
 		      struct inode *inode)
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c2e78614c3e..71646b470ac 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -46,6 +46,8 @@ int ocfs2_read_blocks(struct ocfs2_super          *osb,
 		      struct buffer_head  *bhs[],
 		      int                  flags,
 		      struct inode        *inode);
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+			   unsigned int nr, struct buffer_head *bhs[]);
 
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 				struct buffer_head *bh);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 05ad1186a16..52229703394 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -460,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 		}
 	}
 
-	status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
-				  can_lock ? inode : NULL);
+	if (can_lock)
+		status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
+					  inode);
+	else
+		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 562ba652593..10c51b562be 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -850,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
 
 		/* We are reading journal data which should not
 		 * be put in the uptodate cache */
-		status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
-					   p_blkno, p_blocks, bhs, 0,
-					   NULL);
+		status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
+						p_blkno, p_blocks, bhs);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 8166968e901..472d854796c 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
 		if (cluster > clusters)
 			break;
 
-		ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
+		ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
 		if (ret < 0) {
 			mlog_errno(ret);
 			break;
@@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode,
 	 * update the superblock last.
 	 * It doesn't matter if the write failed.
 	 */
-	ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
-			       &super_bh, 0, NULL);
+	ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1,
+				     &super_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -540,7 +540,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_unlock;
 	}
 
-	ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
+	ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh);
 	if (ret < 0) {
 		mlog(ML_ERROR, "Can't read the group descriptor # %llu "
 		     "from the device.", (unsigned long long)input->group);
-- 
cgit v1.2.3


From 31d33073ca38603dea705dae45e094a64ca062d6 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 9 Oct 2008 17:20:30 -0700
Subject: ocfs2: Require an inode for ocfs2_read_block(s)().

Now that synchronous readers are using ocfs2_read_blocks_sync(), all
callers of ocfs2_read_blocks() are passing an inode.  Use it
unconditionally.  Since it's there, we don't need to pass the
ocfs2_super either.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c          | 30 +++++++++----------
 fs/ocfs2/aops.c           | 10 +++----
 fs/ocfs2/buffer_head_io.c | 35 ++++++++--------------
 fs/ocfs2/buffer_head_io.h | 18 +++++-------
 fs/ocfs2/dir.c            | 12 ++++----
 fs/ocfs2/dlmglue.c        |  9 +++---
 fs/ocfs2/extent_map.c     | 12 ++++----
 fs/ocfs2/file.c           | 12 ++++----
 fs/ocfs2/inode.c          |  6 ++--
 fs/ocfs2/journal.c        |  2 +-
 fs/ocfs2/localalloc.c     |  8 ++---
 fs/ocfs2/namei.c          |  5 ++--
 fs/ocfs2/resize.c         |  4 +--
 fs/ocfs2/slot_map.c       |  5 ++--
 fs/ocfs2/suballoc.c       | 17 +++++------
 fs/ocfs2/symlink.c        |  5 ++--
 fs/ocfs2/xattr.c          | 74 ++++++++++++++++++++++-------------------------
 17 files changed, 116 insertions(+), 148 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 052c4cf7db9..a164e09491f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -705,8 +705,8 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
 
 	if (last_eb_blk) {
-		retval = ocfs2_read_block(osb, last_eb_blk,
-					  &eb_bh, OCFS2_BH_CACHED, inode);
+		retval = ocfs2_read_block(inode, last_eb_blk,
+					  &eb_bh, OCFS2_BH_CACHED);
 		if (retval < 0) {
 			mlog_errno(retval);
 			goto bail;
@@ -1176,8 +1176,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 		brelse(bh);
 		bh = NULL;
 
-		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
-					  inode);
+		status = ocfs2_read_block(inode, blkno, &bh, OCFS2_BH_CACHED);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1541,8 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode,
 
 		brelse(bh);
 		bh = NULL;
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
-				       &bh, OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, blkno, &bh, OCFS2_BH_CACHED);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4296,9 +4294,9 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
 		 * may want it later.
 		 */
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+		ret = ocfs2_read_block(inode,
 				       ocfs2_et_get_last_eb_blk(et), &bh,
-				       OCFS2_BH_CACHED, inode);
+				       OCFS2_BH_CACHED);
 		if (ret) {
 			mlog_exit(ret);
 			goto out;
@@ -4764,9 +4762,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	if (path->p_tree_depth) {
 		struct ocfs2_extent_block *eb;
 
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+		ret = ocfs2_read_block(inode,
 				       ocfs2_et_get_last_eb_blk(et),
-				       &last_eb_bh, OCFS2_BH_CACHED, inode);
+				       &last_eb_bh, OCFS2_BH_CACHED);
 		if (ret) {
 			mlog_exit(ret);
 			goto out;
@@ -4923,9 +4921,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
 
 	depth = path->p_tree_depth;
 	if (depth > 0) {
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+		ret = ocfs2_read_block(inode,
 				       ocfs2_et_get_last_eb_blk(et),
-				       &last_eb_bh, OCFS2_BH_CACHED, inode);
+				       &last_eb_bh, OCFS2_BH_CACHED);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -5592,8 +5590,8 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
-				  OCFS2_BH_CACHED, inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh,
+				  OCFS2_BH_CACHED);
 	if (status < 0) {
 		iput(inode);
 		mlog_errno(status);
@@ -6991,8 +6989,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
 
 	if (fe->id2.i_list.l_tree_depth) {
-		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
-					  &last_eb_bh, OCFS2_BH_CACHED, inode);
+		status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
+					  &last_eb_bh, OCFS2_BH_CACHED);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 98e16fb49e4..f232a0e3c30 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -68,9 +68,8 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				  OCFS2_I(inode)->ip_blkno,
-				  &bh, OCFS2_BH_CACHED, inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
+				  &bh, OCFS2_BH_CACHED);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -260,13 +259,12 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
 {
 	int ret;
 	struct buffer_head *di_bh = NULL;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
 
-	ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
-			       OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh,
+			       OCFS2_BH_CACHED);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index ca4ab7ce85b..718dbe5607c 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -170,22 +170,20 @@ bail:
 	return status;
 }
 
-int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
-		      struct buffer_head *bhs[], int flags,
-		      struct inode *inode)
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags)
 {
 	int status = 0;
-	struct super_block *sb;
 	int i, ignore_cache = 0;
 	struct buffer_head *bh;
 
-	mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
-		   (unsigned long long)block, nr, flags, inode);
+	mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n",
+		   inode, (unsigned long long)block, nr, flags);
 
-	BUG_ON((flags & OCFS2_BH_READAHEAD) &&
-	       (!inode || !(flags & OCFS2_BH_CACHED)));
+	BUG_ON(!inode);
+	BUG_ON((flags & OCFS2_BH_READAHEAD) && !(flags & OCFS2_BH_CACHED));
 
-	if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+	if (bhs == NULL) {
 		status = -EINVAL;
 		mlog_errno(status);
 		goto bail;
@@ -204,19 +202,12 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 		goto bail;
 	}
 
-	sb = osb->sb;
-
-	if (flags & OCFS2_BH_CACHED && !inode)
-		flags &= ~OCFS2_BH_CACHED;
-
-	if (inode)
-		mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+	mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
 	for (i = 0 ; i < nr ; i++) {
 		if (bhs[i] == NULL) {
-			bhs[i] = sb_getblk(sb, block++);
+			bhs[i] = sb_getblk(inode->i_sb, block++);
 			if (bhs[i] == NULL) {
-				if (inode)
-					mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+				mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 				status = -EIO;
 				mlog_errno(status);
 				goto bail;
@@ -347,11 +338,9 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 		/* Always set the buffer in the cache, even if it was
 		 * a forced read, or read-ahead which hasn't yet
 		 * completed. */
-		if (inode)
-			ocfs2_set_buffer_uptodate(inode, bh);
+		ocfs2_set_buffer_uptodate(inode, bh);
 	}
-	if (inode)
-		mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 
 	mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 
 	     (unsigned long long)block, nr,
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 71646b470ac..fd0d774ac35 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,21 +31,19 @@
 void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
 			     int uptodate);
 
-static inline int ocfs2_read_block(struct ocfs2_super          *osb,
+static inline int ocfs2_read_block(struct inode	       *inode,
 				   u64                  off,
 				   struct buffer_head **bh,
-				   int                  flags,
-				   struct inode        *inode);
+				   int                  flags);
 
 int ocfs2_write_block(struct ocfs2_super          *osb,
 		      struct buffer_head  *bh,
 		      struct inode        *inode);
-int ocfs2_read_blocks(struct ocfs2_super          *osb,
+int ocfs2_read_blocks(struct inode	  *inode,
 		      u64                  block,
 		      int                  nr,
 		      struct buffer_head  *bhs[],
-		      int                  flags,
-		      struct inode        *inode);
+		      int                  flags);
 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 			   unsigned int nr, struct buffer_head *bhs[]);
 
@@ -55,9 +53,8 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 #define OCFS2_BH_CACHED            1
 #define OCFS2_BH_READAHEAD         8
 
-static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
-				   struct buffer_head **bh, int flags,
-				   struct inode *inode)
+static inline int ocfs2_read_block(struct inode *inode, u64 off,
+				   struct buffer_head **bh, int flags)
 {
 	int status = 0;
 
@@ -67,8 +64,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
 		goto bail;
 	}
 
-	status = ocfs2_read_blocks(osb, off, 1, bh,
-				   flags, inode);
+	status = ocfs2_read_blocks(inode, off, 1, bh, flags);
 
 bail:
 	return status;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 3614651dcdb..828437ca91b 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -188,8 +188,8 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 
-	ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, dir);
+	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh,
+			       OCFS2_BH_CACHED);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -417,8 +417,8 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 
-	ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, dir);
+	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -596,8 +596,8 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
 	struct ocfs2_inline_data *data;
 	struct ocfs2_dir_entry *de;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED);
 	if (ret) {
 		mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index eae3d643a5e..3b2cd0f8721 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2024,8 +2024,8 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 	} else {
 		/* Boo, we have to go to disk. */
 		/* read bh, cast, ocfs2_refresh_inode */
-		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
-					  bh, OCFS2_BH_CACHED, inode);
+		status = ocfs2_read_block(inode, oi->ip_blkno,
+					  bh, OCFS2_BH_CACHED);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail_refresh;
@@ -2086,11 +2086,10 @@ static int ocfs2_assign_bh(struct inode *inode,
 		return 0;
 	}
 
-	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+	status = ocfs2_read_block(inode,
 				  OCFS2_I(inode)->ip_blkno,
 				  ret_bh,
-				  OCFS2_BH_CACHED,
-				  inode);
+				  OCFS2_BH_CACHED);
 	if (status < 0)
 		mlog_errno(status);
 
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index a7b1cfa735b..5b482214bb7 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,8 +293,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), last_eb_blk,
-			       &eb_bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, last_eb_blk,
+			       &eb_bh, OCFS2_BH_CACHED);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -382,9 +382,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
 		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
 			goto no_more_extents;
 
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+		ret = ocfs2_read_block(inode,
 				       le64_to_cpu(eb->h_next_leaf_blk),
-				       &next_eb_bh, OCFS2_BH_CACHED, inode);
+				       &next_eb_bh, OCFS2_BH_CACHED);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -631,8 +631,8 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	if (ret == 0)
 		goto out;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 408d5a66591..7a809be54e8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -545,8 +545,8 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	 */
 	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
-				  OCFS2_BH_CACHED, inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh,
+				  OCFS2_BH_CACHED);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1132,8 +1132,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 	struct buffer_head *bh = NULL;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-			       oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, oi->ip_blkno, &bh, OCFS2_BH_CACHED);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1159,9 +1158,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
 	struct buffer_head *di_bh = NULL;
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       OCFS2_I(inode)->ip_blkno, &di_bh,
-				       OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
+				       &di_bh, OCFS2_BH_CACHED);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 52229703394..6ec31b92a47 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -461,8 +461,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	}
 
 	if (can_lock)
-		status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
-					  inode);
+		status = ocfs2_read_block(inode, args->fi_blkno, &bh, 0);
 	else
 		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
 	if (status < 0) {
@@ -1166,8 +1165,7 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
 		goto fail;
 	}
 
-	tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
-				  readflags, inode);
+	tmperr = ocfs2_read_block(inode, p_blkno, &bh, readflags);
 	if (tmperr < 0)
 		goto fail;
 
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 10c51b562be..9854fb7315b 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1134,7 +1134,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
 	}
 	SET_INODE_JOURNAL(inode);
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh, 0);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 1c4f0645fb3..b77b67bb277 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -248,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
-				  &alloc_bh, 0, inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
+				  &alloc_bh, 0);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -459,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 
 	mutex_lock(&inode->i_mutex);
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
-				  &alloc_bh, 0, inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
+				  &alloc_bh, 0);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 7d0dd5c95eb..e5fc9345dd3 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1752,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 
-	status = ocfs2_read_block(osb,
+	status = ocfs2_read_block(orphan_dir_inode,
 				  OCFS2_I(orphan_dir_inode)->ip_blkno,
-				  &orphan_dir_bh, OCFS2_BH_CACHED,
-				  orphan_dir_inode);
+				  &orphan_dir_bh, OCFS2_BH_CACHED);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 472d854796c..92dcd935056 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -332,8 +332,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
 					      first_new_cluster - 1);
 
-	ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
-			       main_bm_inode);
+	ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh,
+			       OCFS2_BH_CACHED);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bb5ff8939bf..82d986bff7f 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,8 +150,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 	 * be !NULL.  Thus, ocfs2_read_blocks() will ignore blocknr.  If
 	 * this is not true, the read of -1 (UINT64_MAX) will fail.
 	 */
-	ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
-				si->si_inode);
+	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, 0);
 	if (ret == 0) {
 		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
@@ -404,7 +403,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 		     (unsigned long long)blkno);
 
 		bh = NULL;  /* Acquire a fresh bh */
-		status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
+		status = ocfs2_read_block(si->si_inode, blkno, &bh, 0);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 08d8844a3c2..f0056b7d435 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1172,8 +1172,8 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	struct ocfs2_group_desc *gd;
 	struct inode *alloc_inode = ac->ac_inode;
 
-	ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
-			       &group_bh, OCFS2_BH_CACHED, alloc_inode);
+	ret = ocfs2_read_block(alloc_inode, gd_blkno,
+			       &group_bh, OCFS2_BH_CACHED);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -1242,9 +1242,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	     bits_wanted, chain,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
 
-	status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+	status = ocfs2_read_block(alloc_inode,
 				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
-				  &group_bh, OCFS2_BH_CACHED, alloc_inode);
+				  &group_bh, OCFS2_BH_CACHED);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1272,9 +1272,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 		next_group = le64_to_cpu(bg->bg_next_group);
 		prev_group_bh = group_bh;
 		group_bh = NULL;
-		status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+		status = ocfs2_read_block(alloc_inode,
 					  next_group, &group_bh,
-					  OCFS2_BH_CACHED, alloc_inode);
+					  OCFS2_BH_CACHED);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1777,7 +1777,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 {
 	int status = 0;
 	u32 tmp_used;
-	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
 	struct buffer_head *group_bh = NULL;
@@ -1796,8 +1795,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
 	     (unsigned long long)bg_blkno, start_bit);
 
-	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
-				  alloc_inode);
+	status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh,
+				  OCFS2_BH_CACHED);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index c6c94b55774..8788dc26316 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -84,11 +84,10 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
 
 	mlog_entry_void();
 
-	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+	status = ocfs2_read_block(inode,
 				  OCFS2_I(inode)->ip_blkno,
 				  bh,
-				  OCFS2_BH_CACHED,
-				  inode);
+				  OCFS2_BH_CACHED);
 	if (status < 0) {
 		mlog_errno(status);
 		link = ERR_PTR(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 8f522f2f84a..63037bd7892 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -537,9 +537,9 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 	if (!di->i_xattr_loc)
 		return ret;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+	ret = ocfs2_read_block(inode,
 			       le64_to_cpu(di->i_xattr_loc),
-			       &blk_bh, OCFS2_BH_CACHED, inode);
+			       &blk_bh, OCFS2_BH_CACHED);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -672,8 +672,8 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
 		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 		/* Copy ocfs2_xattr_value */
 		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
-			ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
-					       &bh, OCFS2_BH_CACHED, inode);
+			ret = ocfs2_read_block(inode, blkno,
+					       &bh, OCFS2_BH_CACHED);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -764,9 +764,9 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 
 	memset(&xs->bucket, 0, sizeof(xs->bucket));
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+	ret = ocfs2_read_block(inode,
 			       le64_to_cpu(di->i_xattr_loc),
-			       &blk_bh, OCFS2_BH_CACHED, inode);
+			       &blk_bh, OCFS2_BH_CACHED);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -922,8 +922,8 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 
 		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
-			ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
-					       &bh, OCFS2_BH_CACHED, inode);
+			ret = ocfs2_read_block(inode, blkno,
+					       &bh, OCFS2_BH_CACHED);
 			if (ret) {
 				mlog_errno(ret);
 				goto out_commit;
@@ -1514,8 +1514,8 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 	u64 blk, bg_blkno;
 	u16 bit;
 
-	ret = ocfs2_read_block(osb, block, &blk_bh,
-			       OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, block, &blk_bh,
+			       OCFS2_BH_CACHED);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1773,9 +1773,9 @@ static int ocfs2_xattr_block_find(struct inode *inode,
 	if (!di->i_xattr_loc)
 		return ret;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+	ret = ocfs2_read_block(inode,
 			       le64_to_cpu(di->i_xattr_loc),
-			       &blk_bh, OCFS2_BH_CACHED, inode);
+			       &blk_bh, OCFS2_BH_CACHED);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -2216,9 +2216,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
 			break;
 		}
 
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+		ret = ocfs2_read_block(inode,
 				       header_bh->b_blocknr + block_off,
-				       &name_bh, OCFS2_BH_CACHED, inode);
+				       &name_bh, OCFS2_BH_CACHED);
 		if (ret) {
 			mlog_errno(ret);
 			break;
@@ -2269,8 +2269,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 	u32 last_hash;
 	u64 blkno;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno,
-			       &bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, p_blkno, &bh, OCFS2_BH_CACHED);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2286,8 +2285,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 
 		blkno = p_blkno + bucket * blk_per_bucket;
 
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
-				       &bh, OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, blkno, &bh, OCFS2_BH_CACHED);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2359,10 +2357,9 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 		 * If we have found the xattr enty, read all the blocks in
 		 * this bucket.
 		 */
-		ret = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
-					xs->bucket.bhs[0]->b_blocknr + 1,
+		ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
 					blk_per_bucket - 1, &xs->bucket.bhs[1],
-					OCFS2_BH_CACHED, inode);
+					OCFS2_BH_CACHED);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2438,9 +2435,8 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 	     clusters, blkno);
 
 	for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
-		ret = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
-					blkno, blk_per_bucket,
-					bucket.bhs, OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
+					bucket.bhs, OCFS2_BH_CACHED);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2705,10 +2701,10 @@ static int ocfs2_xattr_update_xattr_search(struct inode *inode,
 
 	if (!xs->not_found) {
 		if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
-			ret = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
+			ret = ocfs2_read_blocks(inode,
 					xs->bucket.bhs[0]->b_blocknr + 1,
 					blk_per_bucket - 1, &xs->bucket.bhs[1],
-					OCFS2_BH_CACHED, inode);
+					OCFS2_BH_CACHED);
 			if (ret) {
 				mlog_errno(ret);
 				return ret;
@@ -2913,8 +2909,8 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	if (!bhs)
 		return -ENOMEM;
 
-	ret = ocfs2_read_blocks(osb, blkno, blk_per_bucket, bhs,
-				OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs,
+				OCFS2_BH_CACHED);
 	if (ret)
 		goto out;
 
@@ -3114,8 +3110,8 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 			goto out;
 		}
 
-		ret = ocfs2_read_block(osb, prev_blkno,
-				       &old_bh, OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, prev_blkno,
+				       &old_bh, OCFS2_BH_CACHED);
 		if (ret < 0) {
 			mlog_errno(ret);
 			brelse(new_bh);
@@ -3168,9 +3164,9 @@ static int ocfs2_read_xattr_bucket(struct inode *inode,
 	u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
 	if (!new)
-		return ocfs2_read_blocks(OCFS2_SB(inode->i_sb), blkno,
+		return ocfs2_read_blocks(inode, blkno,
 					 blk_per_bucket, bhs,
-					 OCFS2_BH_CACHED, inode);
+					 OCFS2_BH_CACHED);
 
 	for (i = 0; i < blk_per_bucket; i++) {
 		bhs[i] = sb_getblk(inode->i_sb, blkno + i);
@@ -3485,7 +3481,7 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 	ocfs2_journal_dirty(handle, first_bh);
 
 	/* update the new bucket header. */
-	ret = ocfs2_read_block(osb, to_blk_start, &bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, to_blk_start, &bh, OCFS2_BH_CACHED);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -3872,8 +3868,8 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_read_block(osb, p_blkno,
-			       &first_bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, p_blkno,
+			       &first_bh, OCFS2_BH_CACHED);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4115,10 +4111,10 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 	     (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
 
 	if (!xs->bucket.bhs[1]) {
-		ret = ocfs2_read_blocks(osb,
+		ret = ocfs2_read_blocks(inode,
 					xs->bucket.bhs[0]->b_blocknr + 1,
 					blk_per_bucket - 1, &xs->bucket.bhs[1],
-					OCFS2_BH_CACHED, inode);
+					OCFS2_BH_CACHED);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4224,8 +4220,8 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
 	value_blk += header_bh->b_blocknr;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), value_blk,
-			       &value_bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, value_blk,
+			       &value_bh, OCFS2_BH_CACHED);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
-- 
cgit v1.2.3


From 0fcaa56a2a020dd6f90c202b7084e6f4cbedb6c2 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 9 Oct 2008 17:20:31 -0700
Subject: ocfs2: Simplify ocfs2_read_block()

More than 30 callers of ocfs2_read_block() pass exactly OCFS2_BH_CACHED.
Only six pass a different flag set.  Rather than have every caller care,
let's make ocfs2_read_block() take no flags and always do a cached read.
The remaining six places can call ocfs2_read_blocks() directly.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c          | 25 ++++++++++---------------
 fs/ocfs2/aops.c           |  6 ++----
 fs/ocfs2/buffer_head_io.h |  7 +++----
 fs/ocfs2/dir.c            |  9 +++------
 fs/ocfs2/dlmglue.c        |  8 ++------
 fs/ocfs2/extent_map.c     |  8 +++-----
 fs/ocfs2/file.c           |  7 +++----
 fs/ocfs2/inode.c          |  4 ++--
 fs/ocfs2/journal.c        |  2 +-
 fs/ocfs2/localalloc.c     |  8 ++++----
 fs/ocfs2/namei.c          |  2 +-
 fs/ocfs2/resize.c         |  3 +--
 fs/ocfs2/slot_map.c       |  2 +-
 fs/ocfs2/suballoc.c       | 11 ++++-------
 fs/ocfs2/symlink.c        |  5 +----
 fs/ocfs2/xattr.c          | 42 ++++++++++++++----------------------------
 16 files changed, 55 insertions(+), 94 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a164e09491f..0cc2deb9394 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -706,7 +706,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 
 	if (last_eb_blk) {
 		retval = ocfs2_read_block(inode, last_eb_blk,
-					  &eb_bh, OCFS2_BH_CACHED);
+					  &eb_bh);
 		if (retval < 0) {
 			mlog_errno(retval);
 			goto bail;
@@ -1176,7 +1176,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 		brelse(bh);
 		bh = NULL;
 
-		status = ocfs2_read_block(inode, blkno, &bh, OCFS2_BH_CACHED);
+		status = ocfs2_read_block(inode, blkno, &bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1540,7 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode,
 
 		brelse(bh);
 		bh = NULL;
-		ret = ocfs2_read_block(inode, blkno, &bh, OCFS2_BH_CACHED);
+		ret = ocfs2_read_block(inode, blkno, &bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4294,9 +4294,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
 		 * may want it later.
 		 */
-		ret = ocfs2_read_block(inode,
-				       ocfs2_et_get_last_eb_blk(et), &bh,
-				       OCFS2_BH_CACHED);
+		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
 		if (ret) {
 			mlog_exit(ret);
 			goto out;
@@ -4762,9 +4760,8 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	if (path->p_tree_depth) {
 		struct ocfs2_extent_block *eb;
 
-		ret = ocfs2_read_block(inode,
-				       ocfs2_et_get_last_eb_blk(et),
-				       &last_eb_bh, OCFS2_BH_CACHED);
+		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+				       &last_eb_bh);
 		if (ret) {
 			mlog_exit(ret);
 			goto out;
@@ -4921,9 +4918,8 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
 
 	depth = path->p_tree_depth;
 	if (depth > 0) {
-		ret = ocfs2_read_block(inode,
-				       ocfs2_et_get_last_eb_blk(et),
-				       &last_eb_bh, OCFS2_BH_CACHED);
+		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+				       &last_eb_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -5590,8 +5586,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh,
-				  OCFS2_BH_CACHED);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
 	if (status < 0) {
 		iput(inode);
 		mlog_errno(status);
@@ -6990,7 +6985,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 
 	if (fe->id2.i_list.l_tree_depth) {
 		status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
-					  &last_eb_bh, OCFS2_BH_CACHED);
+					  &last_eb_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f232a0e3c30..c22543b3342 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -68,8 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
-				  &bh, OCFS2_BH_CACHED);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -263,8 +262,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
 
-	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh,
-			       OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index fd0d774ac35..a2ef9e5f8bf 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -33,8 +33,7 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
 
 static inline int ocfs2_read_block(struct inode	       *inode,
 				   u64                  off,
-				   struct buffer_head **bh,
-				   int                  flags);
+				   struct buffer_head **bh);
 
 int ocfs2_write_block(struct ocfs2_super          *osb,
 		      struct buffer_head  *bh,
@@ -54,7 +53,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 #define OCFS2_BH_READAHEAD         8
 
 static inline int ocfs2_read_block(struct inode *inode, u64 off,
-				   struct buffer_head **bh, int flags)
+				   struct buffer_head **bh)
 {
 	int status = 0;
 
@@ -64,7 +63,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
 		goto bail;
 	}
 
-	status = ocfs2_read_blocks(inode, off, 1, bh, flags);
+	status = ocfs2_read_blocks(inode, off, 1, bh, OCFS2_BH_CACHED);
 
 bail:
 	return status;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 828437ca91b..459e6b8467d 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -188,8 +188,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 
-	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh,
-			       OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -417,8 +416,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 
-	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -596,8 +594,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
 	struct ocfs2_inline_data *data;
 	struct ocfs2_dir_entry *de;
 
-	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3b2cd0f8721..ec684426034 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2024,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 	} else {
 		/* Boo, we have to go to disk. */
 		/* read bh, cast, ocfs2_refresh_inode */
-		status = ocfs2_read_block(inode, oi->ip_blkno,
-					  bh, OCFS2_BH_CACHED);
+		status = ocfs2_read_block(inode, oi->ip_blkno, bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail_refresh;
@@ -2086,10 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode,
 		return 0;
 	}
 
-	status = ocfs2_read_block(inode,
-				  OCFS2_I(inode)->ip_blkno,
-				  ret_bh,
-				  OCFS2_BH_CACHED);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
 	if (status < 0)
 		mlog_errno(status);
 
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 5b482214bb7..2baedac5823 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,8 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 
-	ret = ocfs2_read_block(inode, last_eb_blk,
-			       &eb_bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -384,7 +383,7 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
 
 		ret = ocfs2_read_block(inode,
 				       le64_to_cpu(eb->h_next_leaf_blk),
-				       &next_eb_bh, OCFS2_BH_CACHED);
+				       &next_eb_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -631,8 +630,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	if (ret == 0)
 		goto out;
 
-	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7a809be54e8..8d3225a7807 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -545,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	 */
 	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh,
-				  OCFS2_BH_CACHED);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1132,7 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 	struct buffer_head *bh = NULL;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	ret = ocfs2_read_block(inode, oi->ip_blkno, &bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1159,7 +1158,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
-				       &di_bh, OCFS2_BH_CACHED);
+				       &di_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 6ec31b92a47..c5ee9e3cf80 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -461,7 +461,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	}
 
 	if (can_lock)
-		status = ocfs2_read_block(inode, args->fi_blkno, &bh, 0);
+		status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, 0);
 	else
 		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
 	if (status < 0) {
@@ -1165,7 +1165,7 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
 		goto fail;
 	}
 
-	tmperr = ocfs2_read_block(inode, p_blkno, &bh, readflags);
+	tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
 	if (tmperr < 0)
 		goto fail;
 
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9854fb7315b..d161fe5e3bd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1134,7 +1134,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
 	}
 	SET_INODE_JOURNAL(inode);
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh, 0);
+	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, 0);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index b77b67bb277..3ea740d15fe 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -248,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
-				  &alloc_bh, 0);
+	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+				   &alloc_bh, 0);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -459,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 
 	mutex_lock(&inode->i_mutex);
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
-				  &alloc_bh, 0);
+	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+				   &alloc_bh, 0);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index e5fc9345dd3..485a6aa0ad3 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1754,7 +1754,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 
 	status = ocfs2_read_block(orphan_dir_inode,
 				  OCFS2_I(orphan_dir_inode)->ip_blkno,
-				  &orphan_dir_bh, OCFS2_BH_CACHED);
+				  &orphan_dir_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 92dcd935056..ffd48db229a 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
 					      first_new_cluster - 1);
 
-	ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh,
-			       OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 82d986bff7f..357d3fe18c3 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -403,7 +403,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 		     (unsigned long long)blkno);
 
 		bh = NULL;  /* Acquire a fresh bh */
-		status = ocfs2_read_block(si->si_inode, blkno, &bh, 0);
+		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, 0);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index f0056b7d435..c5ff18b46b5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1172,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	struct ocfs2_group_desc *gd;
 	struct inode *alloc_inode = ac->ac_inode;
 
-	ret = ocfs2_read_block(alloc_inode, gd_blkno,
-			       &group_bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -1244,7 +1243,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 
 	status = ocfs2_read_block(alloc_inode,
 				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
-				  &group_bh, OCFS2_BH_CACHED);
+				  &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1273,8 +1272,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 		prev_group_bh = group_bh;
 		group_bh = NULL;
 		status = ocfs2_read_block(alloc_inode,
-					  next_group, &group_bh,
-					  OCFS2_BH_CACHED);
+					  next_group, &group_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1795,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
 	     (unsigned long long)bg_blkno, start_bit);
 
-	status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh,
-				  OCFS2_BH_CACHED);
+	status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 8788dc26316..cbd03dfdc7b 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -84,10 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
 
 	mlog_entry_void();
 
-	status = ocfs2_read_block(inode,
-				  OCFS2_I(inode)->ip_blkno,
-				  bh,
-				  OCFS2_BH_CACHED);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
 	if (status < 0) {
 		mlog_errno(status);
 		link = ERR_PTR(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 63037bd7892..c25780a70df 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -537,9 +537,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 	if (!di->i_xattr_loc)
 		return ret;
 
-	ret = ocfs2_read_block(inode,
-			       le64_to_cpu(di->i_xattr_loc),
-			       &blk_bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -672,8 +670,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
 		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 		/* Copy ocfs2_xattr_value */
 		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
-			ret = ocfs2_read_block(inode, blkno,
-					       &bh, OCFS2_BH_CACHED);
+			ret = ocfs2_read_block(inode, blkno, &bh);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -764,9 +761,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 
 	memset(&xs->bucket, 0, sizeof(xs->bucket));
 
-	ret = ocfs2_read_block(inode,
-			       le64_to_cpu(di->i_xattr_loc),
-			       &blk_bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -922,8 +917,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 
 		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
-			ret = ocfs2_read_block(inode, blkno,
-					       &bh, OCFS2_BH_CACHED);
+			ret = ocfs2_read_block(inode, blkno, &bh);
 			if (ret) {
 				mlog_errno(ret);
 				goto out_commit;
@@ -1514,8 +1508,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 	u64 blk, bg_blkno;
 	u16 bit;
 
-	ret = ocfs2_read_block(inode, block, &blk_bh,
-			       OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, block, &blk_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1773,9 +1766,7 @@ static int ocfs2_xattr_block_find(struct inode *inode,
 	if (!di->i_xattr_loc)
 		return ret;
 
-	ret = ocfs2_read_block(inode,
-			       le64_to_cpu(di->i_xattr_loc),
-			       &blk_bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -2216,9 +2207,8 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
 			break;
 		}
 
-		ret = ocfs2_read_block(inode,
-				       header_bh->b_blocknr + block_off,
-				       &name_bh, OCFS2_BH_CACHED);
+		ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
+				       &name_bh);
 		if (ret) {
 			mlog_errno(ret);
 			break;
@@ -2269,7 +2259,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 	u32 last_hash;
 	u64 blkno;
 
-	ret = ocfs2_read_block(inode, p_blkno, &bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, p_blkno, &bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2285,7 +2275,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 
 		blkno = p_blkno + bucket * blk_per_bucket;
 
-		ret = ocfs2_read_block(inode, blkno, &bh, OCFS2_BH_CACHED);
+		ret = ocfs2_read_block(inode, blkno, &bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2898,7 +2888,6 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	u64 blkno = bucket->bhs[0]->b_blocknr;
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	u16 xh_free_start;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	size_t blocksize = inode->i_sb->s_blocksize;
 	handle_t *handle;
 	struct buffer_head **bhs;
@@ -3110,8 +3099,7 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 			goto out;
 		}
 
-		ret = ocfs2_read_block(inode, prev_blkno,
-				       &old_bh, OCFS2_BH_CACHED);
+		ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
 			brelse(new_bh);
@@ -3481,7 +3469,7 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 	ocfs2_journal_dirty(handle, first_bh);
 
 	/* update the new bucket header. */
-	ret = ocfs2_read_block(inode, to_blk_start, &bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, to_blk_start, &bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -3868,8 +3856,7 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_read_block(inode, p_blkno,
-			       &first_bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, p_blkno, &first_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4220,8 +4207,7 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
 	value_blk += header_bh->b_blocknr;
 
-	ret = ocfs2_read_block(inode, value_blk,
-			       &value_bh, OCFS2_BH_CACHED);
+	ret = ocfs2_read_block(inode, value_blk, &value_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
-- 
cgit v1.2.3


From 07446dc72cffcc6e2672d0e54061dcd1858725ba Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 9 Oct 2008 17:20:32 -0700
Subject: ocfs2: Move ocfs2_bread() into dir.c

dir.c is the only place using ocfs2_bread(), so let's make it static to
that file.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dir.c   | 43 +++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/inode.c | 50 --------------------------------------------------
 fs/ocfs2/inode.h |  2 --
 3 files changed, 43 insertions(+), 52 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 459e6b8467d..ef2bb856f73 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct buffer_head **new_bh);
 
+static struct buffer_head *ocfs2_bread(struct inode *inode,
+				       int block, int *err, int reada)
+{
+	struct buffer_head *bh = NULL;
+	int tmperr;
+	u64 p_blkno;
+	int readflags = OCFS2_BH_CACHED;
+
+	if (reada)
+		readflags |= OCFS2_BH_READAHEAD;
+
+	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!reada);
+		return NULL;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+					     NULL);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	if (tmperr < 0) {
+		mlog_errno(tmperr);
+		goto fail;
+	}
+
+	tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
+	if (tmperr < 0)
+		goto fail;
+
+	tmperr = 0;
+
+	*err = 0;
+	return bh;
+
+fail:
+	brelse(bh);
+	bh = NULL;
+
+	*err = -EIO;
+	return NULL;
+}
+
 /*
  * bh passed here can be an inode block or a dir data block, depending
  * on the inode inline data flag.
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c5ee9e3cf80..8381c26b21a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1132,56 +1132,6 @@ void ocfs2_drop_inode(struct inode *inode)
 	mlog_exit_void();
 }
 
-/*
- * TODO: this should probably be merged into ocfs2_get_block
- *
- * However, you now need to pay attention to the cont_prepare_write()
- * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
- * expects never to extend).
- */
-struct buffer_head *ocfs2_bread(struct inode *inode,
-				int block, int *err, int reada)
-{
-	struct buffer_head *bh = NULL;
-	int tmperr;
-	u64 p_blkno;
-	int readflags = OCFS2_BH_CACHED;
-
-	if (reada)
-		readflags |= OCFS2_BH_READAHEAD;
-
-	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
-	    i_size_read(inode)) {
-		BUG_ON(!reada);
-		return NULL;
-	}
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
-					     NULL);
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-	if (tmperr < 0) {
-		mlog_errno(tmperr);
-		goto fail;
-	}
-
-	tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
-	if (tmperr < 0)
-		goto fail;
-
-	tmperr = 0;
-
-	*err = 0;
-	return bh;
-
-fail:
-	brelse(bh);
-	bh = NULL;
-
-	*err = -EIO;
-	return NULL;
-}
-
 /*
  * This is called from our getattr.
  */
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index f66e4340f17..2f37af9bcc4 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -117,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache;
 
 extern const struct address_space_operations ocfs2_aops;
 
-struct buffer_head *ocfs2_bread(struct inode *inode, int block,
-				int *err, int reada);
 void ocfs2_clear_inode(struct inode *inode);
 void ocfs2_delete_inode(struct inode *inode);
 void ocfs2_drop_inode(struct inode *inode);
-- 
cgit v1.2.3


From 5e0b3dec0107540244ba343f983ef4f972db20de Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 9 Oct 2008 17:20:33 -0700
Subject: ocfs2: Kill the last naked wait_on_buffer() for cached reads.

ocfs2's cached buffer I/O goes through ocfs2_read_block(s)().  dir.c had
a naked wait_on_buffer() to wait for some readahead, but it should
use ocfs2_read_block() instead.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dir.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index ef2bb856f73..60be3ba1f5d 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -302,14 +302,13 @@ restart:
 		}
 		if ((bh = bh_use[ra_ptr++]) == NULL)
 			goto next;
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(bh)) {
-			/* read error, skip block & hope for the best */
+		if (ocfs2_read_block(dir, block, &bh)) {
+			/* read error, skip block & hope for the best.
+			 * ocfs2_read_block() has released the bh. */
 			ocfs2_error(dir->i_sb, "reading directory %llu, "
 				    "offset %lu\n",
 				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
 				    block);
-			brelse(bh);
 			goto next;
 		}
 		i = ocfs2_search_dirblock(bh, dir, name, namelen,
-- 
cgit v1.2.3


From d4a8c93c8248534bdedb07f83c9aebd6f7d1d579 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 9 Oct 2008 17:20:34 -0700
Subject: ocfs2: Make cached block reads the common case.

ocfs2_read_blocks() currently requires the CACHED flag for cached I/O.
However, that's the common case.  Let's flip it around and provide an
IGNORE_CACHE flag for the special users.  This has the added benefit of
cleaning up the code some (ignore_cache takes on its special meaning
earlier in the loop).

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/buffer_head_io.c | 19 +++++++++++--------
 fs/ocfs2/buffer_head_io.h |  4 ++--
 fs/ocfs2/dir.c            |  2 +-
 fs/ocfs2/inode.c          |  3 ++-
 fs/ocfs2/journal.c        |  3 ++-
 fs/ocfs2/localalloc.c     |  4 ++--
 fs/ocfs2/slot_map.c       |  6 ++++--
 7 files changed, 24 insertions(+), 17 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 718dbe5607c..7e947c67246 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -181,7 +181,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 		   inode, (unsigned long long)block, nr, flags);
 
 	BUG_ON(!inode);
-	BUG_ON((flags & OCFS2_BH_READAHEAD) && !(flags & OCFS2_BH_CACHED));
+	BUG_ON((flags & OCFS2_BH_READAHEAD) &&
+	       (flags & OCFS2_BH_IGNORE_CACHE));
 
 	if (bhs == NULL) {
 		status = -EINVAL;
@@ -214,7 +215,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 			}
 		}
 		bh = bhs[i];
-		ignore_cache = 0;
+		ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
 
 		/* There are three read-ahead cases here which we need to
 		 * be concerned with. All three assume a buffer has
@@ -240,26 +241,27 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 		 *    before our is-it-in-flight check.
 		 */
 
-		if (flags & OCFS2_BH_CACHED &&
-		    !ocfs2_buffer_uptodate(inode, bh)) {
+		if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) {
 			mlog(ML_UPTODATE,
 			     "bh (%llu), inode %llu not uptodate\n",
 			     (unsigned long long)bh->b_blocknr,
 			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+			/* We're using ignore_cache here to say
+			 * "go to disk" */
 			ignore_cache = 1;
 		}
 
 		/* XXX: Can we ever get this and *not* have the cached
 		 * flag set? */
 		if (buffer_jbd(bh)) {
-			if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+			if (ignore_cache)
 				mlog(ML_BH_IO, "trying to sync read a jbd "
 					       "managed bh (blocknr = %llu)\n",
 				     (unsigned long long)bh->b_blocknr);
 			continue;
 		}
 
-		if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+		if (ignore_cache) {
 			if (buffer_dirty(bh)) {
 				/* This should probably be a BUG, or
 				 * at least return an error. */
@@ -294,7 +296,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 			 * previously read-ahead buffer may have
 			 * completed I/O while we were waiting for the
 			 * buffer lock. */
-			if ((flags & OCFS2_BH_CACHED)
+			if (!(flags & OCFS2_BH_IGNORE_CACHE)
 			    && !(flags & OCFS2_BH_READAHEAD)
 			    && ocfs2_buffer_uptodate(inode, bh)) {
 				unlock_buffer(bh);
@@ -344,7 +346,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 
 	mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 
 	     (unsigned long long)block, nr,
-	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags);
+	     ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
+	     flags);
 
 bail:
 
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index a2ef9e5f8bf..75e1dcb1ade 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -49,7 +49,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 				struct buffer_head *bh);
 
-#define OCFS2_BH_CACHED            1
+#define OCFS2_BH_IGNORE_CACHE      1
 #define OCFS2_BH_READAHEAD         8
 
 static inline int ocfs2_read_block(struct inode *inode, u64 off,
@@ -63,7 +63,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
 		goto bail;
 	}
 
-	status = ocfs2_read_blocks(inode, off, 1, bh, OCFS2_BH_CACHED);
+	status = ocfs2_read_blocks(inode, off, 1, bh, 0);
 
 bail:
 	return status;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 60be3ba1f5d..026e6eb8518 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -88,7 +88,7 @@ static struct buffer_head *ocfs2_bread(struct inode *inode,
 	struct buffer_head *bh = NULL;
 	int tmperr;
 	u64 p_blkno;
-	int readflags = OCFS2_BH_CACHED;
+	int readflags = 0;
 
 	if (reada)
 		readflags |= OCFS2_BH_READAHEAD;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 8381c26b21a..4903688f72a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -461,7 +461,8 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	}
 
 	if (can_lock)
-		status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, 0);
+		status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
+					   OCFS2_BH_IGNORE_CACHE);
 	else
 		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
 	if (status < 0) {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index d161fe5e3bd..81e40677eec 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1134,7 +1134,8 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
 	}
 	SET_INODE_JOURNAL(inode);
 
-	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, 0);
+	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
+				   OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 3ea740d15fe..687b28713c3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -249,7 +249,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 	}
 
 	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
-				   &alloc_bh, 0);
+				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -460,7 +460,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 	mutex_lock(&inode->i_mutex);
 
 	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
-				   &alloc_bh, 0);
+				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 357d3fe18c3..bdda2d8f850 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,7 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 	 * be !NULL.  Thus, ocfs2_read_blocks() will ignore blocknr.  If
 	 * this is not true, the read of -1 (UINT64_MAX) will fail.
 	 */
-	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, 0);
+	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
+				OCFS2_BH_IGNORE_CACHE);
 	if (ret == 0) {
 		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
@@ -403,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 		     (unsigned long long)blkno);
 
 		bh = NULL;  /* Acquire a fresh bh */
-		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, 0);
+		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
+					   OCFS2_BH_IGNORE_CACHE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
-- 
cgit v1.2.3


From 1efd47f87317030cb7e37821b8562a8162c1223f Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 14 Oct 2008 18:31:46 -0700
Subject: ocfs2: fix build error

I merged the latest ocfs2_read_blocks() changes in xattr.c incorrectly,
leaving behind uses of the removed OCFS2_BH_CACHED flag. Replace them with
0 so that ocfs2 compiles again.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs/ocfs2')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index c25780a70df..802c4149221 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2349,7 +2349,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 		 */
 		ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
 					blk_per_bucket - 1, &xs->bucket.bhs[1],
-					OCFS2_BH_CACHED);
+					0);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2426,7 +2426,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 
 	for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
 		ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
-					bucket.bhs, OCFS2_BH_CACHED);
+					bucket.bhs, 0);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2694,7 +2694,7 @@ static int ocfs2_xattr_update_xattr_search(struct inode *inode,
 			ret = ocfs2_read_blocks(inode,
 					xs->bucket.bhs[0]->b_blocknr + 1,
 					blk_per_bucket - 1, &xs->bucket.bhs[1],
-					OCFS2_BH_CACHED);
+					0);
 			if (ret) {
 				mlog_errno(ret);
 				return ret;
@@ -2898,8 +2898,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	if (!bhs)
 		return -ENOMEM;
 
-	ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs,
-				OCFS2_BH_CACHED);
+	ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
 	if (ret)
 		goto out;
 
@@ -3153,8 +3152,7 @@ static int ocfs2_read_xattr_bucket(struct inode *inode,
 
 	if (!new)
 		return ocfs2_read_blocks(inode, blkno,
-					 blk_per_bucket, bhs,
-					 OCFS2_BH_CACHED);
+					 blk_per_bucket, bhs, 0);
 
 	for (i = 0; i < blk_per_bucket; i++) {
 		bhs[i] = sb_getblk(inode->i_sb, blkno + i);
@@ -4101,7 +4099,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 		ret = ocfs2_read_blocks(inode,
 					xs->bucket.bhs[0]->b_blocknr + 1,
 					blk_per_bucket - 1, &xs->bucket.bhs[1],
-					OCFS2_BH_CACHED);
+					0);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
-- 
cgit v1.2.3