From 890871be854b5f5e43e7ba2475f706209906cc24 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 2 Sep 2009 16:24:52 -0400
Subject: Btrfs: switch extent_map to a rw lock

There are two main users of the extent_map tree.  The
first is regular file inodes, where it is evenly spread
between readers and writers.

The second is the chunk allocation tree, which maps blocks from
logical addresses to phyiscal ones, and it is 99.99% reads.

The mapping tree is a point of lock contention during heavy IO
workloads, so this commit switches things to a rw lock.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 04b53b5ebe5..f1df1171861 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -612,9 +612,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 
 		while (1) {
-			spin_lock(&em_tree->lock);
+			write_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
-			spin_unlock(&em_tree->lock);
+			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
 				break;
@@ -748,9 +748,9 @@ static noinline int cow_file_range(struct inode *inode,
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
 		while (1) {
-			spin_lock(&em_tree->lock);
+			write_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
-			spin_unlock(&em_tree->lock);
+			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
 				break;
@@ -1081,9 +1081,9 @@ out_check:
 			em->bdev = root->fs_info->fs_devices->latest_bdev;
 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
 			while (1) {
-				spin_lock(&em_tree->lock);
+				write_lock(&em_tree->lock);
 				ret = add_extent_mapping(em_tree, em);
-				spin_unlock(&em_tree->lock);
+				write_unlock(&em_tree->lock);
 				if (ret != -EEXIST) {
 					free_extent_map(em);
 					break;
@@ -1670,13 +1670,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 		failrec->last_mirror = 0;
 		failrec->bio_flags = 0;
 
-		spin_lock(&em_tree->lock);
+		read_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, failrec->len);
 		if (em->start > start || em->start + em->len < start) {
 			free_extent_map(em);
 			em = NULL;
 		}
-		spin_unlock(&em_tree->lock);
+		read_unlock(&em_tree->lock);
 
 		if (!em || IS_ERR(em)) {
 			kfree(failrec);
@@ -4069,11 +4069,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	int compressed;
 
 again:
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
 	if (em)
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	if (em) {
 		if (em->start > start || em->start + em->len <= start)
@@ -4264,7 +4264,7 @@ insert:
 	}
 
 	err = 0;
-	spin_lock(&em_tree->lock);
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	/* it is possible that someone inserted the extent into the tree
 	 * while we had the lock dropped.  It is also possible that
@@ -4304,7 +4304,7 @@ insert:
 			err = 0;
 		}
 	}
-	spin_unlock(&em_tree->lock);
+	write_unlock(&em_tree->lock);
 out:
 	if (path)
 		btrfs_free_path(path);
-- 
cgit v1.2.3


From 2c64c53d8d30d43d0670482503a3914dfd3d6d46 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 2 Sep 2009 15:04:12 -0400
Subject: Btrfs: cache values for locking extents

Many of the btrfs extent state tree users follow the same pattern.
They lock an extent range in the tree, do some operation and then
unlock.

This translates to at least 2 rbtree searches, and maybe more if they
are doing operations on the extent state tree.  A locked extent
in the tree isn't going to be merged or changed, and so we can
safely return the extent state structure as a cached handle.

This changes set_extent_bit to give back a cached handle, and also
changes both set_extent_bit and clear_extent_bit to use the cached
handle if it is available.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1df1171861..e494545c420 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -854,7 +854,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	int limit = 10 * 1024 * 1042;
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
-			 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
+			 EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);
 	while (start < end) {
 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
 		async_cow->inode = inode;
@@ -4420,7 +4420,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		 */
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_LOCKED, 1, 0, GFP_NOFS);
+				 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
 		btrfs_finish_ordered_io(page->mapping->host,
 					page_start, page_end);
 		btrfs_put_ordered_extent(ordered);
@@ -4429,7 +4429,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 	clear_extent_bit(tree, page_start, page_end,
 		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
 		 EXTENT_ORDERED,
-		 1, 1, GFP_NOFS);
+		 1, 1, NULL, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
-- 
cgit v1.2.3


From 9655d2982b53fdb38a9e0f2f11315b99b92d66e2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 2 Sep 2009 15:22:30 -0400
Subject: Btrfs: use a cached state for extent state operations during delalloc

This changes the btrfs code to find delalloc ranges in the extent state
tree to use the new state caching code from set/test bit.  It reduces
one of the biggest causes of rbtree searches in the writeback path.

test_range_bit is also modified to take the cached state as a starting
point while searching.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e494545c420..3f8e93de298 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1376,7 +1376,7 @@ again:
 
 	/* already ordered? We're done */
 	if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			     EXTENT_ORDERED, 0)) {
+			     EXTENT_ORDERED, 0, NULL)) {
 		goto out;
 	}
 
@@ -1417,7 +1417,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 	int ret;
 
 	ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
-			     EXTENT_ORDERED, 0);
+			     EXTENT_ORDERED, 0, NULL);
 	if (ret)
 		return 0;
 
@@ -1795,7 +1795,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		return 0;
 
 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
-	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
+	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
 		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
 				  GFP_NOFS);
 		return 0;
-- 
cgit v1.2.3


From 8b62b72b26bcd72082c4a69d179dd906bcc22200 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 2 Sep 2009 16:53:46 -0400
Subject: Btrfs: Use PagePrivate2 to track pages in the data=ordered code.

Btrfs writes go through delalloc to the data=ordered code.  This
makes sure that all of the data is on disk before the metadata
that references it.  The tracking means that we have to make sure
each page in an extent is fully written before we add that extent into
the on-disk btree.

This was done in the past by setting the EXTENT_ORDERED bit for the
range of an extent when it was added to the data=ordered code, and then
clearing the EXTENT_ORDERED bit in the extent state tree as each page
finished IO.

One of the reasons we had to do this was because sometimes pages are
magically dirtied without page_mkwrite being called.  The EXTENT_ORDERED
bit is checked at writepage time, and if it isn't there, our page become
dirty without going through the proper path.

These bit operations make for a number of rbtree searches for each page,
and can cause considerable lock contention.

This commit switches from the EXTENT_ORDERED bit to use PagePrivate2.
As pages go into the ordered code, PagePrivate2 is set on each one.
This is a cheap operation because we already have all the pages locked
and ready to go.

As IO finishes, the PagePrivate2 bit is cleared and the ordered
accoutning is updated for each page.

At writepage time, if the PagePrivate2 bit is missing, we go into the
writepage fixup code to handle improperly dirtied pages.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 47 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3f8e93de298..739a245e25d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -426,7 +426,7 @@ again:
 			extent_clear_unlock_delalloc(inode,
 						     &BTRFS_I(inode)->io_tree,
 						     start, end, NULL, 1, 0,
-						     0, 1, 1, 1);
+						     0, 1, 1, 1, 0);
 			ret = 0;
 			goto free_pages_out;
 		}
@@ -641,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 					     async_extent->start,
 					     async_extent->start +
 					     async_extent->ram_size - 1,
-					     NULL, 1, 1, 0, 1, 1, 0);
+					     NULL, 1, 1, 0, 1, 1, 0, 0);
 
 		ret = btrfs_submit_compressed_write(inode,
 				    async_extent->start,
@@ -714,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode,
 			extent_clear_unlock_delalloc(inode,
 						     &BTRFS_I(inode)->io_tree,
 						     start, end, NULL, 1, 1,
-						     1, 1, 1, 1);
+						     1, 1, 1, 1, 0);
 			*nr_written = *nr_written +
 			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
 			*page_started = 1;
@@ -777,11 +777,14 @@ static noinline int cow_file_range(struct inode *inode,
 		/* we're not doing compressed IO, don't unlock the first
 		 * page (which the caller expects to stay locked), don't
 		 * clear any dirty bits and don't set any writeback bits
+		 *
+		 * Do set the Private2 bit so we know this page was properly
+		 * setup for writepage
 		 */
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 					     start, start + ram_size - 1,
 					     locked_page, unlock, 1,
-					     1, 0, 0, 0);
+					     1, 0, 0, 0, 1);
 		disk_num_bytes -= cur_alloc_size;
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
@@ -1102,7 +1105,7 @@ out_check:
 
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 					cur_offset, cur_offset + num_bytes - 1,
-					locked_page, 1, 1, 1, 0, 0, 0);
+					locked_page, 1, 1, 1, 0, 0, 0, 1);
 		cur_offset = extent_end;
 		if (cur_offset > end)
 			break;
@@ -1375,10 +1378,8 @@ again:
 	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
 
 	/* already ordered? We're done */
-	if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			     EXTENT_ORDERED, 0, NULL)) {
+	if (PagePrivate2(page))
 		goto out;
-	}
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
@@ -1414,11 +1415,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 	struct inode *inode = page->mapping->host;
 	struct btrfs_writepage_fixup *fixup;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
 
-	ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
-			     EXTENT_ORDERED, 0, NULL);
-	if (ret)
+	/* this page is properly in the ordered list */
+	if (TestClearPagePrivate2(page))
 		return 0;
 
 	if (PageChecked(page))
@@ -1624,6 +1623,7 @@ nocow:
 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 				struct extent_state *state, int uptodate)
 {
+	ClearPagePrivate2(page);
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
@@ -4403,13 +4403,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 	u64 page_start = page_offset(page);
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
+
+	/*
+	 * we have the page locked, so new writeback can't start,
+	 * and the dirty bit won't be cleared while we are here.
+	 *
+	 * Wait for IO on this page so that we can safely clear
+	 * the PagePrivate2 bit and do ordered accounting
+	 */
 	wait_on_page_writeback(page);
+
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	if (offset) {
 		btrfs_releasepage(page, GFP_NOFS);
 		return;
 	}
-
 	lock_extent(tree, page_start, page_end, GFP_NOFS);
 	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
 					   page_offset(page));
@@ -4421,14 +4429,19 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
 				 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
-		btrfs_finish_ordered_io(page->mapping->host,
-					page_start, page_end);
+		/*
+		 * whoever cleared the private bit is responsible
+		 * for the finish_ordered_io
+		 */
+		if (TestClearPagePrivate2(page)) {
+			btrfs_finish_ordered_io(page->mapping->host,
+						page_start, page_end);
+		}
 		btrfs_put_ordered_extent(ordered);
 		lock_extent(tree, page_start, page_end, GFP_NOFS);
 	}
 	clear_extent_bit(tree, page_start, page_end,
-		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_ORDERED,
+		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
 		 1, 1, NULL, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
-- 
cgit v1.2.3


From a1ed835e1ab5795f91b198d08c43e2f56848dcf3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 11 Sep 2009 12:27:37 -0400
Subject: Btrfs: Fix extent replacment race

Data COW means that whenever we write to a file, we replace any old
extent pointers with new ones.  There was a window where a readpage
might find the old extent pointers on disk and cache them in the
extent_map tree in ram in the middle of a given write replacing them.

Even though both the readpage and the write had their respective bytes
in the file locked, the extent readpage inserts may cover more bytes than
it had locked down.

This commit closes the race by keeping the new extent pinned in the extent
map tree until after the on-disk btree is properly setup with the new
extent pointers.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 739a245e25d..233fe6f2612 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -232,7 +232,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	}
 
 	ret = btrfs_drop_extents(trans, root, inode, start,
-				 aligned_end, aligned_end, start, &hint_byte);
+				 aligned_end, aligned_end, start,
+				 &hint_byte, 1);
 	BUG_ON(ret);
 
 	if (isize > actual_end)
@@ -241,7 +242,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 				   inline_len, compressed_size,
 				   compressed_pages);
 	BUG_ON(ret);
-	btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 	return 0;
 }
 
@@ -1455,9 +1456,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(!path);
 
 	path->leave_spinning = 1;
+
+	/*
+	 * we may be replacing one extent in the tree with another.
+	 * The new extent is pinned in the extent map, and we don't want
+	 * to drop it from the cache until it is completely in the btree.
+	 *
+	 * So, tell btrfs_drop_extents to leave this extent in the cache.
+	 * the caller is expected to unpin it and allow it to be merged
+	 * with the others.
+	 */
 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
 				 file_pos + num_bytes, locked_end,
-				 file_pos, &hint);
+				 file_pos, &hint, 0);
 	BUG_ON(ret);
 
 	ins.objectid = inode->i_ino;
@@ -1485,7 +1496,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 
 	inode_add_bytes(inode, num_bytes);
-	btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
 
 	ins.objectid = disk_bytenr;
 	ins.offset = disk_num_bytes;
@@ -1596,6 +1606,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 						ordered_extent->len,
 						compressed, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
+		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+				   ordered_extent->file_offset,
+				   ordered_extent->len);
 		BUG_ON(ret);
 	}
 	unlock_extent(io_tree, ordered_extent->file_offset,
@@ -2940,7 +2953,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 						 cur_offset,
 						 cur_offset + hole_size,
 						 block_end,
-						 cur_offset, &hint_byte);
+						 cur_offset, &hint_byte, 1);
 			if (err)
 				break;
 			err = btrfs_insert_file_extent(trans, root,
@@ -5086,6 +5099,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
 						  0, 0, 0,
 						  BTRFS_FILE_EXTENT_PREALLOC);
 		BUG_ON(ret);
+		btrfs_drop_extent_cache(inode, cur_offset,
+					cur_offset + ins.offset -1, 0);
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
 		alloc_hint = ins.objectid + ins.offset;
-- 
cgit v1.2.3


From 50a9b214bc6c052caa05a210ebfc1bdf0d7085b2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 11 Sep 2009 12:33:12 -0400
Subject: Btrfs: fix btrfs page_mkwrite to return locked page

This closes a whole where the page may be written before
the page_mkwrite caller has a chance to dirty it

(thanks to Nick Piggin)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 233fe6f2612..c846482e798 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4552,11 +4552,14 @@ again:
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
+	SetPageUptodate(page);
 
 	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
+	if (!ret)
+		return VM_FAULT_LOCKED;
 	unlock_page(page);
 out:
 	return ret;
-- 
cgit v1.2.3


From 93c82d575055f1bd0277acae6f966bebafd80dd5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 11 Sep 2009 12:36:29 -0400
Subject: Btrfs: zero page past end of inline file items

When btrfs_get_extent is reading inline file items for readpage,
it needs to copy the inline extent into the page.  If the
inline extent doesn't cover all of the page, that means there
is a hole in the file, or that our file is smaller than one
page.

readpage does zeroing for the case where the file is smaller than one
page, but nobody is currently zeroing for the case where there is
a hole after the inline item.

This commit changes btrfs_get_extent to zero fill the page past
the end of the inline item.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c846482e798..88f9df7bfda 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4233,6 +4233,11 @@ again:
 				map = kmap(page);
 				read_extent_buffer(leaf, map + pg_offset, ptr,
 						   copy_size);
+				if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+					memset(map + pg_offset + copy_size, 0,
+					       PAGE_CACHE_SIZE - pg_offset -
+					       copy_size);
+				}
 				kunmap(page);
 			}
 			flush_dcache_page(page);
-- 
cgit v1.2.3


From b917b7c3be50435fa8257591b964934e917f2d45 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Sep 2009 16:07:03 -0400
Subject: Btrfs: search for an allocation hint while filling file COW

The allocator has some nice knobs for sending hints about where
to try and allocate new blocks, but when we're doing file allocations
we're not sending any hint at all.

This commit adds a simple extent map search to see if we can
quickly and easily find a hint for the allocator.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 941f1b71cd2..81ba6654c33 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -726,6 +726,15 @@ static noinline int cow_file_range(struct inode *inode,
 	BUG_ON(disk_num_bytes >
 	       btrfs_super_total_bytes(&root->fs_info->super_copy));
 
+
+	read_lock(&BTRFS_I(inode)->extent_tree.lock);
+	em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
+				   start, num_bytes);
+	if (em) {
+		alloc_hint = em->block_start;
+		free_extent_map(em);
+	}
+	read_unlock(&BTRFS_I(inode)->extent_tree.lock);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
 	while (disk_num_bytes > 0) {
@@ -738,7 +747,6 @@ static noinline int cow_file_range(struct inode *inode,
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = start;
 		em->orig_start = em->start;
-
 		ram_size = ins.offset;
 		em->len = ins.offset;
 
-- 
cgit v1.2.3


From 13a8a7c8c47e542b3cdb45bec3f431f96af79361 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Mon, 21 Sep 2009 15:56:00 -0400
Subject: Btrfs: do not reuse objectid of deleted snapshot/subvol

The new back reference format does not allow reusing objectid of
deleted snapshot/subvol. So we use ++highest_objectid to allocate
objectid for new snapshot/subvol.

Now we use ++highest_objectid to allocate objectid for both new inode
and new snapshot/subvolume, so this patch removes 'find hole' code in
btrfs_find_free_objectid.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81ba6654c33..9e81f3184f2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3624,9 +3624,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (ret != 0)
 		goto fail;
 
-	if (objectid > root->highest_inode)
-		root->highest_inode = objectid;
-
 	inode->i_uid = current_fsuid();
 
 	if (dir && (dir->i_mode & S_ISGID)) {
-- 
cgit v1.2.3


From 4df27c4d5cc1dda54ed7d0a8389347f2df359cf9 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Mon, 21 Sep 2009 15:56:00 -0400
Subject: Btrfs: change how subvolumes are organized

btrfs allows subvolumes and snapshots anywhere in the directory tree.
If we snapshot a subvolume that contains a link to other subvolume
called subvolA, subvolA can be accessed through both the original
subvolume and the snapshot. This is similar to creating hard link to
directory, and has the very similar problems.

The aim of this patch is enforcing there is only one access point to
each subvolume. Only the first directory entry (the one added when
the subvolume/snapshot was created) is treated as valid access point.
The first directory entry is distinguished by checking root forward
reference. If the corresponding root forward reference is missing,
we know the entry is not the first one.

This patch also adds snapshot/subvolume rename support, the code
allows rename subvolume link across subvolumes.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 368 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 262 insertions(+), 106 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9e81f3184f2..6036b36789c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2373,6 +2373,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	return ret;
 }
 
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct inode *dir, u64 objectid,
+			const char *name, int name_len)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	u64 index;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+				   name, name_len, -1);
+	BUG_ON(!di || IS_ERR(di));
+
+	leaf = path->nodes[0];
+	btrfs_dir_item_key_to_cpu(leaf, di, &key);
+	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+	BUG_ON(ret);
+	btrfs_release_path(root, path);
+
+	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+				 objectid, root->root_key.objectid,
+				 dir->i_ino, &index, name, name_len);
+	if (ret < 0) {
+		BUG_ON(ret != -ENOENT);
+		di = btrfs_search_dir_index_item(root, path, dir->i_ino,
+						 name, name_len);
+		BUG_ON(!di || IS_ERR(di));
+
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		btrfs_release_path(root, path);
+		index = key.offset;
+	}
+
+	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+					 index, name, name_len, -1);
+	BUG_ON(!di || IS_ERR(di));
+
+	leaf = path->nodes[0];
+	btrfs_dir_item_key_to_cpu(leaf, di, &key);
+	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+	BUG_ON(ret);
+	btrfs_release_path(root, path);
+
+	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	ret = btrfs_update_inode(trans, root, dir);
+	BUG_ON(ret);
+	dir->i_sb->s_dirt = 1;
+
+	btrfs_free_path(path);
+	return 0;
+}
+
 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
@@ -2382,29 +2445,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	unsigned long nr = 0;
 
-	/*
-	 * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
-	 * the root of a subvolume or snapshot
-	 */
 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
-	    inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+	    inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		return -ENOTEMPTY;
-	}
 
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
+	if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+		err = btrfs_unlink_subvol(trans, root, dir,
+					  BTRFS_I(inode)->location.objectid,
+					  dentry->d_name.name,
+					  dentry->d_name.len);
+		goto out;
+	}
+
 	err = btrfs_orphan_add(trans, inode);
 	if (err)
-		goto fail_trans;
+		goto out;
 
 	/* now the directory is empty */
 	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
 				 dentry->d_name.name, dentry->d_name.len);
 	if (!err)
 		btrfs_i_size_write(inode, 0);
-
-fail_trans:
+out:
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction_throttle(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
@@ -3091,29 +3156,67 @@ out_err:
  * is kind of like crossing a mount point.
  */
 static int fixup_tree_root_location(struct btrfs_root *root,
-			     struct btrfs_key *location,
-			     struct btrfs_root **sub_root,
-			     struct dentry *dentry)
+				    struct inode *dir,
+				    struct dentry *dentry,
+				    struct btrfs_key *location,
+				    struct btrfs_root **sub_root)
 {
-	struct btrfs_root_item *ri;
+	struct btrfs_path *path;
+	struct btrfs_root *new_root;
+	struct btrfs_root_ref *ref;
+	struct extent_buffer *leaf;
+	int ret;
+	int err = 0;
 
-	if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
-		return 0;
-	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
-		return 0;
+	path = btrfs_alloc_path();
+	if (!path) {
+		err = -ENOMEM;
+		goto out;
+	}
 
-	*sub_root = btrfs_read_fs_root(root->fs_info, location,
-					dentry->d_name.name,
-					dentry->d_name.len);
-	if (IS_ERR(*sub_root))
-		return PTR_ERR(*sub_root);
+	err = -ENOENT;
+	ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
+				  BTRFS_I(dir)->root->root_key.objectid,
+				  location->objectid);
+	if (ret) {
+		if (ret < 0)
+			err = ret;
+		goto out;
+	}
 
-	ri = &(*sub_root)->root_item;
-	location->objectid = btrfs_root_dirid(ri);
-	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
-	location->offset = 0;
+	leaf = path->nodes[0];
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+	if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
+	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
+		goto out;
 
-	return 0;
+	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
+				   (unsigned long)(ref + 1),
+				   dentry->d_name.len);
+	if (ret)
+		goto out;
+
+	btrfs_release_path(root->fs_info->tree_root, path);
+
+	new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
+	if (IS_ERR(new_root)) {
+		err = PTR_ERR(new_root);
+		goto out;
+	}
+
+	if (btrfs_root_refs(&new_root->root_item) == 0) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	*sub_root = new_root;
+	location->objectid = btrfs_root_dirid(&new_root->root_item);
+	location->type = BTRFS_INODE_ITEM_KEY;
+	location->offset = 0;
+	err = 0;
+out:
+	btrfs_free_path(path);
+	return err;
 }
 
 static void inode_tree_add(struct inode *inode)
@@ -3246,11 +3349,34 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 	return inode;
 }
 
+static struct inode *new_simple_dir(struct super_block *s,
+				    struct btrfs_key *key,
+				    struct btrfs_root *root)
+{
+	struct inode *inode = new_inode(s);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	init_btrfs_i(inode);
+
+	BTRFS_I(inode)->root = root;
+	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+	BTRFS_I(inode)->dummy_inode = 1;
+
+	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+	inode->i_op = &simple_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+	return inode;
+}
+
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode;
-	struct btrfs_inode *bi = BTRFS_I(dir);
-	struct btrfs_root *root = bi->root;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_root *sub_root = root;
 	struct btrfs_key location;
 	int ret;
@@ -3263,17 +3389,25 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 	if (ret < 0)
 		return ERR_PTR(ret);
 
-	inode = NULL;
-	if (location.objectid) {
-		ret = fixup_tree_root_location(root, &location, &sub_root,
-						dentry);
-		if (ret < 0)
-			return ERR_PTR(ret);
-		if (ret > 0)
-			return ERR_PTR(-ENOENT);
+	if (location.objectid == 0)
+		return NULL;
+
+	if (location.type == BTRFS_INODE_ITEM_KEY) {
+		inode = btrfs_iget(dir->i_sb, &location, root);
+		return inode;
+	}
+
+	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
+
+	ret = fixup_tree_root_location(root, dir, dentry,
+				       &location, &sub_root);
+	if (ret < 0) {
+		if (ret != -ENOENT)
+			inode = ERR_PTR(ret);
+		else
+			inode = new_simple_dir(dir->i_sb, &location, sub_root);
+	} else {
 		inode = btrfs_iget(dir->i_sb, &location, sub_root);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
 	}
 	return inode;
 }
@@ -3283,9 +3417,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 {
 	struct inode *inode;
 
-	if (dentry->d_name.len > BTRFS_NAME_LEN)
-		return ERR_PTR(-ENAMETOOLONG);
-
 	inode = btrfs_lookup_dentry(dir, dentry);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
@@ -3691,26 +3822,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 		   struct inode *parent_inode, struct inode *inode,
 		   const char *name, int name_len, int add_backref, u64 index)
 {
-	int ret;
+	int ret = 0;
 	struct btrfs_key key;
 	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
 
-	key.objectid = inode->i_ino;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-	key.offset = 0;
+	if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
+	} else {
+		key.objectid = inode->i_ino;
+		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+		key.offset = 0;
+	}
+
+	if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+					 key.objectid, root->root_key.objectid,
+					 parent_inode->i_ino,
+					 index, name, name_len);
+	} else if (add_backref) {
+		ret = btrfs_insert_inode_ref(trans, root,
+					     name, name_len, inode->i_ino,
+					     parent_inode->i_ino, index);
+	}
 
-	ret = btrfs_insert_dir_item(trans, root, name, name_len,
-				    parent_inode->i_ino,
-				    &key, btrfs_inode_type(inode),
-				    index);
 	if (ret == 0) {
-		if (add_backref) {
-			ret = btrfs_insert_inode_ref(trans, root,
-						     name, name_len,
-						     inode->i_ino,
-						     parent_inode->i_ino,
-						     index);
-		}
+		ret = btrfs_insert_dir_item(trans, root, name, name_len,
+					    parent_inode->i_ino, &key,
+					    btrfs_inode_type(inode), index);
+		BUG_ON(ret);
+
 		btrfs_i_size_write(parent_inode, parent_inode->i_size +
 				   name_len * 2);
 		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
@@ -4800,31 +4940,29 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
+	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct timespec ctime = CURRENT_TIME;
 	u64 index = 0;
+	u64 root_objectid;
 	int ret;
 
-	/* we're not allowed to rename between subvolumes */
-	if (BTRFS_I(old_inode)->root->root_key.objectid !=
-	    BTRFS_I(new_dir)->root->root_key.objectid)
+	/* we only allow rename subvolume link between subvolumes */
+	if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
 		return -EXDEV;
 
-	if (S_ISDIR(old_inode->i_mode) && new_inode &&
-	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
+	if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+	    (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
 		return -ENOTEMPTY;
-	}
 
-	/* to rename a snapshot or subvolume, we need to juggle the
-	 * backrefs.  This isn't coded yet
-	 */
-	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
-		return -EXDEV;
+	if (S_ISDIR(old_inode->i_mode) && new_inode &&
+	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+		return -ENOTEMPTY;
 
 	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
 	/*
 	 * we're using rename to replace one file with another.
@@ -4837,6 +4975,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	trans = btrfs_start_transaction(root, 1);
 
+	if (dest != root)
+		btrfs_record_root_in_trans(trans, dest);
+
 	/*
 	 * make sure the inode gets flushed if it is replacing
 	 * something.
@@ -4846,18 +4987,22 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		btrfs_add_ordered_operation(trans, root, old_inode);
 	}
 
-	/*
-	 * this is an ugly little race, but the rename is required to make
-	 * sure that if we crash, the inode is either at the old name
-	 * or the new one.  pinning the log transaction lets us make sure
-	 * we don't allow a log commit to come in after we unlink the
-	 * name but before we add the new name back in.
-	 */
-	btrfs_pin_log_trans(root);
+	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+		/* force full log commit if subvolume involved. */
+		root->fs_info->last_trans_log_full_commit = trans->transid;
+	} else {
+		/*
+		 * this is an ugly little race, but the rename is required
+		 * to make sure that if we crash, the inode is either at the
+		 * old name or the new one.  pinning the log transaction lets
+		 * us make sure we don't allow a log commit to come in after
+		 * we unlink the name but before we add the new name back in.
+		 */
+		btrfs_pin_log_trans(root);
+	}
 
 	btrfs_set_trans_block_group(trans, new_dir);
 
-	btrfs_inc_nlink(old_dentry->d_inode);
 	old_dir->i_ctime = old_dir->i_mtime = ctime;
 	new_dir->i_ctime = new_dir->i_mtime = ctime;
 	old_inode->i_ctime = ctime;
@@ -4865,47 +5010,58 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (old_dentry->d_parent != new_dentry->d_parent)
 		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
 
-	ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
-				 old_dentry->d_name.name,
-				 old_dentry->d_name.len);
-	if (ret)
-		goto out_fail;
+	if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
+					old_dentry->d_name.name,
+					old_dentry->d_name.len);
+	} else {
+		btrfs_inc_nlink(old_dentry->d_inode);
+		ret = btrfs_unlink_inode(trans, root, old_dir,
+					 old_dentry->d_inode,
+					 old_dentry->d_name.name,
+					 old_dentry->d_name.len);
+	}
+	BUG_ON(ret);
 
 	if (new_inode) {
 		new_inode->i_ctime = CURRENT_TIME;
-		ret = btrfs_unlink_inode(trans, root, new_dir,
-					 new_dentry->d_inode,
-					 new_dentry->d_name.name,
-					 new_dentry->d_name.len);
-		if (ret)
-			goto out_fail;
+		if (unlikely(new_inode->i_ino ==
+			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+			root_objectid = BTRFS_I(new_inode)->location.objectid;
+			ret = btrfs_unlink_subvol(trans, dest, new_dir,
+						root_objectid,
+						new_dentry->d_name.name,
+						new_dentry->d_name.len);
+			BUG_ON(new_inode->i_nlink == 0);
+		} else {
+			ret = btrfs_unlink_inode(trans, dest, new_dir,
+						 new_dentry->d_inode,
+						 new_dentry->d_name.name,
+						 new_dentry->d_name.len);
+		}
+		BUG_ON(ret);
 		if (new_inode->i_nlink == 0) {
 			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
-			if (ret)
-				goto out_fail;
+			BUG_ON(ret);
 		}
-
 	}
 	ret = btrfs_set_inode_index(new_dir, &index);
-	if (ret)
-		goto out_fail;
+	BUG_ON(ret);
 
-	ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
-			     old_inode, new_dentry->d_name.name,
+	ret = btrfs_add_link(trans, new_dir, old_inode,
+			     new_dentry->d_name.name,
 			     new_dentry->d_name.len, 1, index);
-	if (ret)
-		goto out_fail;
+	BUG_ON(ret);
 
-	btrfs_log_new_name(trans, old_inode, old_dir,
-				       new_dentry->d_parent);
-out_fail:
+	if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+		btrfs_log_new_name(trans, old_inode, old_dir,
+				   new_dentry->d_parent);
+		btrfs_end_log_trans(root);
+	}
 
-	/* this btrfs_end_log_trans just allows the current
-	 * log-sub transaction to complete
-	 */
-	btrfs_end_log_trans(root);
 	btrfs_end_transaction_throttle(trans, root);
-out_unlock:
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 76dda93c6ae2c1dc3e6cde34569d6aca26b0c918 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Mon, 21 Sep 2009 16:00:26 -0400
Subject: Btrfs: add snapshot/subvolume destroy ioctl

This patch adds snapshot/subvolume destroy ioctl.  A subvolume that isn't being
used and doesn't contains links to other subvolumes can be destroyed.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 127 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6036b36789c..db9cbd91eb4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3089,6 +3089,11 @@ void btrfs_delete_inode(struct inode *inode)
 	}
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
+	if (inode->i_nlink > 0) {
+		BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+		goto no_delete;
+	}
+
 	btrfs_i_size_write(inode, 0);
 	trans = btrfs_join_transaction(root, 1);
 
@@ -3225,11 +3230,13 @@ static void inode_tree_add(struct inode *inode)
 	struct btrfs_inode *entry;
 	struct rb_node **p;
 	struct rb_node *parent;
-
 again:
 	p = &root->inode_tree.rb_node;
 	parent = NULL;
 
+	if (hlist_unhashed(&inode->i_hash))
+		return;
+
 	spin_lock(&root->inode_lock);
 	while (*p) {
 		parent = *p;
@@ -3256,13 +3263,87 @@ again:
 static void inode_tree_del(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int empty = 0;
 
 	spin_lock(&root->inode_lock);
 	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
 		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
 		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+		empty = RB_EMPTY_ROOT(&root->inode_tree);
 	}
 	spin_unlock(&root->inode_lock);
+
+	if (empty && btrfs_root_refs(&root->root_item) == 0) {
+		synchronize_srcu(&root->fs_info->subvol_srcu);
+		spin_lock(&root->inode_lock);
+		empty = RB_EMPTY_ROOT(&root->inode_tree);
+		spin_unlock(&root->inode_lock);
+		if (empty)
+			btrfs_add_dead_root(root);
+	}
+}
+
+int btrfs_invalidate_inodes(struct btrfs_root *root)
+{
+	struct rb_node *node;
+	struct rb_node *prev;
+	struct btrfs_inode *entry;
+	struct inode *inode;
+	u64 objectid = 0;
+
+	WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+	spin_lock(&root->inode_lock);
+again:
+	node = root->inode_tree.rb_node;
+	prev = NULL;
+	while (node) {
+		prev = node;
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+		if (objectid < entry->vfs_inode.i_ino)
+			node = node->rb_left;
+		else if (objectid > entry->vfs_inode.i_ino)
+			node = node->rb_right;
+		else
+			break;
+	}
+	if (!node) {
+		while (prev) {
+			entry = rb_entry(prev, struct btrfs_inode, rb_node);
+			if (objectid <= entry->vfs_inode.i_ino) {
+				node = prev;
+				break;
+			}
+			prev = rb_next(prev);
+		}
+	}
+	while (node) {
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+		objectid = entry->vfs_inode.i_ino + 1;
+		inode = igrab(&entry->vfs_inode);
+		if (inode) {
+			spin_unlock(&root->inode_lock);
+			if (atomic_read(&inode->i_count) > 1)
+				d_prune_aliases(inode);
+			/*
+			 * btrfs_drop_inode will remove it from
+			 * the inode cache when its usage count
+			 * hits zero.
+			 */
+			iput(inode);
+			cond_resched();
+			spin_lock(&root->inode_lock);
+			goto again;
+		}
+
+		if (cond_resched_lock(&root->inode_lock))
+			goto again;
+
+		node = rb_next(node);
+	}
+	spin_unlock(&root->inode_lock);
+	return 0;
 }
 
 static noinline void init_btrfs_i(struct inode *inode)
@@ -3379,8 +3460,11 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_root *sub_root = root;
 	struct btrfs_key location;
+	int index;
 	int ret;
 
+	dentry->d_op = &btrfs_dentry_operations;
+
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
@@ -3399,6 +3483,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 
 	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
 
+	index = srcu_read_lock(&root->fs_info->subvol_srcu);
 	ret = fixup_tree_root_location(root, dir, dentry,
 				       &location, &sub_root);
 	if (ret < 0) {
@@ -3409,9 +3494,24 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 	} else {
 		inode = btrfs_iget(dir->i_sb, &location, sub_root);
 	}
+	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+
 	return inode;
 }
 
+static int btrfs_dentry_delete(struct dentry *dentry)
+{
+	struct btrfs_root *root;
+
+	if (!dentry->d_inode)
+		return 0;
+
+	root = BTRFS_I(dentry->d_inode)->root;
+	if (btrfs_root_refs(&root->root_item) == 0)
+		return 1;
+	return 0;
+}
+
 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 				   struct nameidata *nd)
 {
@@ -4773,11 +4873,11 @@ out:
  * create a new subvolume directory/inode (helper for the ioctl).
  */
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *new_root, struct dentry *dentry,
+			     struct btrfs_root *new_root,
 			     u64 new_dirid, u64 alloc_hint)
 {
 	struct inode *inode;
-	int error;
+	int err;
 	u64 index = 0;
 
 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
@@ -4790,11 +4890,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 	inode->i_nlink = 1;
 	btrfs_i_size_write(inode, 0);
 
-	error = btrfs_update_inode(trans, new_root, inode);
-	if (error)
-		return error;
+	err = btrfs_update_inode(trans, new_root, inode);
+	BUG_ON(err);
 
-	d_instantiate(dentry, inode);
+	iput(inode);
 	return 0;
 }
 
@@ -4872,6 +4971,16 @@ void btrfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
+void btrfs_drop_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
+		generic_delete_inode(inode);
+	else
+		generic_drop_inode(inode);
+}
+
 static void init_once(void *foo)
 {
 	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
@@ -4973,6 +5082,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
 		filemap_flush(old_inode->i_mapping);
 
+	/* close the racy window with snapshot create/destroy ioctl */
+	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+		down_read(&root->fs_info->subvol_sem);
+
 	trans = btrfs_start_transaction(root, 1);
 
 	if (dest != root)
@@ -5062,6 +5175,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	btrfs_end_transaction_throttle(trans, root);
 
+	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+		up_read(&root->fs_info->subvol_sem);
 	return ret;
 }
 
@@ -5420,6 +5535,7 @@ static struct inode_operations btrfs_dir_ro_inode_operations = {
 	.lookup		= btrfs_lookup,
 	.permission	= btrfs_permission,
 };
+
 static struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
@@ -5506,3 +5622,7 @@ static struct inode_operations btrfs_symlink_inode_operations = {
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 };
+
+struct dentry_operations btrfs_dentry_operations = {
+	.d_delete	= btrfs_dentry_delete,
+};
-- 
cgit v1.2.3


From a57195214358b75807a74bad96a8601a36262af7 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Thu, 24 Sep 2009 09:17:31 -0400
Subject: Btrfs: check size of inode backref before adding hardlink

For every hardlink in btrfs, there is a corresponding inode back
reference. All inode back references for hardlinks in a given
directory are stored in single b-tree item. The size of b-tree item
is limited by the size of b-tree leaf, so we can only create limited
number of hardlinks to a given file in a directory.

The original code lacks of the check, it oops if the number of
hardlinks goes over the limit. This patch fixes the issue by adding
check to btrfs_link and btrfs_rename.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 53 +++++++++++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index db9cbd91eb4..19fcde289dd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4133,18 +4133,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 
 	err = btrfs_add_nondir(trans, dentry, inode, 1, index);
 
-	if (err)
-		drop_inode = 1;
-
-	btrfs_update_inode_block_group(trans, dir);
-	err = btrfs_update_inode(trans, root, inode);
-
-	if (err)
+	if (err) {
 		drop_inode = 1;
+	} else {
+		btrfs_update_inode_block_group(trans, dir);
+		err = btrfs_update_inode(trans, root, inode);
+		BUG_ON(err);
+		btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
+	}
 
 	nr = trans->blocks_used;
-
-	btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
 	btrfs_end_transaction_throttle(trans, root);
 fail:
 	if (drop_inode) {
@@ -5087,23 +5085,26 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		down_read(&root->fs_info->subvol_sem);
 
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, new_dir);
 
 	if (dest != root)
 		btrfs_record_root_in_trans(trans, dest);
 
-	/*
-	 * make sure the inode gets flushed if it is replacing
-	 * something.
-	 */
-	if (new_inode && new_inode->i_size &&
-	    old_inode && S_ISREG(old_inode->i_mode)) {
-		btrfs_add_ordered_operation(trans, root, old_inode);
-	}
+	ret = btrfs_set_inode_index(new_dir, &index);
+	if (ret)
+		goto out_fail;
 
-	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+	if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
 		/* force full log commit if subvolume involved. */
 		root->fs_info->last_trans_log_full_commit = trans->transid;
 	} else {
+		ret = btrfs_insert_inode_ref(trans, dest,
+					     new_dentry->d_name.name,
+					     new_dentry->d_name.len,
+					     old_inode->i_ino,
+					     new_dir->i_ino, index);
+		if (ret)
+			goto out_fail;
 		/*
 		 * this is an ugly little race, but the rename is required
 		 * to make sure that if we crash, the inode is either at the
@@ -5113,8 +5114,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		 */
 		btrfs_pin_log_trans(root);
 	}
-
-	btrfs_set_trans_block_group(trans, new_dir);
+	/*
+	 * make sure the inode gets flushed if it is replacing
+	 * something.
+	 */
+	if (new_inode && new_inode->i_size &&
+	    old_inode && S_ISREG(old_inode->i_mode)) {
+		btrfs_add_ordered_operation(trans, root, old_inode);
+	}
 
 	old_dir->i_ctime = old_dir->i_mtime = ctime;
 	new_dir->i_ctime = new_dir->i_mtime = ctime;
@@ -5159,12 +5166,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			BUG_ON(ret);
 		}
 	}
-	ret = btrfs_set_inode_index(new_dir, &index);
-	BUG_ON(ret);
 
 	ret = btrfs_add_link(trans, new_dir, old_inode,
 			     new_dentry->d_name.name,
-			     new_dentry->d_name.len, 1, index);
+			     new_dentry->d_name.len, 0, index);
 	BUG_ON(ret);
 
 	if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
@@ -5172,7 +5177,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 				   new_dentry->d_parent);
 		btrfs_end_log_trans(root);
 	}
-
+out_fail:
 	btrfs_end_transaction_throttle(trans, root);
 
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
-- 
cgit v1.2.3


From f679a84034be6f7da123be786bbd8838bf3e9207 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Thu, 24 Sep 2009 09:17:31 -0400
Subject: Btrfs: don't rename file into dummy directory

A recent change enforces only one access point to each subvolume. The first
directory entry (the one added when the subvolume/snapshot was created) is
treated as valid access point, all other subvolume links are linked to dummy
empty directories. The dummy directories are temporary inodes that only in
memory, so we can not rename file into them.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 19fcde289dd..01c5f8b5a34 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5055,6 +5055,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	u64 root_objectid;
 	int ret;
 
+	if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+		return -EPERM;
+
 	/* we only allow rename subvolume link between subvolumes */
 	if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
 		return -EXDEV;
-- 
cgit v1.2.3