From 890871be854b5f5e43e7ba2475f706209906cc24 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 16:24:52 -0400 Subject: Btrfs: switch extent_map to a rw lock There are two main users of the extent_map tree. The first is regular file inodes, where it is evenly spread between readers and writers. The second is the chunk allocation tree, which maps blocks from logical addresses to phyiscal ones, and it is 99.99% reads. The mapping tree is a point of lock contention during heavy IO workloads, so this commit switches things to a rw lock. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 04b53b5ebe5..f1df1171861 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -612,9 +612,9 @@ static noinline int submit_compressed_extents(struct inode *inode, set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); while (1) { - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -748,9 +748,9 @@ static noinline int cow_file_range(struct inode *inode, set_bit(EXTENT_FLAG_PINNED, &em->flags); while (1) { - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -1081,9 +1081,9 @@ out_check: em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); while (1) { - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -1670,13 +1670,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, failrec->last_mirror = 0; failrec->bio_flags = 0; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, failrec->len); if (em->start > start || em->start + em->len < start) { free_extent_map(em); em = NULL; } - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em || IS_ERR(em)) { kfree(failrec); @@ -4069,11 +4069,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, int compressed; again: - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) em->bdev = root->fs_info->fs_devices->latest_bdev; - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) @@ -4264,7 +4264,7 @@ insert: } err = 0; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that @@ -4304,7 +4304,7 @@ insert: err = 0; } } - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); out: if (path) btrfs_free_path(path); -- cgit v1.2.3 From 2c64c53d8d30d43d0670482503a3914dfd3d6d46 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 15:04:12 -0400 Subject: Btrfs: cache values for locking extents Many of the btrfs extent state tree users follow the same pattern. They lock an extent range in the tree, do some operation and then unlock. This translates to at least 2 rbtree searches, and maybe more if they are doing operations on the extent state tree. A locked extent in the tree isn't going to be merged or changed, and so we can safely return the extent state structure as a cached handle. This changes set_extent_bit to give back a cached handle, and also changes both set_extent_bit and clear_extent_bit to use the cached handle if it is available. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1df1171861..e494545c420 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -854,7 +854,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, int limit = 10 * 1024 * 1042; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | - EXTENT_DELALLOC, 1, 0, GFP_NOFS); + EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); async_cow->inode = inode; @@ -4420,7 +4420,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED, 1, 0, GFP_NOFS); + EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); btrfs_finish_ordered_io(page->mapping->host, page_start, page_end); btrfs_put_ordered_extent(ordered); @@ -4429,7 +4429,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_ORDERED, - 1, 1, GFP_NOFS); + 1, 1, NULL, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); -- cgit v1.2.3 From 9655d2982b53fdb38a9e0f2f11315b99b92d66e2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 15:22:30 -0400 Subject: Btrfs: use a cached state for extent state operations during delalloc This changes the btrfs code to find delalloc ranges in the extent state tree to use the new state caching code from set/test bit. It reduces one of the biggest causes of rbtree searches in the writeback path. test_range_bit is also modified to take the cached state as a starting point while searching. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e494545c420..3f8e93de298 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1376,7 +1376,7 @@ again: /* already ordered? We're done */ if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_ORDERED, 0)) { + EXTENT_ORDERED, 0, NULL)) { goto out; } @@ -1417,7 +1417,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) int ret; ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, - EXTENT_ORDERED, 0); + EXTENT_ORDERED, 0, NULL); if (ret) return 0; @@ -1795,7 +1795,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && - test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, GFP_NOFS); return 0; -- cgit v1.2.3 From 8b62b72b26bcd72082c4a69d179dd906bcc22200 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 16:53:46 -0400 Subject: Btrfs: Use PagePrivate2 to track pages in the data=ordered code. Btrfs writes go through delalloc to the data=ordered code. This makes sure that all of the data is on disk before the metadata that references it. The tracking means that we have to make sure each page in an extent is fully written before we add that extent into the on-disk btree. This was done in the past by setting the EXTENT_ORDERED bit for the range of an extent when it was added to the data=ordered code, and then clearing the EXTENT_ORDERED bit in the extent state tree as each page finished IO. One of the reasons we had to do this was because sometimes pages are magically dirtied without page_mkwrite being called. The EXTENT_ORDERED bit is checked at writepage time, and if it isn't there, our page become dirty without going through the proper path. These bit operations make for a number of rbtree searches for each page, and can cause considerable lock contention. This commit switches from the EXTENT_ORDERED bit to use PagePrivate2. As pages go into the ordered code, PagePrivate2 is set on each one. This is a cheap operation because we already have all the pages locked and ready to go. As IO finishes, the PagePrivate2 bit is cleared and the ordered accoutning is updated for each page. At writepage time, if the PagePrivate2 bit is missing, we go into the writepage fixup code to handle improperly dirtied pages. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3f8e93de298..739a245e25d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -426,7 +426,7 @@ again: extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, start, end, NULL, 1, 0, - 0, 1, 1, 1); + 0, 1, 1, 1, 0); ret = 0; goto free_pages_out; } @@ -641,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - NULL, 1, 1, 0, 1, 1, 0); + NULL, 1, 1, 0, 1, 1, 0, 0); ret = btrfs_submit_compressed_write(inode, async_extent->start, @@ -714,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode, extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, start, end, NULL, 1, 1, - 1, 1, 1, 1); + 1, 1, 1, 1, 0); *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; @@ -777,11 +777,14 @@ static noinline int cow_file_range(struct inode *inode, /* we're not doing compressed IO, don't unlock the first * page (which the caller expects to stay locked), don't * clear any dirty bits and don't set any writeback bits + * + * Do set the Private2 bit so we know this page was properly + * setup for writepage */ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, start, start + ram_size - 1, locked_page, unlock, 1, - 1, 0, 0, 0); + 1, 0, 0, 0, 1); disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; @@ -1102,7 +1105,7 @@ out_check: extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, cur_offset, cur_offset + num_bytes - 1, - locked_page, 1, 1, 1, 0, 0, 0); + locked_page, 1, 1, 1, 0, 0, 0, 1); cur_offset = extent_end; if (cur_offset > end) break; @@ -1375,10 +1378,8 @@ again: lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); /* already ordered? We're done */ - if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_ORDERED, 0, NULL)) { + if (PagePrivate2(page)) goto out; - } ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { @@ -1414,11 +1415,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) struct inode *inode = page->mapping->host; struct btrfs_writepage_fixup *fixup; struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, - EXTENT_ORDERED, 0, NULL); - if (ret) + /* this page is properly in the ordered list */ + if (TestClearPagePrivate2(page)) return 0; if (PageChecked(page)) @@ -1624,6 +1623,7 @@ nocow: static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate) { + ClearPagePrivate2(page); return btrfs_finish_ordered_io(page->mapping->host, start, end); } @@ -4403,13 +4403,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + + /* + * we have the page locked, so new writeback can't start, + * and the dirty bit won't be cleared while we are here. + * + * Wait for IO on this page so that we can safely clear + * the PagePrivate2 bit and do ordered accounting + */ wait_on_page_writeback(page); + tree = &BTRFS_I(page->mapping->host)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent(tree, page_start, page_end, GFP_NOFS); ordered = btrfs_lookup_ordered_extent(page->mapping->host, page_offset(page)); @@ -4421,14 +4429,19 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); - btrfs_finish_ordered_io(page->mapping->host, - page_start, page_end); + /* + * whoever cleared the private bit is responsible + * for the finish_ordered_io + */ + if (TestClearPagePrivate2(page)) { + btrfs_finish_ordered_io(page->mapping->host, + page_start, page_end); + } btrfs_put_ordered_extent(ordered); lock_extent(tree, page_start, page_end, GFP_NOFS); } clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_ORDERED, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 1, 1, NULL, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); -- cgit v1.2.3 From a1ed835e1ab5795f91b198d08c43e2f56848dcf3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 11 Sep 2009 12:27:37 -0400 Subject: Btrfs: Fix extent replacment race Data COW means that whenever we write to a file, we replace any old extent pointers with new ones. There was a window where a readpage might find the old extent pointers on disk and cache them in the extent_map tree in ram in the middle of a given write replacing them. Even though both the readpage and the write had their respective bytes in the file locked, the extent readpage inserts may cover more bytes than it had locked down. This commit closes the race by keeping the new extent pinned in the extent map tree until after the on-disk btree is properly setup with the new extent pointers. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 739a245e25d..233fe6f2612 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -232,7 +232,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, } ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, aligned_end, start, &hint_byte); + aligned_end, aligned_end, start, + &hint_byte, 1); BUG_ON(ret); if (isize > actual_end) @@ -241,7 +242,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, inline_len, compressed_size, compressed_pages); BUG_ON(ret); - btrfs_drop_extent_cache(inode, start, aligned_end, 0); + btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; } @@ -1455,9 +1456,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(!path); path->leave_spinning = 1; + + /* + * we may be replacing one extent in the tree with another. + * The new extent is pinned in the extent map, and we don't want + * to drop it from the cache until it is completely in the btree. + * + * So, tell btrfs_drop_extents to leave this extent in the cache. + * the caller is expected to unpin it and allow it to be merged + * with the others. + */ ret = btrfs_drop_extents(trans, root, inode, file_pos, file_pos + num_bytes, locked_end, - file_pos, &hint); + file_pos, &hint, 0); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1485,7 +1496,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, num_bytes); - btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; @@ -1596,6 +1606,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered_extent->file_offset, + ordered_extent->len); BUG_ON(ret); } unlock_extent(io_tree, ordered_extent->file_offset, @@ -2940,7 +2953,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) cur_offset, cur_offset + hole_size, block_end, - cur_offset, &hint_byte); + cur_offset, &hint_byte, 1); if (err) break; err = btrfs_insert_file_extent(trans, root, @@ -5086,6 +5099,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset -1, 0); num_bytes -= ins.offset; cur_offset += ins.offset; alloc_hint = ins.objectid + ins.offset; -- cgit v1.2.3 From 50a9b214bc6c052caa05a210ebfc1bdf0d7085b2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 11 Sep 2009 12:33:12 -0400 Subject: Btrfs: fix btrfs page_mkwrite to return locked page This closes a whole where the page may be written before the page_mkwrite caller has a chance to dirty it (thanks to Nick Piggin) Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 233fe6f2612..c846482e798 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4552,11 +4552,14 @@ again: } ClearPageChecked(page); set_page_dirty(page); + SetPageUptodate(page); BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: + if (!ret) + return VM_FAULT_LOCKED; unlock_page(page); out: return ret; -- cgit v1.2.3 From 93c82d575055f1bd0277acae6f966bebafd80dd5 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 11 Sep 2009 12:36:29 -0400 Subject: Btrfs: zero page past end of inline file items When btrfs_get_extent is reading inline file items for readpage, it needs to copy the inline extent into the page. If the inline extent doesn't cover all of the page, that means there is a hole in the file, or that our file is smaller than one page. readpage does zeroing for the case where the file is smaller than one page, but nobody is currently zeroing for the case where there is a hole after the inline item. This commit changes btrfs_get_extent to zero fill the page past the end of the inline item. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c846482e798..88f9df7bfda 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4233,6 +4233,11 @@ again: map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, copy_size); + if (pg_offset + copy_size < PAGE_CACHE_SIZE) { + memset(map + pg_offset + copy_size, 0, + PAGE_CACHE_SIZE - pg_offset - + copy_size); + } kunmap(page); } flush_dcache_page(page); -- cgit v1.2.3