From a52d9a8033c454cd9b4697cfafb467509fc1693f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Aug 2007 16:49:44 -0400 Subject: Btrfs: Extent based page cache code. This uses an rbtree of extents and tests instead of buffer heads. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 1656 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1656 insertions(+) create mode 100644 fs/btrfs/extent_map.c (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c new file mode 100644 index 00000000000..d378edf0964 --- /dev/null +++ b/fs/btrfs/extent_map.c @@ -0,0 +1,1656 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "extent_map.h" + +static struct kmem_cache *extent_map_cache; +static struct kmem_cache *extent_state_cache; + +struct tree_entry { + u64 start; + u64 end; + int in_tree; + struct rb_node rb_node; +}; + +/* bits for the extent state */ +#define EXTENT_DIRTY 1 +#define EXTENT_WRITEBACK (1 << 1) +#define EXTENT_UPTODATE (1 << 2) +#define EXTENT_LOCKED (1 << 3) +#define EXTENT_NEW (1 << 4) +#define EXTENT_DELALLOC (1 << 5) + +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + +static LIST_HEAD(all_states); +spinlock_t state_lock = SPIN_LOCK_UNLOCKED; + +void __init extent_map_init(void) +{ + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | + SLAB_DESTROY_BY_RCU, + NULL); + extent_state_cache = kmem_cache_create("extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | + SLAB_DESTROY_BY_RCU, + NULL); +} + +void __exit extent_map_exit(void) +{ + while(!list_empty(&all_states)) { + struct extent_state *state; + struct list_head *cur = all_states.next; + state = list_entry(cur, struct extent_state, list); + printk("found leaked state %Lu %Lu state %d in_tree %d\n", + state->start, state->end, state->state, state->in_tree); + list_del(&state->list); + kfree(state); + } + if (extent_map_cache) + kmem_cache_destroy(extent_map_cache); + if (extent_state_cache) + kmem_cache_destroy(extent_state_cache); +} + +void extent_map_tree_init(struct extent_map_tree *tree, + struct address_space *mapping, gfp_t mask) +{ + tree->map.rb_node = NULL; + tree->state.rb_node = NULL; + rwlock_init(&tree->lock); + tree->mapping = mapping; +} +EXPORT_SYMBOL(extent_map_tree_init); + +struct extent_map *alloc_extent_map(gfp_t mask) +{ + struct extent_map *em; + em = kmem_cache_alloc(extent_map_cache, mask); + if (!em || IS_ERR(em)) + return em; + em->in_tree = 0; + atomic_set(&em->refs, 1); + return em; +} +EXPORT_SYMBOL(alloc_extent_map); + +void free_extent_map(struct extent_map *em) +{ + if (atomic_dec_and_test(&em->refs)) { + WARN_ON(em->in_tree); + kmem_cache_free(extent_map_cache, em); + } +} +EXPORT_SYMBOL(free_extent_map); + + +struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state || IS_ERR(state)) + return state; + state->state = 0; + state->in_tree = 0; + atomic_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + spin_lock_irq(&state_lock); + list_add(&state->list, &all_states); + spin_unlock_irq(&state_lock); + return state; +} +EXPORT_SYMBOL(alloc_extent_state); + +void free_extent_state(struct extent_state *state) +{ + if (atomic_dec_and_test(&state->refs)) { + WARN_ON(state->in_tree); + spin_lock_irq(&state_lock); + list_del_init(&state->list); + spin_unlock_irq(&state_lock); + kmem_cache_free(extent_state_cache, state); + } +} +EXPORT_SYMBOL(free_extent_state); + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; + struct tree_entry *entry; + + while(*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset > entry->end) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct tree_entry, rb_node); + entry->in_tree = 1; + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *__tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_ret) +{ + struct rb_node * n = root->rb_node; + struct rb_node *prev = NULL; + struct tree_entry *entry; + struct tree_entry *prev_entry = NULL; + + while(n) { + entry = rb_entry(n, struct tree_entry, rb_node); + prev = n; + prev_entry = entry; + + if (offset < entry->start) + n = n->rb_left; + else if (offset > entry->end) + n = n->rb_right; + else + return n; + } + if (!prev_ret) + return NULL; + while(prev && offset > prev_entry->end) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + return NULL; +} + +static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) +{ + struct rb_node *prev; + struct rb_node *ret; + ret = __tree_search(root, offset, &prev); + if (!ret) + return prev; + return ret; +} + +static int tree_delete(struct rb_root *root, u64 offset) +{ + struct rb_node *node; + struct tree_entry *entry; + + node = __tree_search(root, offset, NULL); + if (!node) + return -ENOENT; + entry = rb_entry(node, struct tree_entry, rb_node); + entry->in_tree = 0; + rb_erase(node, root); + return 0; +} + +/* + * add_extent_mapping tries a simple backward merge with existing + * mappings. The extent_map struct passed in will be inserted into + * the tree directly (no copies made, just a reference taken). + */ +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em) +{ + int ret = 0; + struct extent_map *prev = NULL; + struct rb_node *rb; + + write_lock_irq(&tree->lock); + rb = tree_insert(&tree->map, em->end, &em->rb_node); + if (rb) { + prev = rb_entry(rb, struct extent_map, rb_node); + printk("found extent map %Lu %Lu on insert of %Lu %Lu\n", prev->start, prev->end, em->start, em->end); + ret = -EEXIST; + goto out; + } + atomic_inc(&em->refs); + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + prev = rb_entry(rb, struct extent_map, rb_node); + if (prev && prev->end + 1 == em->start && + ((em->block_start == 0 && prev->block_start == 0) || + (em->block_start == prev->block_end + 1))) { + em->start = prev->start; + em->block_start = prev->block_start; + rb_erase(&prev->rb_node, &tree->map); + prev->in_tree = 0; + free_extent_map(prev); + } + } +out: + write_unlock_irq(&tree->lock); + return ret; +} +EXPORT_SYMBOL(add_extent_mapping); + +/* + * lookup_extent_mapping returns the first extent_map struct in the + * tree that intersects the [start, end] (inclusive) range. There may + * be additional objects in the tree that intersect, so check the object + * returned carefully to make sure you don't need additional lookups. + */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 end) +{ + struct extent_map *em; + struct rb_node *rb_node; + + read_lock_irq(&tree->lock); + rb_node = tree_search(&tree->map, start); + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_PTR(PTR_ERR(rb_node)); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + if (em->end < start || em->start > end) { + em = NULL; + goto out; + } + atomic_inc(&em->refs); +out: + read_unlock_irq(&tree->lock); + return em; +} +EXPORT_SYMBOL(lookup_extent_mapping); + +/* + * removes an extent_map struct from the tree. No reference counts are + * dropped, and no checks are done to see if the range is in use + */ +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +{ + int ret; + + write_lock_irq(&tree->lock); + ret = tree_delete(&tree->map, em->end); + write_unlock_irq(&tree->lock); + return ret; +} +EXPORT_SYMBOL(remove_extent_mapping); + +/* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single + * extent in the tree. Extents with EXTENT_IO in their state field + * are not merged because the end_io handlers need to be able to do + * operations on them without sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. + */ +static int merge_state(struct extent_map_tree *tree, + struct extent_state *state) +{ + struct extent_state *other; + struct rb_node *other_node; + + if (state->state & EXTENT_IOBITS) + return 0; + + other_node = rb_prev(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { + state->start = other->start; + other->in_tree = 0; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); + } + } + other_node = rb_next(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { + other->start = state->start; + state->in_tree = 0; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); + } + } + return 0; +} + +/* + * insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_map_tree *tree, + struct extent_state *state, u64 start, u64 end, + int bits) +{ + struct rb_node *node; + + if (end < start) { + printk("end < start %Lu %Lu\n", end, start); + WARN_ON(1); + } + state->state |= bits; + state->start = start; + state->end = end; + if ((end & 4095) == 0) { + printk("insert state %Lu %Lu strange end\n", start, end); + WARN_ON(1); + } + node = tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); +printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end); + free_extent_state(state); + return -EEXIST; + } + merge_state(tree, state); + return 0; +} + +/* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. + */ +static int split_state(struct extent_map_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *node; + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + if ((prealloc->end & 4095) == 0) { + printk("insert state %Lu %Lu strange end\n", prealloc->start, + prealloc->end); + WARN_ON(1); + } + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); +printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end); + free_extent_state(prealloc); + return -EEXIST; + } + return 0; +} + +/* + * utility function to clear some bits in an extent state struct. + * it will optionally wake up any one waiting on this state (wake == 1), or + * forcibly remove the state from the tree (delete == 1). + * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static int clear_state_bit(struct extent_map_tree *tree, + struct extent_state *state, int bits, int wake, + int delete) +{ + int ret = state->state & bits; + state->state &= ~bits; + if (wake) + wake_up(&state->wq); + if (delete || state->state == 0) { + if (state->in_tree) { + rb_erase(&state->rb_node, &tree->state); + state->in_tree = 0; + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + } + return ret; +} + +/* + * clear some bits on a range in the tree. This may require splitting + * or inserting elements in the tree, so the gfp mask is used to + * indicate which allocations or sleeping are allowed. + * + * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove + * the given range from the tree regardless of state (ie for truncate). + * + * the range [start, end] is inclusive. + * + * This takes the tree lock, and returns < 0 on error, > 0 if any of the + * bits were already set, or zero if none of the bits were already set. + */ +int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err; + int set = 0; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + write_lock_irq(&tree->lock); + /* + * this search will find the extents that end after + * our range starts + */ + node = tree_search(&tree->state, start); + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > end) + goto out; + WARN_ON(state->end < start); + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip + * bits on second half. + * + * If the extent we found extends past our range, we + * just split and search again. It'll get split again + * the next time though. + * + * If the extent we found is inside our range, we clear + * the desired bit on it. + */ + + if (state->start < start) { + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + start = state->end + 1; + set |= clear_state_bit(tree, state, bits, + wake, delete); + } else { + start = state->start; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + if (wake) + wake_up(&state->wq); + set |= clear_state_bit(tree, prealloc, bits, + wake, delete); + prealloc = NULL; + goto out; + } + + start = state->end + 1; + set |= clear_state_bit(tree, state, bits, wake, delete); + goto search_again; + +out: + write_unlock_irq(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return set; + +search_again: + if (start >= end) + goto out; + write_unlock_irq(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} +EXPORT_SYMBOL(clear_extent_bit); + +static int wait_on_state(struct extent_map_tree *tree, + struct extent_state *state) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + read_unlock_irq(&tree->lock); + schedule(); + read_lock_irq(&tree->lock); + finish_wait(&state->wq, &wait); + return 0; +} + +/* + * waits for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. + * The tree lock is taken by this function + */ +int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits) +{ + struct extent_state *state; + struct rb_node *node; + + read_lock_irq(&tree->lock); +again: + while (1) { + /* + * this search will find all the extents that end after + * our range starts + */ + node = tree_search(&tree->state, start); + if (!node) + break; + + state = rb_entry(node, struct extent_state, rb_node); + + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + atomic_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (need_resched()) { + read_unlock_irq(&tree->lock); + cond_resched(); + read_lock_irq(&tree->lock); + } + } +out: + read_unlock_irq(&tree->lock); + return 0; +} +EXPORT_SYMBOL(wait_extent_bit); + +/* + * set some bits on a range in the tree. This may require allocations + * or sleeping, so the gfp mask is used to indicate what is allowed. + * + * If 'exclusive' == 1, this will fail with -EEXIST if some part of the + * range already has the desired bits set. The start of the existing + * range is returned in failed_start in this case. + * + * [start, end] is inclusive + * This takes the tree lock. + */ +int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits, + int exclusive, u64 *failed_start, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + int set; + u64 last_start; + u64 last_end; +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + write_lock_irq(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(&tree->state, start); + if (!node) { + err = insert_state(tree, prealloc, start, end, bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + + state = rb_entry(node, struct extent_state, rb_node); + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + set = state->state & bits; + if (set && exclusive) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } + state->state |= bits; + start = state->end + 1; + merge_state(tree, state); + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + state->state |= bits; + start = state->end + 1; + merge_state(tree, state); + } else { + start = state->start; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + prealloc->state |= bits; + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start -1; + err = insert_state(tree, prealloc, start, this_end, + bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + if (err) + goto out; + start = this_end + 1; + goto search_again; + } + goto search_again; + +out: + write_unlock_irq(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + write_unlock_irq(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} +EXPORT_SYMBOL(set_extent_bit); + +/* wrappers around set/clear extent bit */ +int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + mask); +} +EXPORT_SYMBOL(set_extent_dirty); + +int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_DIRTY, 0, 0, mask); +} +EXPORT_SYMBOL(clear_extent_dirty); + +int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + mask); +} +EXPORT_SYMBOL(set_extent_new); + +int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); +} +EXPORT_SYMBOL(clear_extent_new); + +int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, + mask); +} +EXPORT_SYMBOL(set_extent_uptodate); + +int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); +} +EXPORT_SYMBOL(clear_extent_uptodate); + +int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, + 0, NULL, mask); +} +EXPORT_SYMBOL(set_extent_writeback); + +int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); +} +EXPORT_SYMBOL(clear_extent_writeback); + +int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end) +{ + return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); +} +EXPORT_SYMBOL(wait_on_extent_writeback); + +/* + * locks a range in ascending order, waiting for any locked regions + * it hits on the way. [start,end] are inclusive, and this will sleep. + */ +int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask) +{ + int err; + u64 failed_start; + while (1) { + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, + &failed_start, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; + } else { + break; + } + WARN_ON(start > end); + } + return err; +} +EXPORT_SYMBOL(lock_extent); + +int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); +} +EXPORT_SYMBOL(unlock_extent); + +/* + * helper function to set pages and extents in the tree dirty + */ +int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + set_extent_dirty(tree, start, end, GFP_NOFS); + return 0; +} +EXPORT_SYMBOL(set_range_dirty); + +/* + * helper function to set both pages and extents in the tree writeback + */ +int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + set_page_writeback(page); + page_cache_release(page); + index++; + } + set_extent_writeback(tree, start, end, GFP_NOFS); + return 0; +} +EXPORT_SYMBOL(set_range_writeback); + +/* + * helper function to lock both pages and extents in the tree. + * pages must be locked first. + */ +int lock_range(struct extent_map_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + int err; + + while (index <= end_index) { + page = grab_cache_page(tree->mapping, index); + if (!page) { + err = -ENOMEM; + goto failed; + } + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto failed; + } + index++; + } + lock_extent(tree, start, end, GFP_NOFS); + return 0; + +failed: + /* + * we failed above in getting the page at 'index', so we undo here + * up to but not including the page at 'index' + */ + end_index = index; + index = start >> PAGE_CACHE_SHIFT; + while (index < end_index) { + page = find_get_page(tree->mapping, index); + unlock_page(page); + page_cache_release(page); + index++; + } + return err; +} +EXPORT_SYMBOL(lock_range); + +/* + * helper function to unlock both pages and extents in the tree. + */ +int unlock_range(struct extent_map_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + unlock_page(page); + page_cache_release(page); + index++; + } + unlock_extent(tree, start, end, GFP_NOFS); + return 0; +} +EXPORT_SYMBOL(unlock_range); + +/* + * searches a range in the state tree for a given mask. + * If 'filled' == 1, this returns 1 only if ever extent in the tree + * has the bits set. Otherwise, 1 is returned if any bit in the + * range is found set. + */ +static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end, + int bits, int filled) +{ + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + read_lock_irq(&tree->lock); + node = tree_search(&tree->state, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > end) + break; + + if (filled && state->start > start) { + bitset = 0; + break; + } + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + start = state->end + 1; + if (start > end) + break; + node = rb_next(node); + } + read_unlock_irq(&tree->lock); + return bitset; +} + +/* + * helper function to set a given page up to date if all the + * extents in the tree for that page are up to date + */ +static int check_page_uptodate(struct extent_map_tree *tree, + struct page *page) +{ + u64 start = page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) + SetPageUptodate(page); + return 0; +} + +/* + * helper function to unlock a page if all the extents in the tree + * for that page are unlocked + */ +static int check_page_locked(struct extent_map_tree *tree, + struct page *page) +{ + u64 start = page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) + unlock_page(page); + return 0; +} + +/* + * helper function to end page writeback if all the extents + * in the tree for that page are done with writeback + */ +static int check_page_writeback(struct extent_map_tree *tree, + struct page *page) +{ + u64 start = page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) + end_page_writeback(page); + return 0; +} + +/* lots and lots of room for performance fixes in the end_bio funcs */ + +/* + * after a writepage IO is done, we need to: + * clear the uptodate bits on error + * clear the writeback bits in the extent tree for this IO + * end_page_writeback if the page has no more pending IO + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static int end_bio_extent_writepage(struct bio *bio, + unsigned int bytes_done, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_map_tree *tree = bio->bi_private; + u64 start; + u64 end; + int whole_page; + + if (bio->bi_size) + return 1; + + do { + struct page *page = bvec->bv_page; + start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, GFP_ATOMIC); + ClearPageUptodate(page); + SetPageError(page); + } + clear_extent_writeback(tree, start, end, GFP_ATOMIC); + + if (whole_page) + end_page_writeback(page); + else + check_page_writeback(tree, page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); + return 0; +} + +/* + * after a readpage IO is done, we need to: + * clear the uptodate bits on error + * set the uptodate bits if things worked + * set the page up to date if all extents in the tree are uptodate + * clear the lock bit in the extent tree + * unlock the page if there are no other extents locked for it + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static int end_bio_extent_readpage(struct bio *bio, + unsigned int bytes_done, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_map_tree *tree = bio->bi_private; + u64 start; + u64 end; + int whole_page; + + if (bio->bi_size) + return 1; + + do { + struct page *page = bvec->bv_page; + start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate) { + set_extent_uptodate(tree, start, end, GFP_ATOMIC); + if (whole_page) + SetPageUptodate(page); + else + check_page_uptodate(tree, page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + + unlock_extent(tree, start, end, GFP_ATOMIC); + + if (whole_page) + unlock_page(page); + else + check_page_locked(tree, page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); + return 0; +} + +/* + * IO done from prepare_write is pretty simple, we just unlock + * the structs in the extent tree when done, and set the uptodate bits + * as appropriate. + */ +static int end_bio_extent_preparewrite(struct bio *bio, + unsigned int bytes_done, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_map_tree *tree = bio->bi_private; + u64 start; + u64 end; + + if (bio->bi_size) + return 1; + + do { + struct page *page = bvec->bv_page; + start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate) { + set_extent_uptodate(tree, start, end, GFP_ATOMIC); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + + unlock_extent(tree, start, end, GFP_ATOMIC); + + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); + return 0; +} + +static int submit_extent_page(int rw, struct extent_map_tree *tree, + struct page *page, sector_t sector, + size_t size, unsigned long offset, + struct block_device *bdev, + bio_end_io_t end_io_func) +{ + struct bio *bio; + int ret = 0; + + bio = bio_alloc(GFP_NOIO, 1); + + bio->bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_io_vec[0].bv_page = page; + bio->bi_io_vec[0].bv_len = size; + bio->bi_io_vec[0].bv_offset = offset; + + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = size; + + bio->bi_end_io = end_io_func; + bio->bi_private = tree; + + bio_get(bio); + submit_bio(rw, bio); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + + bio_put(bio); + return ret; +} + +/* + * basic readpage implementation. Locked extent state structs are inserted + * into the tree that are removed when the IO is done (by the end_io + * handlers) + */ +int extent_read_full_page(struct extent_map_tree *tree, struct page *page, + get_extent_t *get_extent) +{ + struct inode *inode = page->mapping->host; + u64 start = page->index << PAGE_CACHE_SHIFT; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 cur_end; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t page_offset = 0; + size_t iosize; + size_t blocksize = inode->i_sb->s_blocksize; + + if (!PagePrivate(page)) { + SetPagePrivate(page); + set_page_private(page, 1); + page_cache_get(page); + } + + end = page_end; + lock_extent(tree, start, end, GFP_NOFS); + + while (cur <= end) { + if (cur >= last_byte) { + iosize = PAGE_CACHE_SIZE - page_offset; + zero_user_page(page, page_offset, iosize, KM_USER0); + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + break; + } + em = get_extent(inode, page, page_offset, cur, end, 0); + if (IS_ERR(em) || !em) { + SetPageError(page); + unlock_extent(tree, cur, end, GFP_NOFS); + break; + } + + extent_offset = cur - em->start; + BUG_ON(em->end < cur); + BUG_ON(end < cur); + + iosize = min(em->end - cur, end - cur) + 1; + cur_end = min(em->end, end); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + sector = (em->block_start + extent_offset) >> 9; + bdev = em->bdev; + block_start = em->block_start; + free_extent_map(em); + em = NULL; + + /* we've found a hole, just zero and go on */ + if (block_start == 0) { + zero_user_page(page, page_offset, iosize, KM_USER0); + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + /* the get_extent function already copied into the page */ + if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, bdev, + end_bio_extent_readpage); + if (ret) + SetPageError(page); + cur = cur + iosize; + page_offset += iosize; + nr++; + } + if (!nr) { + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + return 0; +} +EXPORT_SYMBOL(extent_read_full_page); + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +int extent_write_full_page(struct extent_map_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + u64 start = page->index << PAGE_CACHE_SHIFT; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t page_offset = 0; + size_t iosize; + size_t blocksize; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + + if (page->index > end_index) { + clear_extent_dirty(tree, start, page_end, GFP_NOFS); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + size_t offset = i_size & (PAGE_CACHE_SIZE - 1); + zero_user_page(page, offset, + PAGE_CACHE_SIZE - offset, KM_USER0); + } + + if (!PagePrivate(page)) { + SetPagePrivate(page); + set_page_private(page, 1); + page_cache_get(page); + } + + end = page_end; + lock_extent(tree, start, page_end, GFP_NOFS); + + if (last_byte <= start) { + clear_extent_dirty(tree, start, page_end, GFP_NOFS); + goto done; + } + + set_extent_uptodate(tree, start, page_end, GFP_NOFS); + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + if (cur >= last_byte) { + clear_extent_dirty(tree, cur, page_end, GFP_NOFS); + break; + } + em = get_extent(inode, page, page_offset, cur, end, 1); + if (IS_ERR(em) || !em) { + SetPageError(page); + break; + } + + extent_offset = cur - em->start; + BUG_ON(em->end < cur); + BUG_ON(end < cur); + iosize = min(em->end - cur, end - cur) + 1; + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + sector = (em->block_start + extent_offset) >> 9; + bdev = em->bdev; + block_start = em->block_start; + free_extent_map(em); + em = NULL; + + if (block_start == 0 || block_start == EXTENT_MAP_INLINE) { + clear_extent_dirty(tree, cur, + cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + + /* leave this out until we have a page_mkwrite call */ + if (0 && !test_range_bit(tree, cur, cur + iosize - 1, + EXTENT_DIRTY, 0)) { + cur = cur + iosize; + page_offset += iosize; + continue; + } + clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); + set_range_writeback(tree, cur, cur + iosize - 1); + ret = submit_extent_page(WRITE, tree, page, + sector, iosize, page_offset, bdev, + end_bio_extent_writepage); + if (ret) + SetPageError(page); + cur = cur + iosize; + page_offset += iosize; + nr++; + } +done: + WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0)); + unlock_extent(tree, start, page_end, GFP_NOFS); + unlock_page(page); + return 0; +} +EXPORT_SYMBOL(extent_write_full_page); + +/* + * basic invalidatepage code, this waits on any locked or writeback + * ranges corresponding to the page, and then deletes any extent state + * records from the tree + */ +int extent_invalidatepage(struct extent_map_tree *tree, + struct page *page, unsigned long offset) +{ + u64 start = (page->index << PAGE_CACHE_SHIFT); + u64 end = start + PAGE_CACHE_SIZE - 1; + size_t blocksize = page->mapping->host->i_sb->s_blocksize; + + start += (offset + blocksize -1) & ~(blocksize - 1); + if (start > end) + return 0; + + lock_extent(tree, start, end, GFP_NOFS); + wait_on_extent_writeback(tree, start, end); + clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY, + 1, 1, GFP_NOFS); + return 0; +} +EXPORT_SYMBOL(extent_invalidatepage); + +/* + * simple commit_write call, set_range_dirty is used to mark both + * the pages and the extent records as dirty + */ +int extent_commit_write(struct extent_map_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + if (!PagePrivate(page)) { + SetPagePrivate(page); + set_page_private(page, 1); + page_cache_get(page); + } + + set_page_dirty(page); + + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + return 0; +} +EXPORT_SYMBOL(extent_commit_write); + +int extent_prepare_write(struct extent_map_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent) +{ + u64 page_start = page->index << PAGE_CACHE_SHIFT; + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + u64 block_start; + u64 orig_block_start; + u64 block_end; + u64 cur_end; + struct extent_map *em; + unsigned blocksize = 1 << inode->i_blkbits; + size_t page_offset = 0; + size_t block_off_start; + size_t block_off_end; + int err = 0; + int iocount = 0; + int ret = 0; + int isnew; + + if (!PagePrivate(page)) { + SetPagePrivate(page); + set_page_private(page, 1); + page_cache_get(page); + } + block_start = (page_start + from) & ~((u64)blocksize - 1); + block_end = (page_start + to - 1) | (blocksize - 1); + orig_block_start = block_start; + + lock_extent(tree, page_start, page_end, GFP_NOFS); + while(block_start <= block_end) { + em = get_extent(inode, page, page_offset, block_start, + block_end, 1); + if (IS_ERR(em) || !em) { + goto err; + } + cur_end = min(block_end, em->end); + block_off_start = block_start & (PAGE_CACHE_SIZE - 1); + block_off_end = block_off_start + blocksize; + isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); + + if (!PageUptodate(page) && isnew && + (block_off_end > to || block_off_start < from)) { + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + if (block_off_end > to) + memset(kaddr + to, 0, block_off_end - to); + if (block_off_start < from) + memset(kaddr + block_off_start, 0, + from - block_off_start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + if (!isnew && !PageUptodate(page) && + (block_off_end > to || block_off_start < from) && + !test_range_bit(tree, block_start, cur_end, + EXTENT_UPTODATE, 1)) { + u64 sector; + u64 extent_offset = block_start - em->start; + size_t iosize; + sector = (em->block_start + extent_offset) >> 9; + iosize = (cur_end - block_start + blocksize - 1) & + ~((u64)blocksize - 1); + /* + * we've already got the extent locked, but we + * need to split the state such that our end_bio + * handler can clear the lock. + */ + set_extent_bit(tree, block_start, + block_start + iosize - 1, + EXTENT_LOCKED, 0, NULL, GFP_NOFS); + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, em->bdev, + end_bio_extent_preparewrite); + iocount++; + block_start = block_start + iosize; + } else { + set_extent_uptodate(tree, block_start, cur_end, + GFP_NOFS); + unlock_extent(tree, block_start, cur_end, GFP_NOFS); + block_start = cur_end + 1; + } + page_offset = block_start & (PAGE_CACHE_SIZE - 1); + free_extent_map(em); + } + if (iocount) { + wait_extent_bit(tree, orig_block_start, + block_end, EXTENT_LOCKED); + } + check_page_uptodate(tree, page); +err: + /* FIXME, zero out newly allocated blocks on error */ + return err; +} +EXPORT_SYMBOL(extent_prepare_write); + +/* + * a helper for releasepage. As long as there are no locked extents + * in the range corresponding to the page, both state records and extent + * map records are removed + */ +int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page) +{ + struct extent_map *em; + u64 start = page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + u64 orig_start = start; + + while (start <= end) { + em = lookup_extent_mapping(tree, start, end); + if (!em || IS_ERR(em)) + break; + if (test_range_bit(tree, em->start, em->end, + EXTENT_LOCKED, 0)) { + free_extent_map(em); + start = em->end + 1; +printk("range still locked %Lu %Lu\n", em->start, em->end); + break; + } + remove_extent_mapping(tree, em); + start = em->end + 1; + /* once for the rb tree */ + free_extent_map(em); + /* once for us */ + free_extent_map(em); + } + WARN_ON(test_range_bit(tree, orig_start, end, EXTENT_WRITEBACK, 0)); + clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE, + 1, 1, GFP_NOFS); + return 1; +} +EXPORT_SYMBOL(try_release_extent_mapping); + -- cgit v1.2.3 From b888db2bd7b67f190b32934e6a86181f262ac3ec Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Aug 2007 16:49:44 -0400 Subject: Btrfs: Add delayed allocation to the extent based page tree code Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 147 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 112 insertions(+), 35 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index d378edf0964..a9c7419615b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -30,9 +30,6 @@ struct tree_entry { #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) -static LIST_HEAD(all_states); -spinlock_t state_lock = SPIN_LOCK_UNLOCKED; - void __init extent_map_init(void) { extent_map_cache = kmem_cache_create("extent_map", @@ -49,15 +46,6 @@ void __init extent_map_init(void) void __exit extent_map_exit(void) { - while(!list_empty(&all_states)) { - struct extent_state *state; - struct list_head *cur = all_states.next; - state = list_entry(cur, struct extent_state, list); - printk("found leaked state %Lu %Lu state %d in_tree %d\n", - state->start, state->end, state->state, state->in_tree); - list_del(&state->list); - kfree(state); - } if (extent_map_cache) kmem_cache_destroy(extent_map_cache); if (extent_state_cache) @@ -69,6 +57,7 @@ void extent_map_tree_init(struct extent_map_tree *tree, { tree->map.rb_node = NULL; tree->state.rb_node = NULL; + tree->fill_delalloc = NULL; rwlock_init(&tree->lock); tree->mapping = mapping; } @@ -106,9 +95,6 @@ struct extent_state *alloc_extent_state(gfp_t mask) state->in_tree = 0; atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); - spin_lock_irq(&state_lock); - list_add(&state->list, &all_states); - spin_unlock_irq(&state_lock); return state; } EXPORT_SYMBOL(alloc_extent_state); @@ -117,9 +103,6 @@ void free_extent_state(struct extent_state *state) { if (atomic_dec_and_test(&state->refs)) { WARN_ON(state->in_tree); - spin_lock_irq(&state_lock); - list_del_init(&state->list); - spin_unlock_irq(&state_lock); kmem_cache_free(extent_state_cache, state); } } @@ -369,7 +352,7 @@ static int insert_state(struct extent_map_tree *tree, if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); -printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end); + printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end); free_extent_state(state); return -EEXIST; } @@ -408,7 +391,7 @@ static int split_state(struct extent_map_tree *tree, struct extent_state *orig, if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); -printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end); + printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end); free_extent_state(prealloc); return -EEXIST; } @@ -792,10 +775,20 @@ int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, } EXPORT_SYMBOL(set_extent_dirty); +int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL, + mask); +} +EXPORT_SYMBOL(set_extent_delalloc); + int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_DIRTY, 0, 0, mask); + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); } EXPORT_SYMBOL(clear_extent_dirty); @@ -922,6 +915,62 @@ int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end) } EXPORT_SYMBOL(set_range_writeback); +u64 find_lock_delalloc_range(struct extent_map_tree *tree, + u64 start, u64 lock_start, u64 *end, u64 max_bytes) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = start; + u64 found = 0; + u64 total_bytes = 0; + + write_lock_irq(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ +search_again: + node = tree_search(&tree->state, cur_start); + if (!node || IS_ERR(node)) { + goto out; + } + + while(1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != cur_start) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + goto out; + } + if (state->start >= lock_start) { + if (state->state & EXTENT_LOCKED) { + DEFINE_WAIT(wait); + atomic_inc(&state->refs); + write_unlock_irq(&tree->lock); + schedule(); + write_lock_irq(&tree->lock); + finish_wait(&state->wq, &wait); + free_extent_state(state); + goto search_again; + } + state->state |= EXTENT_LOCKED; + } + found++; + *end = state->end; + cur_start = state->end + 1; + node = rb_next(node); + if (!node) + break; + total_bytes = state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + } +out: + write_unlock_irq(&tree->lock); + return found; +} + /* * helper function to lock both pages and extents in the tree. * pages must be locked first. @@ -1285,6 +1334,7 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page, if (!PagePrivate(page)) { SetPagePrivate(page); set_page_private(page, 1); + WARN_ON(!page->mapping->a_ops->invalidatepage); page_cache_get(page); } @@ -1384,7 +1434,10 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, size_t blocksize; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + u64 nr_delalloc; + u64 delalloc_end; + WARN_ON(!PageLocked(page)); if (page->index > end_index) { clear_extent_dirty(tree, start, page_end, GFP_NOFS); unlock_page(page); @@ -1400,11 +1453,34 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, if (!PagePrivate(page)) { SetPagePrivate(page); set_page_private(page, 1); + WARN_ON(!page->mapping->a_ops->invalidatepage); page_cache_get(page); } - end = page_end; lock_extent(tree, start, page_end, GFP_NOFS); + nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc) { + tree->fill_delalloc(inode, start, delalloc_end); + if (delalloc_end >= page_end + 1) { + clear_extent_bit(tree, page_end + 1, delalloc_end, + EXTENT_LOCKED | EXTENT_DELALLOC, + 1, 0, GFP_NOFS); + } + clear_extent_bit(tree, start, page_end, EXTENT_DELALLOC, + 0, 0, GFP_NOFS); + if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { + printk("found delalloc bits after clear extent_bit\n"); + } + } else if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { + printk("found delalloc bits after find_delalloc_range returns 0\n"); + } + + end = page_end; + if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { + printk("found delalloc bits after lock_extent\n"); + } if (last_byte <= start) { clear_extent_dirty(tree, start, page_end, GFP_NOFS); @@ -1419,7 +1495,7 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, clear_extent_dirty(tree, cur, page_end, GFP_NOFS); break; } - em = get_extent(inode, page, page_offset, cur, end, 1); + em = get_extent(inode, page, page_offset, cur, end, 0); if (IS_ERR(em) || !em) { SetPageError(page); break; @@ -1507,6 +1583,7 @@ int extent_commit_write(struct extent_map_tree *tree, if (!PagePrivate(page)) { SetPagePrivate(page); set_page_private(page, 1); + WARN_ON(!page->mapping->a_ops->invalidatepage); page_cache_get(page); } @@ -1543,6 +1620,7 @@ int extent_prepare_write(struct extent_map_tree *tree, if (!PagePrivate(page)) { SetPagePrivate(page); set_page_private(page, 1); + WARN_ON(!page->mapping->a_ops->invalidatepage); page_cache_get(page); } block_start = (page_start + from) & ~((u64)blocksize - 1); @@ -1628,29 +1706,28 @@ int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page) u64 start = page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; u64 orig_start = start; + int ret = 1; while (start <= end) { em = lookup_extent_mapping(tree, start, end); if (!em || IS_ERR(em)) break; - if (test_range_bit(tree, em->start, em->end, - EXTENT_LOCKED, 0)) { + if (!test_range_bit(tree, em->start, em->end, + EXTENT_LOCKED, 0)) { + remove_extent_mapping(tree, em); + /* once for the rb tree */ free_extent_map(em); - start = em->end + 1; -printk("range still locked %Lu %Lu\n", em->start, em->end); - break; } - remove_extent_mapping(tree, em); start = em->end + 1; - /* once for the rb tree */ - free_extent_map(em); /* once for us */ free_extent_map(em); } - WARN_ON(test_range_bit(tree, orig_start, end, EXTENT_WRITEBACK, 0)); - clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE, - 1, 1, GFP_NOFS); - return 1; + if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0)) + ret = 0; + else + clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE, + 1, 1, GFP_NOFS); + return ret; } EXPORT_SYMBOL(try_release_extent_mapping); -- cgit v1.2.3 From 07157aacb1ecd394a5494910b5f80d7e7d407d37 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 30 Aug 2007 08:50:51 -0400 Subject: Btrfs: Add file data csums back in via hooks in the extent map code Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 94 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a9c7419615b..6053f9db073 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -57,7 +57,7 @@ void extent_map_tree_init(struct extent_map_tree *tree, { tree->map.rb_node = NULL; tree->state.rb_node = NULL; - tree->fill_delalloc = NULL; + tree->ops = NULL; rwlock_init(&tree->lock); tree->mapping = mapping; } @@ -93,6 +93,7 @@ struct extent_state *alloc_extent_state(gfp_t mask) return state; state->state = 0; state->in_tree = 0; + state->private = 0; atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); return state; @@ -1034,6 +1035,61 @@ int unlock_range(struct extent_map_tree *tree, u64 start, u64 end) } EXPORT_SYMBOL(unlock_range); +int set_state_private(struct extent_map_tree *tree, u64 start, u64 private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + write_lock_irq(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(&tree->state, start); + if (!node || IS_ERR(node)) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + state->private = private; +out: + write_unlock_irq(&tree->lock); + return ret; + +} + +int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + read_lock_irq(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(&tree->state, start); + if (!node || IS_ERR(node)) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + *private = state->private; +out: + read_unlock_irq(&tree->lock); + return ret; +} + /* * searches a range in the state tree for a given mask. * If 'filled' == 1, this returns 1 only if ever extent in the tree @@ -1185,12 +1241,13 @@ static int end_bio_extent_writepage(struct bio *bio, static int end_bio_extent_readpage(struct bio *bio, unsigned int bytes_done, int err) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct extent_map_tree *tree = bio->bi_private; u64 start; u64 end; int whole_page; + int ret; if (bio->bi_size) return 1; @@ -1208,6 +1265,11 @@ static int end_bio_extent_readpage(struct bio *bio, if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); + if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { + ret = tree->ops->readpage_end_io_hook(page, start, end); + if (ret) + uptodate = 0; + } if (uptodate) { set_extent_uptodate(tree, start, end, GFP_ATOMIC); if (whole_page) @@ -1388,9 +1450,16 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page, continue; } - ret = submit_extent_page(READ, tree, page, - sector, iosize, page_offset, bdev, - end_bio_extent_readpage); + ret = 0; + if (tree->ops && tree->ops->readpage_io_hook) { + ret = tree->ops->readpage_io_hook(page, cur, + cur + iosize - 1); + } + if (!ret) { + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, + bdev, end_bio_extent_readpage); + } if (ret) SetPageError(page); cur = cur + iosize; @@ -1462,7 +1531,7 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, &delalloc_end, 128 * 1024 * 1024); if (nr_delalloc) { - tree->fill_delalloc(inode, start, delalloc_end); + tree->ops->fill_delalloc(inode, start, delalloc_end); if (delalloc_end >= page_end + 1) { clear_extent_bit(tree, page_end + 1, delalloc_end, EXTENT_LOCKED | EXTENT_DELALLOC, @@ -1528,12 +1597,17 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, continue; } clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); - set_range_writeback(tree, cur, cur + iosize - 1); - ret = submit_extent_page(WRITE, tree, page, - sector, iosize, page_offset, bdev, - end_bio_extent_writepage); + ret = tree->ops->writepage_io_hook(page, cur, cur + iosize - 1); if (ret) SetPageError(page); + else { + set_range_writeback(tree, cur, cur + iosize - 1); + ret = submit_extent_page(WRITE, tree, page, sector, + iosize, page_offset, bdev, + end_bio_extent_writepage); + if (ret) + SetPageError(page); + } cur = cur + iosize; page_offset += iosize; nr++; -- cgit v1.2.3 From 2bf5a725a3b82efeaf7b292c085e69a9388a89ea Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 30 Aug 2007 11:54:02 -0400 Subject: Btrfs: fsx delalloc fixes Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6053f9db073..8ad6f8efc5a 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -77,6 +77,8 @@ EXPORT_SYMBOL(alloc_extent_map); void free_extent_map(struct extent_map *em) { + if (!em) + return; if (atomic_dec_and_test(&em->refs)) { WARN_ON(em->in_tree); kmem_cache_free(extent_map_cache, em); @@ -102,6 +104,8 @@ EXPORT_SYMBOL(alloc_extent_state); void free_extent_state(struct extent_state *state) { + if (!state) + return; if (atomic_dec_and_test(&state->refs)) { WARN_ON(state->in_tree); kmem_cache_free(extent_state_cache, state); @@ -1395,8 +1399,8 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page, if (!PagePrivate(page)) { SetPagePrivate(page); - set_page_private(page, 1); WARN_ON(!page->mapping->a_ops->invalidatepage); + set_page_private(page, 1); page_cache_get(page); } @@ -1638,7 +1642,8 @@ int extent_invalidatepage(struct extent_map_tree *tree, lock_extent(tree, start, end, GFP_NOFS); wait_on_extent_writeback(tree, start, end); - clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY, + clear_extent_bit(tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 1, 1, GFP_NOFS); return 0; } -- cgit v1.2.3 From 86479a04eef8f304a13aeb8b64bcc8e506a68268 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 10 Sep 2007 19:58:16 -0400 Subject: Add support for defragging files via btrfsctl -d. Avoid OOM on extent tree defrag. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 8ad6f8efc5a..33f7a18dddf 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -10,6 +10,12 @@ #include #include "extent_map.h" +/* temporary define until extent_map moves out of btrfs */ +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *, struct kmem_cache *, + unsigned long)); + static struct kmem_cache *extent_map_cache; static struct kmem_cache *extent_state_cache; @@ -32,14 +38,12 @@ struct tree_entry { void __init extent_map_init(void) { - extent_map_cache = kmem_cache_create("extent_map", - sizeof(struct extent_map), 0, - SLAB_RECLAIM_ACCOUNT | + extent_map_cache = btrfs_cache_create("extent_map", + sizeof(struct extent_map), SLAB_DESTROY_BY_RCU, NULL); - extent_state_cache = kmem_cache_create("extent_state", - sizeof(struct extent_state), 0, - SLAB_RECLAIM_ACCOUNT | + extent_state_cache = btrfs_cache_create("extent_state", + sizeof(struct extent_state), SLAB_DESTROY_BY_RCU, NULL); } -- cgit v1.2.3 From a8c450b211c010ac55190da23ceb6b39b393f411 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 10 Sep 2007 20:00:27 -0400 Subject: Btrfs: Reorder tests in set_extent_bit to properly find holes Yan Zheng noticed that set_extent_bit was exiting too early when there was a hole in the map. The fix is to reorder the tests to check for the hole first. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 33f7a18dddf..370ed97e52f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -712,28 +712,6 @@ again: } goto search_again; } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - set = state->state & bits; - if (exclusive && set) { - *failed_start = start; - err = -EEXIST; - goto out; - } - err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); - - prealloc->state |= bits; - merge_state(tree, prealloc); - prealloc = NULL; - goto out; - } - /* * | ---- desired range ---- | * | state | or | state | @@ -756,6 +734,28 @@ again: start = this_end + 1; goto search_again; } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + prealloc->state |= bits; + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + goto search_again; out: -- cgit v1.2.3 From 90f1c19a9fd2a943adc69d2b9b8c83bcc4bba6f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Sep 2007 20:02:27 -0400 Subject: Btrfs: [PATCH] extent_map: fix locking for bio completion The bio completion handlers can be run in any context, e.g. when using the old ide driver they run in hardirq context with irqs disabled so lockdep rightfully warns about using write_lock_irq useage in these handlers. This patch switches clear_extent_bit and set_extent_bit to write_lock_irqsave to fix this problem. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 370ed97e52f..3021564ac62 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -456,6 +456,7 @@ int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + unsigned long flags; int err; int set = 0; @@ -466,7 +467,7 @@ again: return -ENOMEM; } - write_lock_irq(&tree->lock); + write_lock_irqsave(&tree->lock, flags); /* * this search will find the extents that end after * our range starts @@ -533,7 +534,7 @@ again: goto search_again; out: - write_unlock_irq(&tree->lock); + write_unlock_irqrestore(&tree->lock, flags); if (prealloc) free_extent_state(prealloc); @@ -542,7 +543,7 @@ out: search_again: if (start >= end) goto out; - write_unlock_irq(&tree->lock); + write_unlock_irqrestore(&tree->lock, flags); if (mask & __GFP_WAIT) cond_resched(); goto again; @@ -628,6 +629,7 @@ int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + unsigned long flags; int err = 0; int set; u64 last_start; @@ -639,7 +641,7 @@ again: return -ENOMEM; } - write_lock_irq(&tree->lock); + write_lock_irqsave(&tree->lock, flags); /* * this search will find all the extents that end after * our range starts. @@ -759,7 +761,7 @@ again: goto search_again; out: - write_unlock_irq(&tree->lock); + write_unlock_irqrestore(&tree->lock, flags); if (prealloc) free_extent_state(prealloc); @@ -768,7 +770,7 @@ out: search_again: if (start > end) goto out; - write_unlock_irq(&tree->lock); + write_unlock_irqrestore(&tree->lock, flags); if (mask & __GFP_WAIT) cond_resched(); goto again; -- cgit v1.2.3 From d396c6f554d182b7bce4e2bb2acb47f80430c05f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Sep 2007 20:02:30 -0400 Subject: Btrfs: [PATCH] extent_map: provide generic bmap generic_bmap is completely trivial, while the extent to bh mapping in btrfs is rather complex. So provide a extent_bmap instead that takes a get_extent callback and can be used by filesystem using the extent_map code. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 3021564ac62..ff2b1dd25bf 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1816,3 +1816,22 @@ int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page) } EXPORT_SYMBOL(try_release_extent_mapping); +sector_t extent_bmap(struct address_space *mapping, sector_t iblock, + get_extent_t *get_extent) +{ + struct inode *inode = mapping->host; + u64 start = iblock << inode->i_blkbits; + u64 end = start + (1 << inode->i_blkbits) - 1; + struct extent_map *em; + + em = get_extent(inode, NULL, 0, start, end, 0); + if (!em || IS_ERR(em)) + return 0; + + // XXX(hch): block 0 is valid in some cases, e.g. XFS RT device + if (em->block_start == EXTENT_MAP_INLINE || + em->block_start == 0) + return 0; + + return (em->block_start + start - em->start) >> inode->i_blkbits; +} -- cgit v1.2.3 From b06355f0fe04a62d08238438654ec453e05304ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Sep 2007 20:02:32 -0400 Subject: Btrfs: [PATCH] extent_map: make the writepage_io hook optional The writepage_io is not mandatory, e.g. my port of xfs to the extent_map code does not have one for now. So handle a NULL pointer gracefully here. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index ff2b1dd25bf..162766773a3 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1607,7 +1607,12 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, continue; } clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); - ret = tree->ops->writepage_io_hook(page, cur, cur + iosize - 1); + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); + } else { + ret = 0; + } if (ret) SetPageError(page); else { -- cgit v1.2.3 From 0e2752a72cb37075b24899f01e9bc6a589de3b6c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Sep 2007 20:02:33 -0400 Subject: Btrfs: [PATCH] extent_map: add writepage_end_io hook XFS updates the ondisk inode size only after the data I/O has finished, so it needs a hook when the writepage end_bio handler has finished. Might not be worth applying as-is as the per-page callback is very ineffcient. What XFS really wants is a callback when writeout of a whole extent has completed. This delayed i_size updates scheme might be worthwile for btrfs aswell, btw. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 162766773a3..ab5bde31b68 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1231,6 +1231,8 @@ static int end_bio_extent_writepage(struct bio *bio, end_page_writeback(page); else check_page_writeback(tree, page); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, end); } while (bvec >= bio->bi_io_vec); bio_put(bio); -- cgit v1.2.3 From b3cfa35a493866cffd00952438b6c77364d42d3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Sep 2007 11:25:58 -0400 Subject: Btrfs: factor page private preparations into a helper Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index ab5bde31b68..0ab368e091f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1379,6 +1379,16 @@ static int submit_extent_page(int rw, struct extent_map_tree *tree, return ret; } +void set_page_extent_mapped(struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + WARN_ON(!page->mapping->a_ops->invalidatepage); + set_page_private(page, 1); + page_cache_get(page); + } +} + /* * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io @@ -1405,12 +1415,7 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page, size_t iosize; size_t blocksize = inode->i_sb->s_blocksize; - if (!PagePrivate(page)) { - SetPagePrivate(page); - WARN_ON(!page->mapping->a_ops->invalidatepage); - set_page_private(page, 1); - page_cache_get(page); - } + set_page_extent_mapped(page); end = page_end; lock_extent(tree, start, end, GFP_NOFS); @@ -1531,12 +1536,7 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, PAGE_CACHE_SIZE - offset, KM_USER0); } - if (!PagePrivate(page)) { - SetPagePrivate(page); - set_page_private(page, 1); - WARN_ON(!page->mapping->a_ops->invalidatepage); - page_cache_get(page); - } + set_page_extent_mapped(page); lock_extent(tree, start, page_end, GFP_NOFS); nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1, @@ -1672,13 +1672,7 @@ int extent_commit_write(struct extent_map_tree *tree, { loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - if (!PagePrivate(page)) { - SetPagePrivate(page); - set_page_private(page, 1); - WARN_ON(!page->mapping->a_ops->invalidatepage); - page_cache_get(page); - } - + set_page_extent_mapped(page); set_page_dirty(page); if (pos > inode->i_size) { @@ -1709,12 +1703,8 @@ int extent_prepare_write(struct extent_map_tree *tree, int ret = 0; int isnew; - if (!PagePrivate(page)) { - SetPagePrivate(page); - set_page_private(page, 1); - WARN_ON(!page->mapping->a_ops->invalidatepage); - page_cache_get(page); - } + set_page_extent_mapped(page); + block_start = (page_start + from) & ~((u64)blocksize - 1); block_end = (page_start + to - 1) | (blocksize - 1); orig_block_start = block_start; -- cgit v1.2.3 From 5f39d397dfbe140a14edecd4e73c34ce23c4f9ee Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:14:19 -0400 Subject: Btrfs: Create extent_buffer interface for large blocksizes Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 672 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 656 insertions(+), 16 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 0ab368e091f..55fefdfab84 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -26,16 +26,6 @@ struct tree_entry { struct rb_node rb_node; }; -/* bits for the extent state */ -#define EXTENT_DIRTY 1 -#define EXTENT_WRITEBACK (1 << 1) -#define EXTENT_UPTODATE (1 << 2) -#define EXTENT_LOCKED (1 << 3) -#define EXTENT_NEW (1 << 4) -#define EXTENT_DELALLOC (1 << 5) - -#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) - void __init extent_map_init(void) { extent_map_cache = btrfs_cache_create("extent_map", @@ -223,7 +213,8 @@ int add_extent_mapping(struct extent_map_tree *tree, if (rb) prev = rb_entry(rb, struct extent_map, rb_node); if (prev && prev->end + 1 == em->start && - ((em->block_start == 0 && prev->block_start == 0) || + ((em->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || (em->block_start == prev->block_end + 1))) { em->start = prev->start; em->block_start = prev->block_start; @@ -926,6 +917,40 @@ int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end) } EXPORT_SYMBOL(set_range_writeback); +int find_first_extent_bit(struct extent_map_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 1; + + write_lock_irq(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(&tree->state, start); + if (!node || IS_ERR(node)) { + goto out; + } + + while(1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->state & bits) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + } + node = rb_next(node); + if (!node) + break; + } +out: + write_unlock_irq(&tree->lock); + return ret; +} +EXPORT_SYMBOL(find_first_extent_bit); + u64 find_lock_delalloc_range(struct extent_map_tree *tree, u64 start, u64 lock_start, u64 *end, u64 max_bytes) { @@ -1450,7 +1475,7 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page, em = NULL; /* we've found a hole, just zero and go on */ - if (block_start == 0) { + if (block_start == EXTENT_MAP_HOLE) { zero_user_page(page, page_offset, iosize, KM_USER0); set_extent_uptodate(tree, cur, cur + iosize - 1, GFP_NOFS); @@ -1593,7 +1618,8 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, free_extent_map(em); em = NULL; - if (block_start == 0 || block_start == EXTENT_MAP_INLINE) { + if (block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); cur = cur + iosize; @@ -1630,7 +1656,6 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, nr++; } done: - WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0)); unlock_extent(tree, start, page_end, GFP_NOFS); unlock_page(page); return 0; @@ -1827,8 +1852,623 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock, // XXX(hch): block 0 is valid in some cases, e.g. XFS RT device if (em->block_start == EXTENT_MAP_INLINE || - em->block_start == 0) - return 0; + em->block_start == EXTENT_MAP_HOLE) + return 0; return (em->block_start + start - em->start) >> inode->i_blkbits; } + +struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, + u64 start, unsigned long len, + gfp_t mask) +{ + unsigned long num_pages = ((start + len - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT) + 1; + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct page *p; + struct address_space *mapping = tree->mapping; + int uptodate = 0; + + eb = kzalloc(EXTENT_BUFFER_SIZE(num_pages), mask); + if (!eb || IS_ERR(eb)) + return NULL; + + eb->start = start; + eb->len = len; + atomic_set(&eb->refs, 1); + + for (i = 0; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); + if (!p) + goto fail; + eb->pages[i] = p; + if (!PageUptodate(p)) + uptodate = 0; + unlock_page(p); + } + if (uptodate) + eb->flags |= EXTENT_UPTODATE; + return eb; +fail: + free_extent_buffer(eb); + return NULL; +} +EXPORT_SYMBOL(alloc_extent_buffer); + +struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, + u64 start, unsigned long len, + gfp_t mask) +{ + unsigned long num_pages = ((start + len - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT) + 1; + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct page *p; + struct address_space *mapping = tree->mapping; + + eb = kzalloc(EXTENT_BUFFER_SIZE(num_pages), mask); + if (!eb || IS_ERR(eb)) + return NULL; + + eb->start = start; + eb->len = len; + atomic_set(&eb->refs, 1); + + for (i = 0; i < num_pages; i++, index++) { + p = find_get_page(mapping, index); + if (!p) + goto fail; + eb->pages[i] = p; + } + return eb; +fail: + free_extent_buffer(eb); + return NULL; +} +EXPORT_SYMBOL(find_extent_buffer); + +void free_extent_buffer(struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + + if (!eb) + return; + + if (!atomic_dec_and_test(&eb->refs)) + return; + + num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT) + 1; + + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) + page_cache_release(eb->pages[i]); + } + kfree(eb); +} +EXPORT_SYMBOL(free_extent_buffer); + +int clear_extent_buffer_dirty(struct extent_map_tree *tree, + struct extent_buffer *eb) +{ + int set; + unsigned long i; + unsigned long num_pages; + struct page *page; + + u64 start = eb->start; + u64 end = start + eb->len - 1; + + set = clear_extent_dirty(tree, start, end, GFP_NOFS); + num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT) + 1; + + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + lock_page(page); + /* + * if we're on the last page or the first page and the + * block isn't aligned on a page boundary, do extra checks + * to make sure we don't clean page that is partially dirty + */ + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) { + start = page->index << PAGE_CACHE_SHIFT; + end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, + EXTENT_DIRTY, 0)) { + unlock_page(page); + continue; + } + } + clear_page_dirty_for_io(page); + unlock_page(page); + } + return 0; +} +EXPORT_SYMBOL(clear_extent_buffer_dirty); + +int wait_on_extent_buffer_writeback(struct extent_map_tree *tree, + struct extent_buffer *eb) +{ + return wait_on_extent_writeback(tree, eb->start, + eb->start + eb->len - 1); +} +EXPORT_SYMBOL(wait_on_extent_buffer_writeback); + +int set_extent_buffer_dirty(struct extent_map_tree *tree, + struct extent_buffer *eb) +{ + return set_range_dirty(tree, eb->start, eb->start + eb->len - 1); +} +EXPORT_SYMBOL(set_extent_buffer_dirty); + +int set_extent_buffer_uptodate(struct extent_map_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT) + 1; + + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) { + check_page_uptodate(tree, page); + continue; + } + SetPageUptodate(page); + } + return 0; +} +EXPORT_SYMBOL(set_extent_buffer_uptodate); + +int extent_buffer_uptodate(struct extent_map_tree *tree, + struct extent_buffer *eb) +{ + if (eb->flags & EXTENT_UPTODATE) + return 1; + return test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1); +} +EXPORT_SYMBOL(extent_buffer_uptodate); + +int read_extent_buffer_pages(struct extent_map_tree *tree, + struct extent_buffer *eb, int wait) +{ + unsigned long i; + struct page *page; + int err; + int ret = 0; + unsigned long num_pages; + + if (eb->flags & EXTENT_UPTODATE) + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1)) { + return 0; + } + + num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT) + 1; + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + if (PageUptodate(page)) { + continue; + } + if (!wait) { + if (TestSetPageLocked(page)) { + continue; + } + } else { + lock_page(page); + } + if (!PageUptodate(page)) { + err = page->mapping->a_ops->readpage(NULL, page); + if (err) { + ret = err; + } + } else { + unlock_page(page); + } + } + + if (ret || !wait) { + return ret; + } + + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + wait_on_page_locked(page); + if (!PageUptodate(page)) { + ret = -EIO; + } + } + eb->flags |= EXTENT_UPTODATE; + return ret; +} +EXPORT_SYMBOL(read_extent_buffer_pages); + +void read_extent_buffer(struct extent_buffer *eb, void *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + page = eb->pages[i]; + offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); + if (i == 0) + offset += start_offset; + + while(len > 0) { + WARN_ON(!PageUptodate(page)); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + // kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); + memcpy(dst, kaddr + offset, cur); + // kunmap_atomic(kaddr, KM_USER0); + + dst += cur; + len -= cur; + offset = 0; + i++; + page = eb->pages[i]; + } +} +EXPORT_SYMBOL(read_extent_buffer); + +int map_extent_buffer(struct extent_buffer *eb, unsigned long start, + char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + size_t offset; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + + if (i == 0) { + offset = start_offset; + *map_start = 0; + } else { + offset = 0; + *map_start = (i << PAGE_CACHE_SHIFT) - offset; + } + + // kaddr = kmap_atomic(eb->pages[i], km); + kaddr = page_address(eb->pages[i]); + *token = kaddr; + *map = kaddr + offset; + *map_len = PAGE_CACHE_SIZE - offset; + return 0; +} +EXPORT_SYMBOL(map_extent_buffer); + +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) +{ + // kunmap_atomic(token, km); +} +EXPORT_SYMBOL(unmap_extent_buffer); + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *ptr = (char *)ptrv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + page = eb->pages[i]; + offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); + if (i == 0) + offset += start_offset; + + while(len > 0) { + WARN_ON(!PageUptodate(page)); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + + // kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); + ret = memcmp(ptr, kaddr + offset, cur); + // kunmap_atomic(kaddr, KM_USER0); + if (ret) + break; + + ptr += cur; + len -= cur; + offset = 0; + i++; + page = eb->pages[i]; + } + return ret; +} +EXPORT_SYMBOL(memcmp_extent_buffer); + +void write_extent_buffer(struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *src = (char *)srcv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + page = eb->pages[i]; + offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); + if (i == 0) + offset += start_offset; + + while(len > 0) { + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + // kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); + memcpy(kaddr + offset, src, cur); + // kunmap_atomic(kaddr, KM_USER0); + + src += cur; + len -= cur; + offset = 0; + i++; + page = eb->pages[i]; + } +} +EXPORT_SYMBOL(write_extent_buffer); + +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + page = eb->pages[i]; + offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); + if (i == 0) + offset += start_offset; + + while(len > 0) { + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + // kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); + memset(kaddr + offset, c, cur); + // kunmap_atomic(kaddr, KM_USER0); + + len -= cur; + offset = 0; + i++; + page = eb->pages[i]; + } +} +EXPORT_SYMBOL(memset_extent_buffer); + +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + u64 dst_len = dst->len; + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(src->len != dst_len); + + offset = dst_offset & ((unsigned long)PAGE_CACHE_SIZE - 1); + if (i == 0) + offset += start_offset; + + while(len > 0) { + page = dst->pages[i]; + WARN_ON(!PageUptodate(page)); + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + + // kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); + read_extent_buffer(src, kaddr + offset, src_offset, cur); + // kunmap_atomic(kaddr, KM_USER1); + + src_offset += cur; + len -= cur; + offset = 0; + i++; + } +} +EXPORT_SYMBOL(copy_extent_buffer); + +static void move_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + // char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); + if (dst_page == src_page) { + memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); + } else { + // char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *src_kaddr = page_address(src_page); + char *p = dst_kaddr + dst_off + len; + char *s = src_kaddr + src_off + len; + + while (len--) + *--p = *--s; + + // kunmap_atomic(src_kaddr, KM_USER1); + } + // kunmap_atomic(dst_kaddr, KM_USER0); +} + +static void copy_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + //kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); + char *src_kaddr; + + if (dst_page != src_page) + src_kaddr = page_address(src_page); // kmap_atomic(src_page, KM_USER1); + else + src_kaddr = dst_kaddr; + + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + /* + kunmap_atomic(dst_kaddr, KM_USER0); + if (dst_page != src_page) + kunmap_atomic(src_kaddr, KM_USER1); + */ +} + +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk("memmove bogus src_offset %lu move len %lu len %lu\n", + src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk("memmove bogus dst_offset %lu move len %lu len %lu\n", + dst_offset, len, dst->len); + BUG_ON(1); + } + + while(len > 0) { + dst_off_in_page = dst_offset & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = src_offset & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + + if (src_i == 0) + src_off_in_page += start_offset; + if (dst_i == 0) + dst_off_in_page += start_offset; + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + src_off_in_page)); + cur = min(cur, (unsigned long)(PAGE_CACHE_SIZE - + dst_off_in_page)); + + copy_pages(dst->pages[dst_i], dst->pages[src_i], + dst_off_in_page, src_off_in_page, cur); + + src_offset += cur; + dst_offset += cur; + len -= cur; + } +} +EXPORT_SYMBOL(memcpy_extent_buffer); + +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + unsigned long dst_end = dst_offset + len - 1; + unsigned long src_end = src_offset + len - 1; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk("memmove bogus src_offset %lu move len %lu len %lu\n", + src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk("memmove bogus dst_offset %lu move len %lu len %lu\n", + dst_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset < src_offset) { + memcpy_extent_buffer(dst, dst_offset, src_offset, len); + return; + } + while(len > 0) { + dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + + dst_off_in_page = dst_end & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = src_end & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + if (src_i == 0) + src_off_in_page += start_offset; + if (dst_i == 0) + dst_off_in_page += start_offset; + + cur = min(len, src_off_in_page + 1); + cur = min(cur, dst_off_in_page + 1); +// printk("move pages orig dst %lu src %lu len %lu, this %lu %lu %lu\n", dst_offset, src_offset, len, dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur); + move_pages(dst->pages[dst_i], dst->pages[src_i], + dst_off_in_page - cur + 1, + src_off_in_page - cur + 1, cur); + + dst_end -= cur - 1; + src_end -= cur - 1; + len -= cur; + } +} +EXPORT_SYMBOL(memmove_extent_buffer); -- cgit v1.2.3 From 479965d66e320f1a095bb76027171daa675a9c72 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:14:27 -0400 Subject: Btrfs: Optimizations for the extent_buffer code Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 55fefdfab84..8bef309e1b3 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -2140,14 +2140,20 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, EXPORT_SYMBOL(read_extent_buffer); int map_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, char **token, char **map, unsigned long *map_start, unsigned long *map_len, int km) { - size_t offset; + size_t offset = start & (PAGE_CACHE_SIZE - 1); char *kaddr; size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + unsigned long end_i = (start_offset + start + min_len) >> + PAGE_CACHE_SHIFT; + + if (i != end_i) + return -EINVAL; WARN_ON(start > eb->len); @@ -2155,8 +2161,7 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start, offset = start_offset; *map_start = 0; } else { - offset = 0; - *map_start = (i << PAGE_CACHE_SHIFT) - offset; + *map_start = (i << PAGE_CACHE_SHIFT) - start_offset; } // kaddr = kmap_atomic(eb->pages[i], km); -- cgit v1.2.3 From 6d36dcd48f1e4e7446d603a3df9638bd314a182d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:14:37 -0400 Subject: Btrfs: Avoid memcpy where possible in extent_buffers Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 132 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 102 insertions(+), 30 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 8bef309e1b3..d2c733c68b4 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -18,6 +18,11 @@ struct kmem_cache *btrfs_cache_create(const char *name, size_t size, static struct kmem_cache *extent_map_cache; static struct kmem_cache *extent_state_cache; +static struct kmem_cache *extent_buffer_cache; +static LIST_HEAD(extent_buffers); +static spinlock_t extent_buffers_lock; +static int nr_extent_buffers; +#define MAX_EXTENT_BUFFER_CACHE 128 struct tree_entry { u64 start; @@ -29,21 +34,33 @@ struct tree_entry { void __init extent_map_init(void) { extent_map_cache = btrfs_cache_create("extent_map", - sizeof(struct extent_map), - SLAB_DESTROY_BY_RCU, + sizeof(struct extent_map), 0, NULL); extent_state_cache = btrfs_cache_create("extent_state", - sizeof(struct extent_state), - SLAB_DESTROY_BY_RCU, + sizeof(struct extent_state), 0, NULL); + extent_buffer_cache = btrfs_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + NULL); + spin_lock_init(&extent_buffers_lock); } void __exit extent_map_exit(void) { + struct extent_buffer *eb; + + while (!list_empty(&extent_buffers)) { + eb = list_entry(extent_buffers.next, + struct extent_buffer, list); + list_del(&eb->list); + kmem_cache_free(extent_buffer_cache, eb); + } if (extent_map_cache) kmem_cache_destroy(extent_map_cache); if (extent_state_cache) kmem_cache_destroy(extent_state_cache); + if (extent_buffer_cache) + kmem_cache_destroy(extent_buffer_cache); } void extent_map_tree_init(struct extent_map_tree *tree, @@ -1858,6 +1875,48 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock, return (em->block_start + start - em->start) >> inode->i_blkbits; } +static struct extent_buffer *__alloc_extent_buffer(gfp_t mask) +{ + struct extent_buffer *eb = NULL; + spin_lock(&extent_buffers_lock); + if (!list_empty(&extent_buffers)) { + eb = list_entry(extent_buffers.next, struct extent_buffer, + list); + list_del(&eb->list); + WARN_ON(nr_extent_buffers == 0); + nr_extent_buffers--; + } + spin_unlock(&extent_buffers_lock); + if (eb) { + memset(eb, 0, sizeof(*eb)); + return eb; + } + return kmem_cache_zalloc(extent_buffer_cache, mask); +} + +static void __free_extent_buffer(struct extent_buffer *eb) +{ + if (nr_extent_buffers >= MAX_EXTENT_BUFFER_CACHE) { + kmem_cache_free(extent_buffer_cache, eb); + } else { + spin_lock(&extent_buffers_lock); + list_add(&eb->list, &extent_buffers); + nr_extent_buffers++; + spin_unlock(&extent_buffers_lock); + } +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, int i) +{ + struct page *p; + if (i == 0) + return eb->first_page; + i += eb->start >> PAGE_CACHE_SHIFT; + p = find_get_page(eb->first_page->mapping, i); + page_cache_release(p); + return p; +} + struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, u64 start, unsigned long len, gfp_t mask) @@ -1871,7 +1930,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, struct address_space *mapping = tree->mapping; int uptodate = 0; - eb = kzalloc(EXTENT_BUFFER_SIZE(num_pages), mask); + eb = __alloc_extent_buffer(mask); if (!eb || IS_ERR(eb)) return NULL; @@ -1881,9 +1940,16 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); - if (!p) + if (!p) { + /* make sure the free only frees the pages we've + * grabbed a reference on + */ + eb->len = i << PAGE_CACHE_SHIFT; + eb->start &= ~((u64)PAGE_CACHE_SIZE - 1); goto fail; - eb->pages[i] = p; + } + if (i == 0) + eb->first_page = p; if (!PageUptodate(p)) uptodate = 0; unlock_page(p); @@ -1909,7 +1975,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, struct page *p; struct address_space *mapping = tree->mapping; - eb = kzalloc(EXTENT_BUFFER_SIZE(num_pages), mask); + eb = __alloc_extent_buffer(mask); if (!eb || IS_ERR(eb)) return NULL; @@ -1919,9 +1985,16 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, for (i = 0; i < num_pages; i++, index++) { p = find_get_page(mapping, index); - if (!p) + if (!p) { + /* make sure the free only frees the pages we've + * grabbed a reference on + */ + eb->len = i << PAGE_CACHE_SHIFT; + eb->start &= ~((u64)PAGE_CACHE_SIZE - 1); goto fail; - eb->pages[i] = p; + } + if (i == 0) + eb->first_page = p; } return eb; fail: @@ -1944,11 +2017,12 @@ void free_extent_buffer(struct extent_buffer *eb) num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - (eb->start >> PAGE_CACHE_SHIFT) + 1; - for (i = 0; i < num_pages; i++) { - if (eb->pages[i]) - page_cache_release(eb->pages[i]); + if (eb->first_page) + page_cache_release(eb->first_page); + for (i = 1; i < num_pages; i++) { + page_cache_release(extent_buffer_page(eb, i)); } - kfree(eb); + __free_extent_buffer(eb); } EXPORT_SYMBOL(free_extent_buffer); @@ -1968,7 +2042,7 @@ int clear_extent_buffer_dirty(struct extent_map_tree *tree, (eb->start >> PAGE_CACHE_SHIFT) + 1; for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; + page = extent_buffer_page(eb, i); lock_page(page); /* * if we're on the last page or the first page and the @@ -2021,7 +2095,7 @@ int set_extent_buffer_uptodate(struct extent_map_tree *tree, set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; + page = extent_buffer_page(eb, i); if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || ((i == num_pages - 1) && ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) { @@ -2064,7 +2138,7 @@ int read_extent_buffer_pages(struct extent_map_tree *tree, num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - (eb->start >> PAGE_CACHE_SHIFT) + 1; for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; + page = extent_buffer_page(eb, i); if (PageUptodate(page)) { continue; } @@ -2090,7 +2164,7 @@ int read_extent_buffer_pages(struct extent_map_tree *tree, } for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; + page = extent_buffer_page(eb, i); wait_on_page_locked(page); if (!PageUptodate(page)) { ret = -EIO; @@ -2116,12 +2190,12 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - page = eb->pages[i]; offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); if (i == 0) offset += start_offset; while(len > 0) { + page = extent_buffer_page(eb, i); WARN_ON(!PageUptodate(page)); cur = min(len, (PAGE_CACHE_SIZE - offset)); @@ -2134,7 +2208,6 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, len -= cur; offset = 0; i++; - page = eb->pages[i]; } } EXPORT_SYMBOL(read_extent_buffer); @@ -2165,7 +2238,7 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start, } // kaddr = kmap_atomic(eb->pages[i], km); - kaddr = page_address(eb->pages[i]); + kaddr = page_address(extent_buffer_page(eb, i)); *token = kaddr; *map = kaddr + offset; *map_len = PAGE_CACHE_SIZE - offset; @@ -2195,12 +2268,12 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - page = eb->pages[i]; offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); if (i == 0) offset += start_offset; while(len > 0) { + page = extent_buffer_page(eb, i); WARN_ON(!PageUptodate(page)); cur = min(len, (PAGE_CACHE_SIZE - offset)); @@ -2216,7 +2289,6 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, len -= cur; offset = 0; i++; - page = eb->pages[i]; } return ret; } @@ -2236,12 +2308,12 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - page = eb->pages[i]; offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); if (i == 0) offset += start_offset; while(len > 0) { + page = extent_buffer_page(eb, i); WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); @@ -2254,7 +2326,6 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, len -= cur; offset = 0; i++; - page = eb->pages[i]; } } EXPORT_SYMBOL(write_extent_buffer); @@ -2272,12 +2343,12 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - page = eb->pages[i]; offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); if (i == 0) offset += start_offset; while(len > 0) { + page = extent_buffer_page(eb, i); WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); @@ -2289,7 +2360,6 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, len -= cur; offset = 0; i++; - page = eb->pages[i]; } } EXPORT_SYMBOL(memset_extent_buffer); @@ -2313,7 +2383,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, offset += start_offset; while(len > 0) { - page = dst->pages[i]; + page = extent_buffer_page(dst, i); WARN_ON(!PageUptodate(page)); cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); @@ -2414,7 +2484,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, cur = min(cur, (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); - copy_pages(dst->pages[dst_i], dst->pages[src_i], + copy_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), dst_off_in_page, src_off_in_page, cur); src_offset += cur; @@ -2467,7 +2538,8 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, cur = min(len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); // printk("move pages orig dst %lu src %lu len %lu, this %lu %lu %lu\n", dst_offset, src_offset, len, dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur); - move_pages(dst->pages[dst_i], dst->pages[src_i], + move_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur); -- cgit v1.2.3 From ae5252bd51a252b7b8b02289337c36774835101c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:14:41 -0400 Subject: Btrfs: Go back to kmaps instead of page_address in extent_buffers Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 50 +++++++++++++++++++------------------------------- 1 file changed, 19 insertions(+), 31 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index d2c733c68b4..f150188f621 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1867,7 +1867,6 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock, if (!em || IS_ERR(em)) return 0; - // XXX(hch): block 0 is valid in some cases, e.g. XFS RT device if (em->block_start == EXTENT_MAP_INLINE || em->block_start == EXTENT_MAP_HOLE) return 0; @@ -2199,10 +2198,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, WARN_ON(!PageUptodate(page)); cur = min(len, (PAGE_CACHE_SIZE - offset)); - // kaddr = kmap_atomic(page, KM_USER0); - kaddr = page_address(page); + kaddr = kmap_atomic(page, KM_USER0); memcpy(dst, kaddr + offset, cur); - // kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr, KM_USER0); dst += cur; len -= cur; @@ -2237,8 +2235,7 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start, *map_start = (i << PAGE_CACHE_SHIFT) - start_offset; } - // kaddr = kmap_atomic(eb->pages[i], km); - kaddr = page_address(extent_buffer_page(eb, i)); + kaddr = kmap_atomic(extent_buffer_page(eb, i), km); *token = kaddr; *map = kaddr + offset; *map_len = PAGE_CACHE_SIZE - offset; @@ -2248,7 +2245,7 @@ EXPORT_SYMBOL(map_extent_buffer); void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) { - // kunmap_atomic(token, km); + kunmap_atomic(token, km); } EXPORT_SYMBOL(unmap_extent_buffer); @@ -2278,10 +2275,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, cur = min(len, (PAGE_CACHE_SIZE - offset)); - // kaddr = kmap_atomic(page, KM_USER0); - kaddr = page_address(page); + kaddr = kmap_atomic(page, KM_USER0); ret = memcmp(ptr, kaddr + offset, cur); - // kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr, KM_USER0); if (ret) break; @@ -2317,10 +2313,9 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - // kaddr = kmap_atomic(page, KM_USER0); - kaddr = page_address(page); + kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr + offset, src, cur); - // kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr, KM_USER0); src += cur; len -= cur; @@ -2352,10 +2347,9 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - // kaddr = kmap_atomic(page, KM_USER0); - kaddr = page_address(page); + kaddr = kmap_atomic(page, KM_USER0); memset(kaddr + offset, c, cur); - // kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr, KM_USER0); len -= cur; offset = 0; @@ -2388,10 +2382,9 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); - // kaddr = kmap_atomic(page, KM_USER1); - kaddr = page_address(page); + kaddr = kmap_atomic(page, KM_USER1); read_extent_buffer(src, kaddr + offset, src_offset, cur); - // kunmap_atomic(kaddr, KM_USER1); + kunmap_atomic(kaddr, KM_USER1); src_offset += cur; len -= cur; @@ -2405,43 +2398,38 @@ static void move_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - // char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); - char *dst_kaddr = page_address(dst_page); + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); if (dst_page == src_page) { memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); } else { - // char *src_kaddr = kmap_atomic(src_page, KM_USER1); - char *src_kaddr = page_address(src_page); + char *src_kaddr = kmap_atomic(src_page, KM_USER1); char *p = dst_kaddr + dst_off + len; char *s = src_kaddr + src_off + len; while (len--) *--p = *--s; - // kunmap_atomic(src_kaddr, KM_USER1); + kunmap_atomic(src_kaddr, KM_USER1); } - // kunmap_atomic(dst_kaddr, KM_USER0); + kunmap_atomic(dst_kaddr, KM_USER0); } static void copy_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - //kmap_atomic(dst_page, KM_USER0); - char *dst_kaddr = page_address(dst_page); + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); char *src_kaddr; if (dst_page != src_page) - src_kaddr = page_address(src_page); // kmap_atomic(src_page, KM_USER1); + src_kaddr = kmap_atomic(src_page, KM_USER1); else src_kaddr = dst_kaddr; memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); - /* kunmap_atomic(dst_kaddr, KM_USER0); if (dst_page != src_page) kunmap_atomic(src_kaddr, KM_USER1); - */ } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, @@ -2537,7 +2525,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, cur = min(len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); -// printk("move pages orig dst %lu src %lu len %lu, this %lu %lu %lu\n", dst_offset, src_offset, len, dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur); + move_pages(extent_buffer_page(dst, dst_i), extent_buffer_page(dst, src_i), dst_off_in_page - cur + 1, -- cgit v1.2.3 From f510cfecfc98759d75283823cfccf0cc0d59a4c6 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:14:48 -0400 Subject: Btrfs: Fix extent_buffer and extent_state leaks Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 64 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f150188f621..5b7dbcaacd1 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -19,8 +19,13 @@ struct kmem_cache *btrfs_cache_create(const char *name, size_t size, static struct kmem_cache *extent_map_cache; static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; + static LIST_HEAD(extent_buffers); +static LIST_HEAD(buffers); +static LIST_HEAD(states); + static spinlock_t extent_buffers_lock; +static spinlock_t state_lock = SPIN_LOCK_UNLOCKED; static int nr_extent_buffers; #define MAX_EXTENT_BUFFER_CACHE 128 @@ -48,6 +53,7 @@ void __init extent_map_init(void) void __exit extent_map_exit(void) { struct extent_buffer *eb; + struct extent_state *state; while (!list_empty(&extent_buffers)) { eb = list_entry(extent_buffers.next, @@ -55,6 +61,22 @@ void __exit extent_map_exit(void) list_del(&eb->list); kmem_cache_free(extent_buffer_cache, eb); } + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, list); + printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs)); + list_del(&state->list); + kmem_cache_free(extent_state_cache, state); + + } + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, + struct extent_buffer, leak_list); + printk("buffer leak start %Lu len %lu return %lX\n", eb->start, eb->len, eb->alloc_addr); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } + + if (extent_map_cache) kmem_cache_destroy(extent_map_cache); if (extent_state_cache) @@ -101,12 +123,19 @@ EXPORT_SYMBOL(free_extent_map); struct extent_state *alloc_extent_state(gfp_t mask) { struct extent_state *state; + unsigned long flags; + state = kmem_cache_alloc(extent_state_cache, mask); if (!state || IS_ERR(state)) return state; state->state = 0; state->in_tree = 0; state->private = 0; + + spin_lock_irqsave(&state_lock, flags); + list_add(&state->list, &states); + spin_unlock_irqrestore(&state_lock, flags); + atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); return state; @@ -115,10 +144,14 @@ EXPORT_SYMBOL(alloc_extent_state); void free_extent_state(struct extent_state *state) { + unsigned long flags; if (!state) return; if (atomic_dec_and_test(&state->refs)) { WARN_ON(state->in_tree); + spin_lock_irqsave(&state_lock, flags); + list_del(&state->list); + spin_unlock_irqrestore(&state_lock, flags); kmem_cache_free(extent_state_cache, state); } } @@ -361,10 +394,6 @@ static int insert_state(struct extent_map_tree *tree, state->state |= bits; state->start = start; state->end = end; - if ((end & 4095) == 0) { - printk("insert state %Lu %Lu strange end\n", start, end); - WARN_ON(1); - } node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; @@ -399,11 +428,7 @@ static int split_state(struct extent_map_tree *tree, struct extent_state *orig, prealloc->end = split - 1; prealloc->state = orig->state; orig->start = split; - if ((prealloc->end & 4095) == 0) { - printk("insert state %Lu %Lu strange end\n", prealloc->start, - prealloc->end); - WARN_ON(1); - } + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); if (node) { struct extent_state *found; @@ -957,6 +982,7 @@ int find_first_extent_bit(struct extent_map_tree *tree, u64 start, *start_ret = state->start; *end_ret = state->end; ret = 0; + break; } node = rb_next(node); if (!node) @@ -1877,6 +1903,7 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock, static struct extent_buffer *__alloc_extent_buffer(gfp_t mask) { struct extent_buffer *eb = NULL; + spin_lock(&extent_buffers_lock); if (!list_empty(&extent_buffers)) { eb = list_entry(extent_buffers.next, struct extent_buffer, @@ -1886,15 +1913,26 @@ static struct extent_buffer *__alloc_extent_buffer(gfp_t mask) nr_extent_buffers--; } spin_unlock(&extent_buffers_lock); + if (eb) { memset(eb, 0, sizeof(*eb)); - return eb; + } else { + eb = kmem_cache_zalloc(extent_buffer_cache, mask); } - return kmem_cache_zalloc(extent_buffer_cache, mask); + spin_lock(&extent_buffers_lock); + list_add(&eb->leak_list, &buffers); + spin_unlock(&extent_buffers_lock); + + return eb; } static void __free_extent_buffer(struct extent_buffer *eb) { + + spin_lock(&extent_buffers_lock); + list_del_init(&eb->leak_list); + spin_unlock(&extent_buffers_lock); + if (nr_extent_buffers >= MAX_EXTENT_BUFFER_CACHE) { kmem_cache_free(extent_buffer_cache, eb); } else { @@ -1933,6 +1971,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, if (!eb || IS_ERR(eb)) return NULL; + eb->alloc_addr = __builtin_return_address(0); eb->start = start; eb->len = len; atomic_set(&eb->refs, 1); @@ -1947,6 +1986,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, eb->start &= ~((u64)PAGE_CACHE_SIZE - 1); goto fail; } + set_page_extent_mapped(p); if (i == 0) eb->first_page = p; if (!PageUptodate(p)) @@ -1978,6 +2018,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, if (!eb || IS_ERR(eb)) return NULL; + eb->alloc_addr = __builtin_return_address(0); eb->start = start; eb->len = len; atomic_set(&eb->refs, 1); @@ -1992,6 +2033,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, eb->start &= ~((u64)PAGE_CACHE_SIZE - 1); goto fail; } + set_page_extent_mapped(p); if (i == 0) eb->first_page = p; } -- cgit v1.2.3 From 96b5179d0d9b6368c203856f2ad6e8e12a8b2a2c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:15:19 -0400 Subject: Btrfs: Stop using radix trees for the block group cache Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 5b7dbcaacd1..1b2f9e059de 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -574,7 +574,7 @@ out: return set; search_again: - if (start >= end) + if (start > end) goto out; write_unlock_irqrestore(&tree->lock, flags); if (mask & __GFP_WAIT) @@ -819,6 +819,21 @@ int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, } EXPORT_SYMBOL(set_extent_dirty); +int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, 0, NULL, + mask); +} +EXPORT_SYMBOL(set_extent_bits); + +int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, bits, 0, 0, mask); +} +EXPORT_SYMBOL(clear_extent_bits); + int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask) { @@ -1138,7 +1153,6 @@ int set_state_private(struct extent_map_tree *tree, u64 start, u64 private) out: write_unlock_irq(&tree->lock); return ret; - } int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private) -- cgit v1.2.3 From 1a5bc167f6707542b79a55452075525620ed43f5 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:15:26 -0400 Subject: Btrfs: Change the remaining radix trees used by extent-tree.c to extent_map trees Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 1b2f9e059de..e081558d52f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1188,8 +1188,8 @@ out: * has the bits set. Otherwise, 1 is returned if any bit in the * range is found set. */ -static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end, - int bits, int filled) +int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end, + int bits, int filled) { struct extent_state *state = NULL; struct rb_node *node; @@ -1222,6 +1222,7 @@ static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end, read_unlock_irq(&tree->lock); return bitset; } +EXPORT_SYMBOL(test_range_bit); /* * helper function to set a given page up to date if all the -- cgit v1.2.3 From db94535db75e67fab12ccbb7f5ee548e33fed891 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:15:53 -0400 Subject: Btrfs: Allow tree blocks larger than the page size Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 91 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 25 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index e081558d52f..f658703c42e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1963,18 +1963,27 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb, int i) struct page *p; if (i == 0) return eb->first_page; + i += eb->start >> PAGE_CACHE_SHIFT; + if (eb->last_page && eb->last_page->index == i) + return eb->last_page; + p = find_get_page(eb->first_page->mapping, i); page_cache_release(p); + eb->last_page = p; return p; } +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, u64 start, unsigned long len, gfp_t mask) { - unsigned long num_pages = ((start + len - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT) + 1; + unsigned long num_pages = num_extent_pages(start, len); unsigned long i; unsigned long index = start >> PAGE_CACHE_SHIFT; struct extent_buffer *eb; @@ -1986,7 +1995,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, if (!eb || IS_ERR(eb)) return NULL; - eb->alloc_addr = __builtin_return_address(0); + eb->alloc_addr = (unsigned long)__builtin_return_address(0); eb->start = start; eb->len = len; atomic_set(&eb->refs, 1); @@ -1994,6 +2003,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); if (!p) { + WARN_ON(1); /* make sure the free only frees the pages we've * grabbed a reference on */ @@ -2021,8 +2031,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, u64 start, unsigned long len, gfp_t mask) { - unsigned long num_pages = ((start + len - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT) + 1; + unsigned long num_pages = num_extent_pages(start, len); unsigned long i; unsigned long index = start >> PAGE_CACHE_SHIFT; struct extent_buffer *eb; @@ -2033,7 +2042,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, if (!eb || IS_ERR(eb)) return NULL; - eb->alloc_addr = __builtin_return_address(0); + eb->alloc_addr = (unsigned long)__builtin_return_address(0); eb->start = start; eb->len = len; atomic_set(&eb->refs, 1); @@ -2070,8 +2079,7 @@ void free_extent_buffer(struct extent_buffer *eb) if (!atomic_dec_and_test(&eb->refs)) return; - num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - - (eb->start >> PAGE_CACHE_SHIFT) + 1; + num_pages = num_extent_pages(eb->start, eb->len); if (eb->first_page) page_cache_release(eb->first_page); @@ -2094,8 +2102,7 @@ int clear_extent_buffer_dirty(struct extent_map_tree *tree, u64 end = start + eb->len - 1; set = clear_extent_dirty(tree, start, end, GFP_NOFS); - num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - - (eb->start >> PAGE_CACHE_SHIFT) + 1; + num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); @@ -2145,8 +2152,7 @@ int set_extent_buffer_uptodate(struct extent_map_tree *tree, struct page *page; unsigned long num_pages; - num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - - (eb->start >> PAGE_CACHE_SHIFT) + 1; + num_pages = num_extent_pages(eb->start, eb->len); set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); @@ -2191,8 +2197,7 @@ int read_extent_buffer_pages(struct extent_map_tree *tree, return 0; } - num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - - (eb->start >> PAGE_CACHE_SHIFT) + 1; + num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (PageUptodate(page)) { @@ -2267,14 +2272,14 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } EXPORT_SYMBOL(read_extent_buffer); -int map_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, - char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km) +static int __map_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) { size_t offset = start & (PAGE_CACHE_SIZE - 1); char *kaddr; + struct page *p; size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; unsigned long end_i = (start_offset + start + min_len) >> @@ -2283,21 +2288,59 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start, if (i != end_i) return -EINVAL; - WARN_ON(start > eb->len); + if (start >= eb->len) { + printk("bad start in map eb start %Lu len %lu caller start %lu min %lu\n", eb->start, eb->len, start, min_len); + WARN_ON(1); + } if (i == 0) { offset = start_offset; *map_start = 0; } else { + offset = 0; *map_start = (i << PAGE_CACHE_SHIFT) - start_offset; } - kaddr = kmap_atomic(extent_buffer_page(eb, i), km); + p = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(p)); + kaddr = kmap_atomic(p, km); *token = kaddr; *map = kaddr + offset; *map_len = PAGE_CACHE_SIZE - offset; return 0; } + +int map_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, + char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + int err; + int save = 0; + if (eb->map_token) { + if (start >= eb->map_start && + start + min_len <= eb->map_start + eb->map_len) { + *token = eb->map_token; + *map = eb->kaddr; + *map_start = eb->map_start; + *map_len = eb->map_len; + return 0; + } + unmap_extent_buffer(eb, eb->map_token, km); + eb->map_token = NULL; + save = 1; + } + err = __map_extent_buffer(eb, start, min_len, token, map, + map_start, map_len, km); + if (!err && save) { + eb->map_token = *token; + eb->kaddr = *map; + eb->map_start = *map_start; + eb->map_len = *map_len; + } + return err; +} EXPORT_SYMBOL(map_extent_buffer); void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) @@ -2574,7 +2617,6 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, ((unsigned long)PAGE_CACHE_SIZE - 1); src_off_in_page = src_end & ((unsigned long)PAGE_CACHE_SIZE - 1); - if (src_i == 0) src_off_in_page += start_offset; if (dst_i == 0) @@ -2582,14 +2624,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, cur = min(len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); - move_pages(extent_buffer_page(dst, dst_i), extent_buffer_page(dst, src_i), dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur); - dst_end -= cur - 1; - src_end -= cur - 1; + dst_end -= cur; + src_end -= cur; len -= cur; } } -- cgit v1.2.3 From 14048ed0c415b8729b194e92c16d31c61628d216 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:16:28 -0400 Subject: Btrfs: Cache extent buffer mappings Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f658703c42e..ea6ee68ef53 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -2037,6 +2037,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, struct extent_buffer *eb; struct page *p; struct address_space *mapping = tree->mapping; + int uptodate = 1; eb = __alloc_extent_buffer(mask); if (!eb || IS_ERR(eb)) @@ -2048,7 +2049,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, atomic_set(&eb->refs, 1); for (i = 0; i < num_pages; i++, index++) { - p = find_get_page(mapping, index); + p = find_lock_page(mapping, index); if (!p) { /* make sure the free only frees the pages we've * grabbed a reference on @@ -2060,7 +2061,12 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, set_page_extent_mapped(p); if (i == 0) eb->first_page = p; + if (!PageUptodate(p)) + uptodate = 0; + unlock_page(p); } + if (uptodate) + eb->flags |= EXTENT_UPTODATE; return eb; fail: free_extent_buffer(eb); @@ -2192,7 +2198,7 @@ int read_extent_buffer_pages(struct extent_map_tree *tree, if (eb->flags & EXTENT_UPTODATE) return 0; - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1, EXTENT_UPTODATE, 1)) { return 0; } @@ -2247,6 +2253,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, char *dst = (char *)dstv; size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + unsigned long num_pages = num_extent_pages(eb->start, eb->len); WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); @@ -2257,6 +2264,10 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, while(len > 0) { page = extent_buffer_page(eb, i); + if (!PageUptodate(page)) { + printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len); + WARN_ON(1); + } WARN_ON(!PageUptodate(page)); cur = min(len, (PAGE_CACHE_SIZE - offset)); -- cgit v1.2.3 From 09e71a326341f40111400c88aaf0498ef622824b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:17:04 -0400 Subject: Btrfs: Use an array of pages in the extent buffers to reduce the cost of find_get_page Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index ea6ee68ef53..7ef3397a266 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1961,16 +1961,12 @@ static void __free_extent_buffer(struct extent_buffer *eb) static inline struct page *extent_buffer_page(struct extent_buffer *eb, int i) { struct page *p; - if (i == 0) - return eb->first_page; + if (i < EXTENT_INLINE_PAGES) + return eb->pages[i]; i += eb->start >> PAGE_CACHE_SHIFT; - if (eb->last_page && eb->last_page->index == i) - return eb->last_page; - - p = find_get_page(eb->first_page->mapping, i); + p = find_get_page(eb->pages[0]->mapping, i); page_cache_release(p); - eb->last_page = p; return p; } @@ -2012,8 +2008,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, goto fail; } set_page_extent_mapped(p); - if (i == 0) - eb->first_page = p; + if (i < EXTENT_INLINE_PAGES) + eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; unlock_page(p); @@ -2059,8 +2055,8 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, goto fail; } set_page_extent_mapped(p); - if (i == 0) - eb->first_page = p; + if (i < EXTENT_INLINE_PAGES) + eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; unlock_page(p); @@ -2087,9 +2083,7 @@ void free_extent_buffer(struct extent_buffer *eb) num_pages = num_extent_pages(eb->start, eb->len); - if (eb->first_page) - page_cache_release(eb->first_page); - for (i = 1; i < num_pages; i++) { + for (i = 0; i < num_pages; i++) { page_cache_release(extent_buffer_page(eb, i)); } __free_extent_buffer(eb); -- cgit v1.2.3 From e19caa5f0e34b571ed0c2617554af5c43cb124d1 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:17:44 -0400 Subject: Btrfs: Fix allocation routines to avoid intermixing data and metadata allocations Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7ef3397a266..e241699024d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -981,7 +981,7 @@ int find_first_extent_bit(struct extent_map_tree *tree, u64 start, struct extent_state *state; int ret = 1; - write_lock_irq(&tree->lock); + read_lock_irq(&tree->lock); /* * this search will find all the extents that end after * our range starts. @@ -993,7 +993,7 @@ int find_first_extent_bit(struct extent_map_tree *tree, u64 start, while(1) { state = rb_entry(node, struct extent_state, rb_node); - if (state->state & bits) { + if (state->end >= start && (state->state & bits)) { *start_ret = state->start; *end_ret = state->end; ret = 0; @@ -1004,7 +1004,7 @@ int find_first_extent_bit(struct extent_map_tree *tree, u64 start, break; } out: - write_unlock_irq(&tree->lock); + read_unlock_irq(&tree->lock); return ret; } EXPORT_SYMBOL(find_first_extent_bit); -- cgit v1.2.3 From 4dc119046d0d8501afa4346472917fb05586ad9c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:18:14 -0400 Subject: Btrfs: Add an extent buffer LRU to reduce radix tree hits Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 183 +++++++++++++++++++++++++++++--------------------- 1 file changed, 106 insertions(+), 77 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index e241699024d..85b28a6a4e0 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "extent_map.h" /* temporary define until extent_map moves out of btrfs */ @@ -20,14 +21,11 @@ static struct kmem_cache *extent_map_cache; static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; -static LIST_HEAD(extent_buffers); static LIST_HEAD(buffers); static LIST_HEAD(states); -static spinlock_t extent_buffers_lock; static spinlock_t state_lock = SPIN_LOCK_UNLOCKED; -static int nr_extent_buffers; -#define MAX_EXTENT_BUFFER_CACHE 128 +#define BUFFER_LRU_MAX 64 struct tree_entry { u64 start; @@ -47,20 +45,12 @@ void __init extent_map_init(void) extent_buffer_cache = btrfs_cache_create("extent_buffers", sizeof(struct extent_buffer), 0, NULL); - spin_lock_init(&extent_buffers_lock); } void __exit extent_map_exit(void) { - struct extent_buffer *eb; struct extent_state *state; - while (!list_empty(&extent_buffers)) { - eb = list_entry(extent_buffers.next, - struct extent_buffer, list); - list_del(&eb->list); - kmem_cache_free(extent_buffer_cache, eb); - } while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, list); printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs)); @@ -68,14 +58,6 @@ void __exit extent_map_exit(void) kmem_cache_free(extent_state_cache, state); } - while (!list_empty(&buffers)) { - eb = list_entry(buffers.next, - struct extent_buffer, leak_list); - printk("buffer leak start %Lu len %lu return %lX\n", eb->start, eb->len, eb->alloc_addr); - list_del(&eb->leak_list); - kmem_cache_free(extent_buffer_cache, eb); - } - if (extent_map_cache) kmem_cache_destroy(extent_map_cache); @@ -92,10 +74,25 @@ void extent_map_tree_init(struct extent_map_tree *tree, tree->state.rb_node = NULL; tree->ops = NULL; rwlock_init(&tree->lock); + spin_lock_init(&tree->lru_lock); tree->mapping = mapping; + INIT_LIST_HEAD(&tree->buffer_lru); + tree->lru_size = 0; } EXPORT_SYMBOL(extent_map_tree_init); +void extent_map_tree_cleanup(struct extent_map_tree *tree) +{ + struct extent_buffer *eb; + while(!list_empty(&tree->buffer_lru)) { + eb = list_entry(tree->buffer_lru.next, struct extent_buffer, + lru); + list_del(&eb->lru); + free_extent_buffer(eb); + } +} +EXPORT_SYMBOL(extent_map_tree_cleanup); + struct extent_map *alloc_extent_map(gfp_t mask) { struct extent_map *em; @@ -1915,66 +1912,99 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock, return (em->block_start + start - em->start) >> inode->i_blkbits; } -static struct extent_buffer *__alloc_extent_buffer(gfp_t mask) +static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb) { - struct extent_buffer *eb = NULL; - - spin_lock(&extent_buffers_lock); - if (!list_empty(&extent_buffers)) { - eb = list_entry(extent_buffers.next, struct extent_buffer, - list); - list_del(&eb->list); - WARN_ON(nr_extent_buffers == 0); - nr_extent_buffers--; - } - spin_unlock(&extent_buffers_lock); + if (list_empty(&eb->lru)) { + extent_buffer_get(eb); + list_add(&eb->lru, &tree->buffer_lru); + tree->lru_size++; + if (tree->lru_size >= BUFFER_LRU_MAX) { + struct extent_buffer *rm; + rm = list_entry(tree->buffer_lru.prev, + struct extent_buffer, lru); + tree->lru_size--; + list_del(&rm->lru); + free_extent_buffer(rm); + } + } else + list_move(&eb->lru, &tree->buffer_lru); + return 0; +} +static struct extent_buffer *find_lru(struct extent_map_tree *tree, + u64 start, unsigned long len) +{ + struct list_head *lru = &tree->buffer_lru; + struct list_head *cur = lru->next; + struct extent_buffer *eb; - if (eb) { - memset(eb, 0, sizeof(*eb)); - } else { - eb = kmem_cache_zalloc(extent_buffer_cache, mask); - } - spin_lock(&extent_buffers_lock); - list_add(&eb->leak_list, &buffers); - spin_unlock(&extent_buffers_lock); + if (list_empty(lru)) + return NULL; - return eb; + do { + eb = list_entry(cur, struct extent_buffer, lru); + if (eb->start == start && eb->len == len) { + extent_buffer_get(eb); + return eb; + } + cur = cur->next; + } while (cur != lru); + return NULL; } -static void __free_extent_buffer(struct extent_buffer *eb) +static inline unsigned long num_extent_pages(u64 start, u64 len) { - - spin_lock(&extent_buffers_lock); - list_del_init(&eb->leak_list); - spin_unlock(&extent_buffers_lock); - - if (nr_extent_buffers >= MAX_EXTENT_BUFFER_CACHE) { - kmem_cache_free(extent_buffer_cache, eb); - } else { - spin_lock(&extent_buffers_lock); - list_add(&eb->list, &extent_buffers); - nr_extent_buffers++; - spin_unlock(&extent_buffers_lock); - } + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); } -static inline struct page *extent_buffer_page(struct extent_buffer *eb, int i) +static inline struct page *extent_buffer_page(struct extent_buffer *eb, + unsigned long i) { struct page *p; - if (i < EXTENT_INLINE_PAGES) - return eb->pages[i]; + if (i == 0) + return eb->last_page; i += eb->start >> PAGE_CACHE_SHIFT; - p = find_get_page(eb->pages[0]->mapping, i); + p = find_get_page(eb->last_page->mapping, i); page_cache_release(p); return p; } -static inline unsigned long num_extent_pages(u64 start, u64 len) +static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree, + u64 start, + unsigned long len, + gfp_t mask) { - return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT); + struct extent_buffer *eb = NULL; + + spin_lock(&tree->lru_lock); + eb = find_lru(tree, start, len); + if (eb) + goto lru_add; + spin_unlock(&tree->lru_lock); + + if (eb) { + memset(eb, 0, sizeof(*eb)); + } else { + eb = kmem_cache_zalloc(extent_buffer_cache, mask); + } + INIT_LIST_HEAD(&eb->lru); + eb->start = start; + eb->len = len; + atomic_set(&eb->refs, 1); + + spin_lock(&tree->lru_lock); +lru_add: + add_lru(tree, eb); + spin_unlock(&tree->lru_lock); + return eb; +} + +static void __free_extent_buffer(struct extent_buffer *eb) +{ + kmem_cache_free(extent_buffer_cache, eb); } + struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, u64 start, unsigned long len, gfp_t mask) @@ -1987,14 +2017,12 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, struct address_space *mapping = tree->mapping; int uptodate = 0; - eb = __alloc_extent_buffer(mask); + eb = __alloc_extent_buffer(tree, start, len, mask); if (!eb || IS_ERR(eb)) return NULL; - eb->alloc_addr = (unsigned long)__builtin_return_address(0); - eb->start = start; - eb->len = len; - atomic_set(&eb->refs, 1); + if (eb->flags & EXTENT_BUFFER_FILLED) + return eb; for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); @@ -2008,14 +2036,15 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, goto fail; } set_page_extent_mapped(p); - if (i < EXTENT_INLINE_PAGES) - eb->pages[i] = p; + if (i == 0) + eb->last_page = p; if (!PageUptodate(p)) uptodate = 0; unlock_page(p); } if (uptodate) eb->flags |= EXTENT_UPTODATE; + eb->flags |= EXTENT_BUFFER_FILLED; return eb; fail: free_extent_buffer(eb); @@ -2035,14 +2064,12 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, struct address_space *mapping = tree->mapping; int uptodate = 1; - eb = __alloc_extent_buffer(mask); + eb = __alloc_extent_buffer(tree, start, len, mask); if (!eb || IS_ERR(eb)) return NULL; - eb->alloc_addr = (unsigned long)__builtin_return_address(0); - eb->start = start; - eb->len = len; - atomic_set(&eb->refs, 1); + if (eb->flags & EXTENT_BUFFER_FILLED) + return eb; for (i = 0; i < num_pages; i++, index++) { p = find_lock_page(mapping, index); @@ -2055,14 +2082,15 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, goto fail; } set_page_extent_mapped(p); - if (i < EXTENT_INLINE_PAGES) - eb->pages[i] = p; + if (i == 0) + eb->last_page = p; if (!PageUptodate(p)) uptodate = 0; unlock_page(p); } if (uptodate) eb->flags |= EXTENT_UPTODATE; + eb->flags |= EXTENT_BUFFER_FILLED; return eb; fail: free_extent_buffer(eb); @@ -2231,7 +2259,8 @@ int read_extent_buffer_pages(struct extent_map_tree *tree, ret = -EIO; } } - eb->flags |= EXTENT_UPTODATE; + if (!ret) + eb->flags |= EXTENT_UPTODATE; return ret; } EXPORT_SYMBOL(read_extent_buffer_pages); -- cgit v1.2.3 From 810191ff3087e8143b41a944fcf4fd8c693f00e3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:18:55 -0400 Subject: Btrfs: extent_map optimizations to cut down on CPU usage Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 85b28a6a4e0..f8aaba8a30a 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1963,9 +1963,9 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb, struct page *p; if (i == 0) - return eb->last_page; + return eb->first_page; i += eb->start >> PAGE_CACHE_SHIFT; - p = find_get_page(eb->last_page->mapping, i); + p = find_get_page(eb->first_page->mapping, i); page_cache_release(p); return p; } @@ -2037,7 +2037,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, } set_page_extent_mapped(p); if (i == 0) - eb->last_page = p; + eb->first_page = p; if (!PageUptodate(p)) uptodate = 0; unlock_page(p); @@ -2083,7 +2083,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, } set_page_extent_mapped(p); if (i == 0) - eb->last_page = p; + eb->first_page = p; if (!PageUptodate(p)) uptodate = 0; unlock_page(p); @@ -2169,7 +2169,15 @@ EXPORT_SYMBOL(wait_on_extent_buffer_writeback); int set_extent_buffer_dirty(struct extent_map_tree *tree, struct extent_buffer *eb) { - return set_range_dirty(tree, eb->start, eb->start + eb->len - 1); + unsigned long i; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + } + return set_extent_dirty(tree, eb->start, + eb->start + eb->len - 1, GFP_NOFS); } EXPORT_SYMBOL(set_extent_buffer_dirty); @@ -2317,16 +2325,11 @@ static int __map_extent_buffer(struct extent_buffer *eb, unsigned long start, size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; unsigned long end_i = (start_offset + start + min_len) >> - PAGE_CACHE_SHIFT; + PAGE_CACHE_SHIFT; if (i != end_i) return -EINVAL; - if (start >= eb->len) { - printk("bad start in map eb start %Lu len %lu caller start %lu min %lu\n", eb->start, eb->len, start, min_len); - WARN_ON(1); - } - if (i == 0) { offset = start_offset; *map_start = 0; @@ -2353,14 +2356,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start, int err; int save = 0; if (eb->map_token) { - if (start >= eb->map_start && - start + min_len <= eb->map_start + eb->map_len) { - *token = eb->map_token; - *map = eb->kaddr; - *map_start = eb->map_start; - *map_len = eb->map_len; - return 0; - } unmap_extent_buffer(eb, eb->map_token, km); eb->map_token = NULL; save = 1; -- cgit v1.2.3 From 19c00ddcc31ad4bdfb86b57085e06d6135b9b1d7 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:19:22 -0400 Subject: Btrfs: Add back metadata checksumming Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 83 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f8aaba8a30a..2a8bc4bd43a 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -81,7 +81,7 @@ void extent_map_tree_init(struct extent_map_tree *tree, } EXPORT_SYMBOL(extent_map_tree_init); -void extent_map_tree_cleanup(struct extent_map_tree *tree) +void extent_map_tree_empty_lru(struct extent_map_tree *tree) { struct extent_buffer *eb; while(!list_empty(&tree->buffer_lru)) { @@ -91,7 +91,7 @@ void extent_map_tree_cleanup(struct extent_map_tree *tree) free_extent_buffer(eb); } } -EXPORT_SYMBOL(extent_map_tree_cleanup); +EXPORT_SYMBOL(extent_map_tree_empty_lru); struct extent_map *alloc_extent_map(gfp_t mask) { @@ -1464,7 +1464,7 @@ void set_page_extent_mapped(struct page *page) if (!PagePrivate(page)) { SetPagePrivate(page); WARN_ON(!page->mapping->a_ops->invalidatepage); - set_page_private(page, 1); + set_page_private(page, EXTENT_PAGE_PRIVATE); page_cache_get(page); } } @@ -1979,8 +1979,9 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree, spin_lock(&tree->lru_lock); eb = find_lru(tree, start, len); - if (eb) + if (eb) { goto lru_add; + } spin_unlock(&tree->lru_lock); if (eb) { @@ -2007,6 +2008,7 @@ static void __free_extent_buffer(struct extent_buffer *eb) struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, u64 start, unsigned long len, + struct page *page0, gfp_t mask) { unsigned long num_pages = num_extent_pages(start, len); @@ -2024,7 +2026,18 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, if (eb->flags & EXTENT_BUFFER_FILLED) return eb; - for (i = 0; i < num_pages; i++, index++) { + if (page0) { + eb->first_page = page0; + i = 1; + index++; + page_cache_get(page0); + set_page_extent_mapped(page0); + set_page_private(page0, EXTENT_PAGE_PRIVATE_FIRST_PAGE | + len << 2); + } else { + i = 0; + } + for (; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); if (!p) { WARN_ON(1); @@ -2036,8 +2049,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, goto fail; } set_page_extent_mapped(p); - if (i == 0) + if (i == 0) { eb->first_page = p; + set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE | + len << 2); + } else { + set_page_private(p, EXTENT_PAGE_PRIVATE); + } if (!PageUptodate(p)) uptodate = 0; unlock_page(p); @@ -2057,8 +2075,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, gfp_t mask) { unsigned long num_pages = num_extent_pages(start, len); - unsigned long i; - unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long i; unsigned long index = start >> PAGE_CACHE_SHIFT; struct extent_buffer *eb; struct page *p; struct address_space *mapping = tree->mapping; @@ -2082,8 +2099,15 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, goto fail; } set_page_extent_mapped(p); - if (i == 0) + + if (i == 0) { eb->first_page = p; + set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE | + len << 2); + } else { + set_page_private(p, EXTENT_PAGE_PRIVATE); + } + if (!PageUptodate(p)) uptodate = 0; unlock_page(p); @@ -2174,7 +2198,21 @@ int set_extent_buffer_dirty(struct extent_map_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { + struct page *page = extent_buffer_page(eb, i); + /* writepage may need to do something special for the + * first page, we have to make sure page->private is + * properly set. releasepage may drop page->private + * on us if the page isn't already dirty. + */ + if (i == 0) { + lock_page(page); + set_page_private(page, + EXTENT_PAGE_PRIVATE_FIRST_PAGE | + eb->len << 2); + } __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + if (i == 0) + unlock_page(page); } return set_extent_dirty(tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); @@ -2217,9 +2255,12 @@ int extent_buffer_uptodate(struct extent_map_tree *tree, EXPORT_SYMBOL(extent_buffer_uptodate); int read_extent_buffer_pages(struct extent_map_tree *tree, - struct extent_buffer *eb, int wait) + struct extent_buffer *eb, + u64 start, + int wait) { unsigned long i; + unsigned long start_i; struct page *page; int err; int ret = 0; @@ -2232,9 +2273,16 @@ int read_extent_buffer_pages(struct extent_map_tree *tree, EXTENT_UPTODATE, 1)) { return 0; } + if (start) { + WARN_ON(start < eb->start); + start_i = (start >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT); + } else { + start_i = 0; + } num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { + for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (PageUptodate(page)) { continue; @@ -2260,7 +2308,7 @@ int read_extent_buffer_pages(struct extent_map_tree *tree, return ret; } - for (i = 0; i < num_pages; i++) { + for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); wait_on_page_locked(page); if (!PageUptodate(page)) { @@ -2314,7 +2362,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } EXPORT_SYMBOL(read_extent_buffer); -static int __map_extent_buffer(struct extent_buffer *eb, unsigned long start, +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long min_len, char **token, char **map, unsigned long *map_start, unsigned long *map_len, int km) @@ -2337,6 +2385,10 @@ static int __map_extent_buffer(struct extent_buffer *eb, unsigned long start, offset = 0; *map_start = (i << PAGE_CACHE_SHIFT) - start_offset; } + if (start + min_len >= eb->len) { +printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len); + WARN_ON(1); + } p = extent_buffer_page(eb, i); WARN_ON(!PageUptodate(p)); @@ -2346,6 +2398,7 @@ static int __map_extent_buffer(struct extent_buffer *eb, unsigned long start, *map_len = PAGE_CACHE_SIZE - offset; return 0; } +EXPORT_SYMBOL(map_private_extent_buffer); int map_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long min_len, @@ -2360,8 +2413,8 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start, eb->map_token = NULL; save = 1; } - err = __map_extent_buffer(eb, start, min_len, token, map, - map_start, map_len, km); + err = map_private_extent_buffer(eb, start, min_len, token, map, + map_start, map_len, km); if (!err && save) { eb->map_token = *token; eb->kaddr = *map; -- cgit v1.2.3 From ff79f8190b6e955ff7a71faf804a3017d526e657 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:22:25 -0400 Subject: Btrfs: Add back file data checksumming Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2a8bc4bd43a..e87e476dca9 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -2031,6 +2031,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, i = 1; index++; page_cache_get(page0); + mark_page_accessed(page0); set_page_extent_mapped(page0); set_page_private(page0, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); @@ -2049,6 +2050,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, goto fail; } set_page_extent_mapped(p); + mark_page_accessed(p); if (i == 0) { eb->first_page = p; set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE | @@ -2099,6 +2101,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, goto fail; } set_page_extent_mapped(p); + mark_page_accessed(p); if (i == 0) { eb->first_page = p; -- cgit v1.2.3 From ae2f5411c4ce7180cca8418853db50c8e52d40db Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 19 Oct 2007 09:22:59 -0400 Subject: btrfs: 32-bit type problems An assorted set of casts to get rid of the warnings on 32-bit archs. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index e87e476dca9..89d0ebd0ea8 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -2654,8 +2654,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - src_off_in_page)); - cur = min(cur, (unsigned long)(PAGE_CACHE_SIZE - - dst_off_in_page)); + cur = min_t(unsigned long, cur, + (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); copy_pages(extent_buffer_page(dst, dst_i), extent_buffer_page(dst, src_i), @@ -2707,7 +2707,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, if (dst_i == 0) dst_off_in_page += start_offset; - cur = min(len, src_off_in_page + 1); + cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); move_pages(extent_buffer_page(dst, dst_i), extent_buffer_page(dst, src_i), -- cgit v1.2.3 From 0a2118dfd4c328e8823cdcfebf839dff73ac21e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 19 Oct 2007 09:23:05 -0400 Subject: Btrfs: Fix bi_end_io() functions on > 2.6.23 kernels It now returns void and it is never called for partial completions, so the bio->bi_size check must go. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 89d0ebd0ea8..c0f834563ce 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "extent_map.h" /* temporary define until extent_map moves out of btrfs */ @@ -1274,8 +1275,12 @@ static int check_page_writeback(struct extent_map_tree *tree, * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) +static void end_bio_extent_writepage(struct bio *bio, int err) +#else static int end_bio_extent_writepage(struct bio *bio, unsigned int bytes_done, int err) +#endif { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -1284,8 +1289,10 @@ static int end_bio_extent_writepage(struct bio *bio, u64 end; int whole_page; +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) if (bio->bi_size) return 1; +#endif do { struct page *page = bvec->bv_page; @@ -1316,7 +1323,9 @@ static int end_bio_extent_writepage(struct bio *bio, } while (bvec >= bio->bi_io_vec); bio_put(bio); +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) return 0; +#endif } /* @@ -1330,8 +1339,12 @@ static int end_bio_extent_writepage(struct bio *bio, * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) +static void end_bio_extent_readpage(struct bio *bio, int err) +#else static int end_bio_extent_readpage(struct bio *bio, unsigned int bytes_done, int err) +#endif { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -1341,8 +1354,10 @@ static int end_bio_extent_readpage(struct bio *bio, int whole_page; int ret; +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) if (bio->bi_size) return 1; +#endif do { struct page *page = bvec->bv_page; @@ -1382,7 +1397,9 @@ static int end_bio_extent_readpage(struct bio *bio, } while (bvec >= bio->bi_io_vec); bio_put(bio); +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) return 0; +#endif } /* @@ -1390,8 +1407,12 @@ static int end_bio_extent_readpage(struct bio *bio, * the structs in the extent tree when done, and set the uptodate bits * as appropriate. */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) +static void end_bio_extent_preparewrite(struct bio *bio, int err) +#else static int end_bio_extent_preparewrite(struct bio *bio, unsigned int bytes_done, int err) +#endif { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -1399,8 +1420,10 @@ static int end_bio_extent_preparewrite(struct bio *bio, u64 start; u64 end; +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) if (bio->bi_size) return 1; +#endif do { struct page *page = bvec->bv_page; @@ -1422,7 +1445,9 @@ static int end_bio_extent_preparewrite(struct bio *bio, } while (bvec >= bio->bi_io_vec); bio_put(bio); +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) return 0; +#endif } static int submit_extent_page(int rw, struct extent_map_tree *tree, -- cgit v1.2.3 From 59d169e2b3d69ba0beca0fa0936789080c8d61a9 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 19 Oct 2007 09:23:09 -0400 Subject: Btrfs: Fix read/write_extent_buffer to use KM_USER1 instead of KM_USER0 This avoids recursive use of KM_USER0 during btrfs_file_write Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index c0f834563ce..3c81f5eab15 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -2378,9 +2378,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, WARN_ON(!PageUptodate(page)); cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page, KM_USER1); memcpy(dst, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr, KM_USER1); dst += cur; len -= cur; @@ -2523,9 +2523,9 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page, KM_USER1); memcpy(kaddr + offset, src, cur); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr, KM_USER1); src += cur; len -= cur; -- cgit v1.2.3 From 3685f791659c9f21b763ee1702ac8ca58bc20f81 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 19 Oct 2007 09:23:27 -0400 Subject: Btrfs: CPU usage optimizations in push and the extent_map code Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 45 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 3c81f5eab15..caaf0bf0e05 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1986,12 +1986,15 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { struct page *p; + struct address_space *mapping; if (i == 0) return eb->first_page; i += eb->start >> PAGE_CACHE_SHIFT; - p = find_get_page(eb->first_page->mapping, i); - page_cache_release(p); + mapping = eb->first_page->mapping; + read_lock_irq(&mapping->tree_lock); + p = radix_tree_lookup(&mapping->page_tree, i); + read_unlock_irq(&mapping->tree_lock); return p; } @@ -2365,9 +2368,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); - if (i == 0) - offset += start_offset; + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); while(len > 0) { page = extent_buffer_page(eb, i); @@ -2475,9 +2476,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); - if (i == 0) - offset += start_offset; + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); while(len > 0) { page = extent_buffer_page(eb, i); @@ -2514,9 +2513,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); - if (i == 0) - offset += start_offset; + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); while(len > 0) { page = extent_buffer_page(eb, i); @@ -2548,9 +2545,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); - if (i == 0) - offset += start_offset; + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); while(len > 0) { page = extent_buffer_page(eb, i); @@ -2582,9 +2577,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, WARN_ON(src->len != dst_len); - offset = dst_offset & ((unsigned long)PAGE_CACHE_SIZE - 1); - if (i == 0) - offset += start_offset; + offset = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); while(len > 0) { page = extent_buffer_page(dst, i); @@ -2664,19 +2658,14 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } while(len > 0) { - dst_off_in_page = dst_offset & + dst_off_in_page = (start_offset + dst_offset) & ((unsigned long)PAGE_CACHE_SIZE - 1); - src_off_in_page = src_offset & + src_off_in_page = (start_offset + src_offset) & ((unsigned long)PAGE_CACHE_SIZE - 1); dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; - if (src_i == 0) - src_off_in_page += start_offset; - if (dst_i == 0) - dst_off_in_page += start_offset; - cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - src_off_in_page)); cur = min_t(unsigned long, cur, @@ -2723,14 +2712,10 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; - dst_off_in_page = dst_end & + dst_off_in_page = (start_offset + dst_end) & ((unsigned long)PAGE_CACHE_SIZE - 1); - src_off_in_page = src_end & + src_off_in_page = (start_offset + src_end) & ((unsigned long)PAGE_CACHE_SIZE - 1); - if (src_i == 0) - src_off_in_page += start_offset; - if (dst_i == 0) - dst_off_in_page += start_offset; cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); -- cgit v1.2.3 From ff190c0c004d8e51195c7bcf5a8490aeefccbce0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 19 Oct 2007 10:39:41 -0400 Subject: Btrfs: Avoid recursive KM_USER1 mappings in copy_extent_buffer Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index caaf0bf0e05..a61379230dd 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -2586,9 +2586,9 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = kmap_atomic(page, KM_USER0); read_extent_buffer(src, kaddr + offset, src_offset, cur); - kunmap_atomic(kaddr, KM_USER1); + kunmap_atomic(kaddr, KM_USER0); src_offset += cur; len -= cur; -- cgit v1.2.3 From 65555a06b4d1ae116ce223dc4b82d6068b36df96 Mon Sep 17 00:00:00 2001 From: Yan Date: Thu, 25 Oct 2007 15:42:57 -0400 Subject: Btrfs: Off by one fixes in extent_map.c Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a61379230dd..c44989a1e52 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -2045,7 +2045,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, struct extent_buffer *eb; struct page *p; struct address_space *mapping = tree->mapping; - int uptodate = 0; + int uptodate = 1; eb = __alloc_extent_buffer(tree, start, len, mask); if (!eb || IS_ERR(eb)) @@ -2197,7 +2197,7 @@ int clear_extent_buffer_dirty(struct extent_map_tree *tree, */ if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || ((i == num_pages - 1) && - ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) { + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { start = page->index << PAGE_CACHE_SHIFT; end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, @@ -2265,7 +2265,7 @@ int set_extent_buffer_uptodate(struct extent_map_tree *tree, page = extent_buffer_page(eb, i); if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || ((i == num_pages - 1) && - ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) { + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { check_page_uptodate(tree, page); continue; } @@ -2401,7 +2401,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, struct page *p; size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - unsigned long end_i = (start_offset + start + min_len) >> + unsigned long end_i = (start_offset + start + min_len - 1) >> PAGE_CACHE_SHIFT; if (i != end_i) @@ -2414,7 +2414,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, offset = 0; *map_start = (i << PAGE_CACHE_SHIFT) - start_offset; } - if (start + min_len >= eb->len) { + if (start + min_len > eb->len) { printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len); WARN_ON(1); } -- cgit v1.2.3 From c67cda1758130f3e437744c4c2ec4a7cc25564e3 Mon Sep 17 00:00:00 2001 From: Yan Date: Mon, 29 Oct 2007 11:41:05 -0400 Subject: Btrfs: Fix extent_map leak in extent_bmap Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index c44989a1e52..e4ee980cc13 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1924,6 +1924,7 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock, struct inode *inode = mapping->host; u64 start = iblock << inode->i_blkbits; u64 end = start + (1 << inode->i_blkbits) - 1; + sector_t sector = 0; struct extent_map *em; em = get_extent(inode, NULL, 0, start, end, 0); @@ -1932,9 +1933,12 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock, if (em->block_start == EXTENT_MAP_INLINE || em->block_start == EXTENT_MAP_HOLE) - return 0; + goto out; - return (em->block_start + start - em->start) >> inode->i_blkbits; + sector = (em->block_start + start - em->start) >> inode->i_blkbits; +out: + free_extent_map(em); + return sector; } static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb) -- cgit v1.2.3 From 35ebb934bd7fcc7ca991b155b7980c3c4ff9f1a5 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 30 Oct 2007 16:56:53 -0400 Subject: Btrfs: Fix PAGE_CACHE_SHIFT shifts on 32 bit machines Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index e4ee980cc13..238cb1d81d5 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1229,7 +1229,7 @@ EXPORT_SYMBOL(test_range_bit); static int check_page_uptodate(struct extent_map_tree *tree, struct page *page) { - u64 start = page->index << PAGE_CACHE_SHIFT; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) SetPageUptodate(page); @@ -1243,7 +1243,7 @@ static int check_page_uptodate(struct extent_map_tree *tree, static int check_page_locked(struct extent_map_tree *tree, struct page *page) { - u64 start = page->index << PAGE_CACHE_SHIFT; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) unlock_page(page); @@ -1257,7 +1257,7 @@ static int check_page_locked(struct extent_map_tree *tree, static int check_page_writeback(struct extent_map_tree *tree, struct page *page) { - u64 start = page->index << PAGE_CACHE_SHIFT; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) end_page_writeback(page); @@ -1296,7 +1296,8 @@ static int end_bio_extent_writepage(struct bio *bio, do { struct page *page = bvec->bv_page; - start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; end = start + bvec->bv_len - 1; if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) @@ -1361,7 +1362,8 @@ static int end_bio_extent_readpage(struct bio *bio, do { struct page *page = bvec->bv_page; - start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; end = start + bvec->bv_len - 1; if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) @@ -1427,7 +1429,8 @@ static int end_bio_extent_preparewrite(struct bio *bio, do { struct page *page = bvec->bv_page; - start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; end = start + bvec->bv_len - 1; if (--bvec >= bio->bi_io_vec) @@ -1503,7 +1506,7 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page, get_extent_t *get_extent) { struct inode *inode = page->mapping->host; - u64 start = page->index << PAGE_CACHE_SHIFT; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; @@ -1608,7 +1611,7 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - u64 start = page->index << PAGE_CACHE_SHIFT; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; @@ -1750,7 +1753,7 @@ EXPORT_SYMBOL(extent_write_full_page); int extent_invalidatepage(struct extent_map_tree *tree, struct page *page, unsigned long offset) { - u64 start = (page->index << PAGE_CACHE_SHIFT); + u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); u64 end = start + PAGE_CACHE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; @@ -1792,7 +1795,7 @@ int extent_prepare_write(struct extent_map_tree *tree, struct inode *inode, struct page *page, unsigned from, unsigned to, get_extent_t *get_extent) { - u64 page_start = page->index << PAGE_CACHE_SHIFT; + u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; u64 page_end = page_start + PAGE_CACHE_SIZE - 1; u64 block_start; u64 orig_block_start; @@ -1890,7 +1893,7 @@ EXPORT_SYMBOL(extent_prepare_write); int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page) { struct extent_map *em; - u64 start = page->index << PAGE_CACHE_SHIFT; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; u64 orig_start = start; int ret = 1; @@ -2202,7 +2205,7 @@ int clear_extent_buffer_dirty(struct extent_map_tree *tree, if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || ((i == num_pages - 1) && ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - start = page->index << PAGE_CACHE_SHIFT; + start = (u64)page->index << PAGE_CACHE_SHIFT; end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_DIRTY, 0)) { -- cgit v1.2.3 From 179e29e488cc74f1e9bd67bc45f70b832740e9ec Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 1 Nov 2007 11:28:41 -0400 Subject: Btrfs: Fix a number of inline extent problems that Yan Zheng reported. The fixes do a number of things: 1) Most btrfs_drop_extent callers will try to leave the inline extents in place. It can truncate bytes off the beginning of the inline extent if required. 2) writepage can now update the inline extent, allowing mmap writes to go directly into the inline extent. 3) btrfs_truncate_in_transaction truncates inline extents 4) extent_map.c fixed to not merge inline extent mappings and hole mappings together Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 238cb1d81d5..44be9cfd30e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -263,7 +263,12 @@ int add_extent_mapping(struct extent_map_tree *tree, if (prev && prev->end + 1 == em->start && ((em->block_start == EXTENT_MAP_HOLE && prev->block_start == EXTENT_MAP_HOLE) || - (em->block_start == prev->block_end + 1))) { + (em->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + (em->block_start == EXTENT_MAP_DELALLOC && + prev->block_start == EXTENT_MAP_DELALLOC) || + (em->block_start < EXTENT_MAP_DELALLOC - 1 && + em->block_start == prev->block_end + 1))) { em->start = prev->start; em->block_start = prev->block_start; rb_erase(&prev->rb_node, &tree->map); @@ -1618,13 +1623,13 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, u64 extent_offset; u64 last_byte = i_size_read(inode); u64 block_start; + u64 iosize; sector_t sector; struct extent_map *em; struct block_device *bdev; int ret; int nr = 0; size_t page_offset = 0; - size_t iosize; size_t blocksize; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; @@ -1684,7 +1689,7 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, clear_extent_dirty(tree, cur, page_end, GFP_NOFS); break; } - em = get_extent(inode, page, page_offset, cur, end, 0); + em = get_extent(inode, page, page_offset, cur, end, 1); if (IS_ERR(em) || !em) { SetPageError(page); break; -- cgit v1.2.3 From 944746ec7575258475958d858cdf3a0e4c8f7b9e Mon Sep 17 00:00:00 2001 From: Yan Date: Thu, 1 Nov 2007 11:28:42 -0400 Subject: Btrfs: small fixes for find_lock_delalloc_range. There is a 'finish_wait', but no 'prepare_to_wait' . So I think that the 'prepare_to_wait' is missing. The second change is according to the name of variable. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 44be9cfd30e..ff8881fb56d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1044,6 +1044,8 @@ search_again: if (state->state & EXTENT_LOCKED) { DEFINE_WAIT(wait); atomic_inc(&state->refs); + prepare_to_wait(&state->wq, &wait, + TASK_UNINTERRUPTIBLE); write_unlock_irq(&tree->lock); schedule(); write_lock_irq(&tree->lock); @@ -1059,7 +1061,7 @@ search_again: node = rb_next(node); if (!node) break; - total_bytes = state->end - state->start + 1; + total_bytes += state->end - state->start + 1; if (total_bytes >= max_bytes) break; } -- cgit v1.2.3 From b293f02e1423f2099744f3ade23ddd83b65321fc Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 1 Nov 2007 19:45:34 -0400 Subject: Btrfs: Add writepages support Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 135 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 110 insertions(+), 25 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index ff8881fb56d..b0677c84bb7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "extent_map.h" /* temporary define until extent_map moves out of btrfs */ @@ -35,6 +36,12 @@ struct tree_entry { struct rb_node rb_node; }; +struct extent_page_data { + struct bio *bio; + struct extent_map_tree *tree; + get_extent_t *get_extent; +}; + void __init extent_map_init(void) { extent_map_cache = btrfs_cache_create("extent_map", @@ -1460,40 +1467,76 @@ static int end_bio_extent_preparewrite(struct bio *bio, #endif } -static int submit_extent_page(int rw, struct extent_map_tree *tree, - struct page *page, sector_t sector, - size_t size, unsigned long offset, - struct block_device *bdev, - bio_end_io_t end_io_func) +static struct bio * +extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) { struct bio *bio; - int ret = 0; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(gfp_flags, nr_vecs); - bio->bi_sector = sector; - bio->bi_bdev = bdev; - bio->bi_io_vec[0].bv_page = page; - bio->bi_io_vec[0].bv_len = size; - bio->bi_io_vec[0].bv_offset = offset; - - bio->bi_vcnt = 1; - bio->bi_idx = 0; - bio->bi_size = size; + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } - bio->bi_end_io = end_io_func; - bio->bi_private = tree; + if (bio) { + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + } + return bio; +} +static int submit_one_bio(int rw, struct bio *bio) +{ + int ret = 0; bio_get(bio); submit_bio(rw, bio); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; - bio_put(bio); return ret; } +static int submit_extent_page(int rw, struct extent_map_tree *tree, + struct page *page, sector_t sector, + size_t size, unsigned long offset, + struct block_device *bdev, + struct bio **bio_ret, + int max_pages, + bio_end_io_t end_io_func) +{ + int ret = 0; + struct bio *bio; + int nr; + + if (bio_ret && *bio_ret) { + bio = *bio_ret; + if (bio->bi_sector + (bio->bi_size >> 9) != sector || + bio_add_page(bio, page, size, offset) < size) { + ret = submit_one_bio(rw, bio); + bio = NULL; + } else { + return 0; + } + } + nr = min(max_pages, bio_get_nr_vecs(bdev)); + bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + if (!bio) { + printk("failed to allocate bio nr %d\n", nr); + } + bio_add_page(bio, page, size, offset); + bio->bi_end_io = end_io_func; + bio->bi_private = tree; + if (bio_ret) { + *bio_ret = bio; + } else { + ret = submit_one_bio(rw, bio); + } + + return ret; +} + void set_page_extent_mapped(struct page *page) { if (!PagePrivate(page)) { @@ -1590,7 +1633,8 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page, if (!ret) { ret = submit_extent_page(READ, tree, page, sector, iosize, page_offset, - bdev, end_bio_extent_readpage); + bdev, NULL, 1, + end_bio_extent_readpage); } if (ret) SetPageError(page); @@ -1613,11 +1657,12 @@ EXPORT_SYMBOL(extent_read_full_page); * are found, they are marked writeback. Then the lock bits are removed * and the end_io handler clears the writeback ranges */ -int extent_write_full_page(struct extent_map_tree *tree, struct page *page, - get_extent_t *get_extent, - struct writeback_control *wbc) +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) { struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + struct extent_map_tree *tree = epd->tree; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; @@ -1691,7 +1736,7 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, clear_extent_dirty(tree, cur, page_end, GFP_NOFS); break; } - em = get_extent(inode, page, page_offset, cur, end, 1); + em = epd->get_extent(inode, page, page_offset, cur, end, 1); if (IS_ERR(em) || !em) { SetPageError(page); break; @@ -1734,9 +1779,12 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, if (ret) SetPageError(page); else { + unsigned long nr = end_index + 1; set_range_writeback(tree, cur, cur + iosize - 1); + ret = submit_extent_page(WRITE, tree, page, sector, iosize, page_offset, bdev, + &epd->bio, nr, end_bio_extent_writepage); if (ret) SetPageError(page); @@ -1750,8 +1798,44 @@ done: unlock_page(page); return 0; } + +int extent_write_full_page(struct extent_map_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + }; + + ret = __extent_writepage(page, wbc, &epd); + if (epd.bio) + submit_one_bio(WRITE, epd.bio); + return ret; +} EXPORT_SYMBOL(extent_write_full_page); +int extent_writepages(struct extent_map_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + }; + + ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd); + if (epd.bio) + submit_one_bio(WRITE, epd.bio); + return ret; +} +EXPORT_SYMBOL(extent_writepages); + /* * basic invalidatepage code, this waits on any locked or writeback * ranges corresponding to the page, and then deletes any extent state @@ -1869,6 +1953,7 @@ int extent_prepare_write(struct extent_map_tree *tree, EXTENT_LOCKED, 0, NULL, GFP_NOFS); ret = submit_extent_page(READ, tree, page, sector, iosize, page_offset, em->bdev, + NULL, 1, end_bio_extent_preparewrite); iocount++; block_start = block_start + iosize; -- cgit v1.2.3 From 09be207d1ba224531a61de9afdc07a125e45318c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 7 Nov 2007 21:08:16 -0500 Subject: Btrfs: Fix failure cleanups when allocating extent buffers fail Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 56 +++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 26 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b0677c84bb7..c976615dcda 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -2106,25 +2106,17 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree, spin_lock(&tree->lru_lock); eb = find_lru(tree, start, len); - if (eb) { - goto lru_add; - } spin_unlock(&tree->lru_lock); - if (eb) { - memset(eb, 0, sizeof(*eb)); - } else { - eb = kmem_cache_zalloc(extent_buffer_cache, mask); + return eb; } + + eb = kmem_cache_zalloc(extent_buffer_cache, mask); INIT_LIST_HEAD(&eb->lru); eb->start = start; eb->len = len; atomic_set(&eb->refs, 1); - spin_lock(&tree->lru_lock); -lru_add: - add_lru(tree, eb); - spin_unlock(&tree->lru_lock); return eb; } @@ -2151,7 +2143,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, return NULL; if (eb->flags & EXTENT_BUFFER_FILLED) - return eb; + goto lru_add; if (page0) { eb->first_page = page0; @@ -2169,11 +2161,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); if (!p) { WARN_ON(1); - /* make sure the free only frees the pages we've - * grabbed a reference on - */ - eb->len = i << PAGE_CACHE_SHIFT; - eb->start &= ~((u64)PAGE_CACHE_SIZE - 1); goto fail; } set_page_extent_mapped(p); @@ -2192,9 +2179,20 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, if (uptodate) eb->flags |= EXTENT_UPTODATE; eb->flags |= EXTENT_BUFFER_FILLED; + +lru_add: + spin_lock(&tree->lru_lock); + add_lru(tree, eb); + spin_unlock(&tree->lru_lock); return eb; + fail: - free_extent_buffer(eb); + if (!atomic_dec_and_test(&eb->refs)) + return NULL; + for (index = 0; index < i; index++) { + page_cache_release(extent_buffer_page(eb, index)); + } + __free_extent_buffer(eb); return NULL; } EXPORT_SYMBOL(alloc_extent_buffer); @@ -2204,7 +2202,8 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, gfp_t mask) { unsigned long num_pages = num_extent_pages(start, len); - unsigned long i; unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; struct extent_buffer *eb; struct page *p; struct address_space *mapping = tree->mapping; @@ -2215,16 +2214,11 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, return NULL; if (eb->flags & EXTENT_BUFFER_FILLED) - return eb; + goto lru_add; for (i = 0; i < num_pages; i++, index++) { p = find_lock_page(mapping, index); if (!p) { - /* make sure the free only frees the pages we've - * grabbed a reference on - */ - eb->len = i << PAGE_CACHE_SHIFT; - eb->start &= ~((u64)PAGE_CACHE_SIZE - 1); goto fail; } set_page_extent_mapped(p); @@ -2245,9 +2239,19 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, if (uptodate) eb->flags |= EXTENT_UPTODATE; eb->flags |= EXTENT_BUFFER_FILLED; + +lru_add: + spin_lock(&tree->lru_lock); + add_lru(tree, eb); + spin_unlock(&tree->lru_lock); return eb; fail: - free_extent_buffer(eb); + if (!atomic_dec_and_test(&eb->refs)) + return NULL; + for (index = 0; index < i; index++) { + page_cache_release(extent_buffer_page(eb, index)); + } + __free_extent_buffer(eb); return NULL; } EXPORT_SYMBOL(find_extent_buffer); -- cgit v1.2.3 From 856bf3e592f917e7d663cb7fa93e83fe795e8f4e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 8 Nov 2007 10:59:05 -0500 Subject: Btrfs: Avoid extent_buffer lru corruption Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index c976615dcda..754bc42c162 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -2047,7 +2047,7 @@ static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb) rm = list_entry(tree->buffer_lru.prev, struct extent_buffer, lru); tree->lru_size--; - list_del(&rm->lru); + list_del_init(&rm->lru); free_extent_buffer(rm); } } else @@ -2187,6 +2187,9 @@ lru_add: return eb; fail: + spin_lock(&tree->lru_lock); + list_del_init(&eb->lru); + spin_unlock(&tree->lru_lock); if (!atomic_dec_and_test(&eb->refs)) return NULL; for (index = 0; index < i; index++) { @@ -2246,6 +2249,9 @@ lru_add: spin_unlock(&tree->lru_lock); return eb; fail: + spin_lock(&tree->lru_lock); + list_del_init(&eb->lru); + spin_unlock(&tree->lru_lock); if (!atomic_dec_and_test(&eb->refs)) return NULL; for (index = 0; index < i; index++) { -- cgit v1.2.3 From 3ab2fb5a8cb003897016b6eb38ddad916226c1b2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 8 Nov 2007 10:59:22 -0500 Subject: Btrfs: Add readpages support Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 70 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 754bc42c162..0077c6c1d9f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "extent_map.h" /* temporary define until extent_map moves out of btrfs */ @@ -1503,7 +1504,7 @@ static int submit_extent_page(int rw, struct extent_map_tree *tree, size_t size, unsigned long offset, struct block_device *bdev, struct bio **bio_ret, - int max_pages, + unsigned long max_pages, bio_end_io_t end_io_func) { int ret = 0; @@ -1520,7 +1521,7 @@ static int submit_extent_page(int rw, struct extent_map_tree *tree, return 0; } } - nr = min(max_pages, bio_get_nr_vecs(bdev)); + nr = min_t(int, max_pages, bio_get_nr_vecs(bdev)); bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); if (!bio) { printk("failed to allocate bio nr %d\n", nr); @@ -1552,8 +1553,10 @@ void set_page_extent_mapped(struct page *page) * into the tree that are removed when the IO is done (by the end_io * handlers) */ -int extent_read_full_page(struct extent_map_tree *tree, struct page *page, - get_extent_t *get_extent) +static int __extent_read_full_page(struct extent_map_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio) { struct inode *inode = page->mapping->host; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; @@ -1631,10 +1634,12 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page, cur + iosize - 1); } if (!ret) { + unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + nr -= page->index; ret = submit_extent_page(READ, tree, page, - sector, iosize, page_offset, - bdev, NULL, 1, - end_bio_extent_readpage); + sector, iosize, page_offset, + bdev, bio, nr, + end_bio_extent_readpage); } if (ret) SetPageError(page); @@ -1649,6 +1654,18 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page, } return 0; } + +int extent_read_full_page(struct extent_map_tree *tree, struct page *page, + get_extent_t *get_extent) +{ + struct bio *bio = NULL; + int ret; + + ret = __extent_read_full_page(tree, page, get_extent, &bio); + if (bio) + submit_one_bio(READ, bio); + return ret; +} EXPORT_SYMBOL(extent_read_full_page); /* @@ -1836,6 +1853,45 @@ int extent_writepages(struct extent_map_tree *tree, } EXPORT_SYMBOL(extent_writepages); +int extent_readpages(struct extent_map_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent) +{ + struct bio *bio = NULL; + unsigned page_idx; + struct pagevec pvec; + + pagevec_init(&pvec, 0); + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + /* + * what we want to do here is call add_to_page_cache_lru, + * but that isn't exported, so we reproduce it here + */ + if (!add_to_page_cache(page, mapping, + page->index, GFP_KERNEL)) { + + /* open coding of lru_cache_add, also not exported */ + page_cache_get(page); + if (!pagevec_add(&pvec, page)) + __pagevec_lru_add(&pvec); + __extent_read_full_page(tree, page, get_extent, &bio); + } + page_cache_release(page); + } + if (pagevec_count(&pvec)) + __pagevec_lru_add(&pvec); + BUG_ON(!list_empty(pages)); + if (bio) + submit_one_bio(READ, bio); + return 0; +} +EXPORT_SYMBOL(extent_readpages); + /* * basic invalidatepage code, this waits on any locked or writeback * ranges corresponding to the page, and then deletes any extent state -- cgit v1.2.3 From 0591fb56fb732c7bf500d1446d0bf927046d7002 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sun, 11 Nov 2007 08:22:00 -0500 Subject: Btrfs: Fix extent bit range testing It could return the bit as set when there was actually a hole at the very end of the range. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 0077c6c1d9f..b3ff63c4bf2 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -96,7 +96,7 @@ void extent_map_tree_empty_lru(struct extent_map_tree *tree) while(!list_empty(&tree->buffer_lru)) { eb = list_entry(tree->buffer_lru.next, struct extent_buffer, lru); - list_del(&eb->lru); + list_del_init(&eb->lru); free_extent_buffer(eb); } } @@ -1212,13 +1212,15 @@ int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end, node = tree_search(&tree->state, start); while (node && start <= end) { state = rb_entry(node, struct extent_state, rb_node); - if (state->start > end) - break; if (filled && state->start > start) { bitset = 0; break; } + + if (state->start > end) + break; + if (state->state & bits) { bitset = 1; if (!filled) @@ -2208,6 +2210,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, page_cache_get(page0); mark_page_accessed(page0); set_page_extent_mapped(page0); + WARN_ON(!PageUptodate(page0)); set_page_private(page0, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); } else { @@ -2248,9 +2251,11 @@ fail: spin_unlock(&tree->lru_lock); if (!atomic_dec_and_test(&eb->refs)) return NULL; - for (index = 0; index < i; index++) { + for (index = 1; index < i; index++) { page_cache_release(extent_buffer_page(eb, index)); } + if (i > 0) + page_cache_release(extent_buffer_page(eb, 0)); __free_extent_buffer(eb); return NULL; } @@ -2310,9 +2315,11 @@ fail: spin_unlock(&tree->lru_lock); if (!atomic_dec_and_test(&eb->refs)) return NULL; - for (index = 0; index < i; index++) { + for (index = 1; index < i; index++) { page_cache_release(extent_buffer_page(eb, index)); } + if (i > 0) + page_cache_release(extent_buffer_page(eb, 0)); __free_extent_buffer(eb); return NULL; } @@ -2329,11 +2336,13 @@ void free_extent_buffer(struct extent_buffer *eb) if (!atomic_dec_and_test(&eb->refs)) return; + WARN_ON(!list_empty(&eb->lru)); num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { + for (i = 1; i < num_pages; i++) { page_cache_release(extent_buffer_page(eb, i)); } + page_cache_release(extent_buffer_page(eb, 0)); __free_extent_buffer(eb); } EXPORT_SYMBOL(free_extent_buffer); @@ -2469,6 +2478,7 @@ int read_extent_buffer_pages(struct extent_map_tree *tree, EXTENT_UPTODATE, 1)) { return 0; } + if (start) { WARN_ON(start < eb->start); start_i = (start >> PAGE_CACHE_SHIFT) - @@ -2577,7 +2587,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, *map_start = 0; } else { offset = 0; - *map_start = (i << PAGE_CACHE_SHIFT) - start_offset; + *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; } if (start + min_len > eb->len) { printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len); -- cgit v1.2.3 From 2f4cbe6442d3be7b5e4cf1607a5ab33995fe2d25 Mon Sep 17 00:00:00 2001 From: Wyatt Banks Date: Mon, 19 Nov 2007 10:22:33 -0500 Subject: Btrfs: Return value checking in module init Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b3ff63c4bf2..f91f28efdb5 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -42,18 +42,30 @@ struct extent_page_data { struct extent_map_tree *tree; get_extent_t *get_extent; }; - -void __init extent_map_init(void) +int __init extent_map_init(void) { extent_map_cache = btrfs_cache_create("extent_map", sizeof(struct extent_map), 0, NULL); + if (!extent_map_cache) + return -ENOMEM; extent_state_cache = btrfs_cache_create("extent_state", sizeof(struct extent_state), 0, NULL); + if (!extent_state_cache) + goto free_map_cache; extent_buffer_cache = btrfs_cache_create("extent_buffers", sizeof(struct extent_buffer), 0, NULL); + if (!extent_buffer_cache) + goto free_state_cache; + return 0; + +free_state_cache: + kmem_cache_destroy(extent_state_cache); +free_map_cache: + kmem_cache_destroy(extent_map_cache); + return -ENOMEM; } void __exit extent_map_exit(void) -- cgit v1.2.3 From 3e9fd94ff0028a044d55690eb0a801fd1472e3c6 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 20 Nov 2007 10:47:25 -0500 Subject: Btrfs: Avoid fragmentation from parallel delalloc filling Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 83 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 34 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f91f28efdb5..7fd4eb7a8f0 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1033,11 +1033,11 @@ out: EXPORT_SYMBOL(find_first_extent_bit); u64 find_lock_delalloc_range(struct extent_map_tree *tree, - u64 start, u64 lock_start, u64 *end, u64 max_bytes) + u64 *start, u64 *end, u64 max_bytes) { struct rb_node *node; struct extent_state *state; - u64 cur_start = start; + u64 cur_start = *start; u64 found = 0; u64 total_bytes = 0; @@ -1054,27 +1054,43 @@ search_again: while(1) { state = rb_entry(node, struct extent_state, rb_node); - if (state->start != cur_start) { + if (found && state->start != cur_start) { goto out; } if (!(state->state & EXTENT_DELALLOC)) { goto out; } - if (state->start >= lock_start) { - if (state->state & EXTENT_LOCKED) { - DEFINE_WAIT(wait); - atomic_inc(&state->refs); - prepare_to_wait(&state->wq, &wait, - TASK_UNINTERRUPTIBLE); - write_unlock_irq(&tree->lock); - schedule(); - write_lock_irq(&tree->lock); - finish_wait(&state->wq, &wait); - free_extent_state(state); - goto search_again; + if (!found) { + struct extent_state *prev_state; + struct rb_node *prev_node = node; + while(1) { + prev_node = rb_prev(prev_node); + if (!prev_node) + break; + prev_state = rb_entry(prev_node, + struct extent_state, + rb_node); + if (!(prev_state->state & EXTENT_DELALLOC)) + break; + state = prev_state; + node = prev_node; } - state->state |= EXTENT_LOCKED; } + if (state->state & EXTENT_LOCKED) { + DEFINE_WAIT(wait); + atomic_inc(&state->refs); + prepare_to_wait(&state->wq, &wait, + TASK_UNINTERRUPTIBLE); + write_unlock_irq(&tree->lock); + schedule(); + write_lock_irq(&tree->lock); + finish_wait(&state->wq, &wait); + free_extent_state(state); + goto search_again; + } + state->state |= EXTENT_LOCKED; + if (!found) + *start = state->start; found++; *end = state->end; cur_start = state->end + 1; @@ -1695,6 +1711,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd = data; struct extent_map_tree *tree = epd->tree; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 delalloc_start; u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; @@ -1729,25 +1746,23 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); - lock_extent(tree, start, page_end, GFP_NOFS); - nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1, - &delalloc_end, - 128 * 1024 * 1024); - if (nr_delalloc) { - tree->ops->fill_delalloc(inode, start, delalloc_end); - if (delalloc_end >= page_end + 1) { - clear_extent_bit(tree, page_end + 1, delalloc_end, - EXTENT_LOCKED | EXTENT_DELALLOC, - 1, 0, GFP_NOFS); - } - clear_extent_bit(tree, start, page_end, EXTENT_DELALLOC, - 0, 0, GFP_NOFS); - if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { - printk("found delalloc bits after clear extent_bit\n"); - } - } else if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { - printk("found delalloc bits after find_delalloc_range returns 0\n"); + delalloc_start = start; + delalloc_end = 0; + while(delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc <= 0) + break; + tree->ops->fill_delalloc(inode, delalloc_start, + delalloc_end); + clear_extent_bit(tree, delalloc_start, + delalloc_end, + EXTENT_LOCKED | EXTENT_DELALLOC, + 1, 0, GFP_NOFS); + delalloc_start = delalloc_end + 1; } + lock_extent(tree, start, page_end, GFP_NOFS); end = page_end; if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { -- cgit v1.2.3 From 7073c8e852946274e4d50fdf072438612f9dc845 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 20 Nov 2007 13:44:45 -0500 Subject: Btrfs: Make sure page mapping dirty tag is properly cleared Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7fd4eb7a8f0..a4e9096754f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1825,12 +1825,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret) SetPageError(page); else { - unsigned long nr = end_index + 1; + unsigned long max_nr = end_index + 1; set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + printk("warning page %lu not writeback, " + "cur %llu end %llu\n", page->index, + (unsigned long long)cur, + (unsigned long long)end); + } ret = submit_extent_page(WRITE, tree, page, sector, iosize, page_offset, bdev, - &epd->bio, nr, + &epd->bio, max_nr, end_bio_extent_writepage); if (ret) SetPageError(page); @@ -1840,6 +1846,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, nr++; } done: + if (nr == 0) { + /* make sure the mapping tag for page dirty gets cleared */ + set_page_writeback(page); + end_page_writeback(page); + } unlock_extent(tree, start, page_end, GFP_NOFS); unlock_page(page); return 0; @@ -2408,6 +2419,13 @@ int clear_extent_buffer_dirty(struct extent_map_tree *tree, } } clear_page_dirty_for_io(page); + write_lock_irq(&page->mapping->tree_lock); + if (!PageDirty(page)) { + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + } + write_unlock_irq(&page->mapping->tree_lock); unlock_page(page); } return 0; -- cgit v1.2.3 From 015a739c7c238768fbfa4eea8ea2ebc1a35e7bb1 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 26 Nov 2007 16:15:16 -0800 Subject: Btrfs: Handle writeback under high memory pressure better Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a4e9096754f..55f272c335c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1861,13 +1861,25 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, struct writeback_control *wbc) { int ret; + struct address_space *mapping = page->mapping; struct extent_page_data epd = { .bio = NULL, .tree = tree, .get_extent = get_extent, }; + struct writeback_control wbc_writepages = { + .bdi = wbc->bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = 64, + .range_start = page_offset(page) + PAGE_CACHE_SIZE, + .range_end = (loff_t)-1, + }; + ret = __extent_writepage(page, wbc, &epd); + + write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd); if (epd.bio) submit_one_bio(WRITE, epd.bio); return ret; -- cgit v1.2.3 From 793955bca66c99defdffc857ae6eb7e8431d6bbe Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 26 Nov 2007 16:34:41 -0800 Subject: Btrfs: Limit btree writeback to prevent seeks Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 55f272c335c..b6a4974ecc2 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1106,6 +1106,45 @@ out: return found; } +u64 count_range_bits(struct extent_map_tree *tree, + u64 *start, u64 max_bytes, unsigned long bits) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 total_bytes = 0; + int found = 0; + + write_lock_irq(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(&tree->state, cur_start); + if (!node || IS_ERR(node)) { + goto out; + } + + while(1) { + state = rb_entry(node, struct extent_state, rb_node); + if ((state->state & bits)) { + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = state->start; + found = 1; + } + } + node = rb_next(node); + if (!node) + break; + } +out: + write_unlock_irq(&tree->lock); + return total_bytes; +} + /* * helper function to lock both pages and extents in the tree. * pages must be locked first. -- cgit v1.2.3 From ca6646264b7dab662d84435441164bb2a8e8885a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 27 Nov 2007 11:16:35 -0500 Subject: Btrfs: Add efficient dirty accounting to the extent_map tree Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b6a4974ecc2..06e437723dc 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -42,6 +42,7 @@ struct extent_page_data { struct extent_map_tree *tree; get_extent_t *get_extent; }; + int __init extent_map_init(void) { extent_map_cache = btrfs_cache_create("extent_map", @@ -94,6 +95,7 @@ void extent_map_tree_init(struct extent_map_tree *tree, tree->map.rb_node = NULL; tree->state.rb_node = NULL; tree->ops = NULL; + tree->dirty_bytes = 0; rwlock_init(&tree->lock); spin_lock_init(&tree->lru_lock); tree->mapping = mapping; @@ -414,6 +416,8 @@ static int insert_state(struct extent_map_tree *tree, printk("end < start %Lu %Lu\n", end, start); WARN_ON(1); } + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; state->state |= bits; state->start = start; state->end = end; @@ -476,6 +480,12 @@ static int clear_state_bit(struct extent_map_tree *tree, int delete) { int ret = state->state & bits; + + if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + WARN_ON(range > tree->dirty_bytes); + tree->dirty_bytes -= range; + } state->state &= ~bits; if (wake) wake_up(&state->wq); @@ -668,6 +678,17 @@ out: } EXPORT_SYMBOL(wait_extent_bit); +static void set_state_bits(struct extent_map_tree *tree, + struct extent_state *state, + int bits) +{ + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } + state->state |= bits; +} + /* * set some bits on a range in the tree. This may require allocations * or sleeping, so the gfp mask is used to indicate what is allowed. @@ -727,7 +748,7 @@ again: err = -EEXIST; goto out; } - state->state |= bits; + set_state_bits(tree, state, bits); start = state->end + 1; merge_state(tree, state); goto search_again; @@ -762,7 +783,7 @@ again: if (err) goto out; if (state->end <= end) { - state->state |= bits; + set_state_bits(tree, state, bits); start = state->end + 1; merge_state(tree, state); } else { @@ -808,7 +829,7 @@ again: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - prealloc->state |= bits; + set_state_bits(tree, prealloc, bits); merge_state(tree, prealloc); prealloc = NULL; goto out; @@ -1116,6 +1137,11 @@ u64 count_range_bits(struct extent_map_tree *tree, int found = 0; write_lock_irq(&tree->lock); + if (bits == EXTENT_DIRTY) { + *start = 0; + total_bytes = tree->dirty_bytes; + goto out; + } /* * this search will find all the extents that end after * our range starts. -- cgit v1.2.3 From 17636e03f42a1a42fed3834859de4702bd655fd1 Mon Sep 17 00:00:00 2001 From: Christian Hesse Date: Tue, 11 Dec 2007 09:25:06 -0500 Subject: Btrfs: section mismatch warnings --Boundary-00=_CcOWHFYK4T+JwSj Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: 7bit Content-Disposition: inline Hello everybody, compiling btrfs into the kernel results in section mismatch warnings. __exit functions are called where they are not allowed to. The attached patch fixes this for me. Not sure if it is correct though. Signed-off-by: Christian Hesse -- Regards, Chris --Boundary-00=_CcOWHFYK4T+JwSj Content-Type: text/x-diff; charset="iso-8859-1"; name="btrfs-section_mismatches.patch" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="btrfs-section_mismatches.patch" Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 06e437723dc..8326a18413f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -69,7 +69,7 @@ free_map_cache: return -ENOMEM; } -void __exit extent_map_exit(void) +void extent_map_exit(void) { struct extent_state *state; -- cgit v1.2.3 From 6da6abae027e2dbc59bca5f4168b0760f25068c7 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 18 Dec 2007 16:15:09 -0500 Subject: Btrfs: Back port to 2.6.18-el kernels Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 168 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 160 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 8326a18413f..0d1e59a86e4 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1585,8 +1585,18 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, static int submit_one_bio(int rw, struct bio *bio) { + u64 maxsector; int ret = 0; + bio_get(bio); + + maxsector = bio->bi_bdev->bd_inode->i_size >> 9; + if (maxsector < bio->bi_sector) { + printk("sector too large max %Lu got %llu\n", maxsector, + (unsigned long long)bio->bi_sector); + WARN_ON(1); + } + submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; @@ -1678,8 +1688,12 @@ static int __extent_read_full_page(struct extent_map_tree *tree, while (cur <= end) { if (cur >= last_byte) { + char *userpage; iosize = PAGE_CACHE_SIZE - page_offset; - zero_user_page(page, page_offset, iosize, KM_USER0); + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); set_extent_uptodate(tree, cur, cur + iosize - 1, GFP_NOFS); unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); @@ -1707,7 +1721,12 @@ static int __extent_read_full_page(struct extent_map_tree *tree, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - zero_user_page(page, page_offset, iosize, KM_USER0); + char *userpage; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + set_extent_uptodate(tree, cur, cur + iosize - 1, GFP_NOFS); unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); @@ -1804,9 +1823,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (page->index == end_index) { + char *userpage; + size_t offset = i_size & (PAGE_CACHE_SIZE - 1); - zero_user_page(page, offset, - PAGE_CACHE_SIZE - offset, KM_USER0); + + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); } set_page_extent_mapped(page); @@ -1921,6 +1945,129 @@ done: return 0; } +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) + +/* Taken directly from 2.6.23 for 2.6.18 back port */ +typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, + void *data); + +/** + * write_cache_pages - walk the list of dirty pages of the given address space + * and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + */ +static int write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int range_whole = 0; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc, data); + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { + unlock_page(page); + ret = 0; + } + if (ret || (--(wbc->nr_to_write) <= 0)) + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + return ret; +} +#endif + int extent_write_full_page(struct extent_map_tree *tree, struct page *page, get_extent_t *get_extent, struct writeback_control *wbc) @@ -1945,18 +2092,20 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, ret = __extent_writepage(page, wbc, &epd); write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd); - if (epd.bio) + if (epd.bio) { submit_one_bio(WRITE, epd.bio); + } return ret; } EXPORT_SYMBOL(extent_write_full_page); + int extent_writepages(struct extent_map_tree *tree, struct address_space *mapping, get_extent_t *get_extent, struct writeback_control *wbc) { - int ret; + int ret = 0; struct extent_page_data epd = { .bio = NULL, .tree = tree, @@ -1964,8 +2113,9 @@ int extent_writepages(struct extent_map_tree *tree, }; ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd); - if (epd.bio) + if (epd.bio) { submit_one_bio(WRITE, epd.bio); + } return ret; } EXPORT_SYMBOL(extent_writepages); @@ -2106,7 +2256,9 @@ int extent_prepare_write(struct extent_map_tree *tree, flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); } - if (!isnew && !PageUptodate(page) && + if ((em->block_start != EXTENT_MAP_HOLE && + em->block_start != EXTENT_MAP_INLINE) && + !isnew && !PageUptodate(page) && (block_off_end > to || block_off_start < from) && !test_range_bit(tree, block_start, cur_end, EXTENT_UPTODATE, 1)) { -- cgit v1.2.3 From 190662b2128dd648749e197f5563e9f6bbb5e05c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 18 Dec 2007 16:25:45 -0500 Subject: Btrfs: Fix delayed allocation to avoid missing delalloc extents find_lock_delalloc_range could exit out too early Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 0d1e59a86e4..a0dff34dd43 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1070,6 +1070,7 @@ u64 find_lock_delalloc_range(struct extent_map_tree *tree, search_again: node = tree_search(&tree->state, cur_start); if (!node || IS_ERR(node)) { + *end = (u64)-1; goto out; } @@ -1079,6 +1080,8 @@ search_again: goto out; } if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; goto out; } if (!found) { @@ -1841,8 +1844,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start, &delalloc_end, 128 * 1024 * 1024); - if (nr_delalloc <= 0) - break; + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } tree->ops->fill_delalloc(inode, delalloc_start, delalloc_end); clear_extent_bit(tree, delalloc_start, -- cgit v1.2.3 From 1832a6d5ee3b1af61001cadba9e10da9e91af4a4 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 21 Dec 2007 16:27:21 -0500 Subject: Btrfs: Implement basic support for -ENOSPC This is intended to prevent accidentally filling the drive. A determined user can still make things oops. It includes some accounting of the current bytes under delayed allocation, but this will change as things get optimized Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a0dff34dd43..2b92f107027 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1131,7 +1131,8 @@ out: } u64 count_range_bits(struct extent_map_tree *tree, - u64 *start, u64 max_bytes, unsigned long bits) + u64 *start, u64 search_end, u64 max_bytes, + unsigned long bits) { struct rb_node *node; struct extent_state *state; @@ -1139,9 +1140,14 @@ u64 count_range_bits(struct extent_map_tree *tree, u64 total_bytes = 0; int found = 0; + if (search_end <= cur_start) { + printk("search_end %Lu start %Lu\n", search_end, cur_start); + WARN_ON(1); + return 0; + } + write_lock_irq(&tree->lock); - if (bits == EXTENT_DIRTY) { - *start = 0; + if (cur_start == 0 && bits == EXTENT_DIRTY) { total_bytes = tree->dirty_bytes; goto out; } @@ -1156,8 +1162,11 @@ u64 count_range_bits(struct extent_map_tree *tree, while(1) { state = rb_entry(node, struct extent_state, rb_node); - if ((state->state & bits)) { - total_bytes += state->end - state->start + 1; + if (state->start > search_end) + break; + if (state->end >= cur_start && (state->state & bits)) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); if (total_bytes >= max_bytes) break; if (!found) { @@ -1173,7 +1182,6 @@ out: write_unlock_irq(&tree->lock); return total_bytes; } - /* * helper function to lock both pages and extents in the tree. * pages must be locked first. -- cgit v1.2.3 From 5d4fb734b4cb89a3902d6de309af0f6f3e63fe73 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 21 Dec 2007 16:27:23 -0500 Subject: Btrfs: Fix an off by one in the extent_map prepare write code Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2b92f107027..f53d8c97621 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -2270,7 +2270,7 @@ int extent_prepare_write(struct extent_map_tree *tree, kunmap_atomic(kaddr, KM_USER0); } if ((em->block_start != EXTENT_MAP_HOLE && - em->block_start != EXTENT_MAP_INLINE) && + em->block_start != EXTENT_MAP_INLINE) && !isnew && !PageUptodate(page) && (block_off_end > to || block_off_start < from) && !test_range_bit(tree, block_start, cur_end, @@ -2279,7 +2279,7 @@ int extent_prepare_write(struct extent_map_tree *tree, u64 extent_offset = block_start - em->start; size_t iosize; sector = (em->block_start + extent_offset) >> 9; - iosize = (cur_end - block_start + blocksize - 1) & + iosize = (cur_end - block_start + blocksize) & ~((u64)blocksize - 1); /* * we've already got the extent locked, but we -- cgit v1.2.3 From bcd987feefe8da66bc59b4e6bd51761a9820588c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 9 Jan 2008 06:28:28 -0500 Subject: Btrfs: Remove extent_map debugging message Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f53d8c97621..9d6aefa937c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -273,7 +273,6 @@ int add_extent_mapping(struct extent_map_tree *tree, rb = tree_insert(&tree->map, em->end, &em->rb_node); if (rb) { prev = rb_entry(rb, struct extent_map, rb_node); - printk("found extent map %Lu %Lu on insert of %Lu %Lu\n", prev->start, prev->end, em->start, em->end); ret = -EEXIST; goto out; } -- cgit v1.2.3 From 55c69072d6bd5be170a85467f64a20963cddf490 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 9 Jan 2008 15:55:33 -0500 Subject: Btrfs: Fix extent_buffer usage when nodesize != leafsize Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 9d6aefa937c..f3a384ed700 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1663,6 +1663,13 @@ void set_page_extent_mapped(struct page *page) } } +void set_page_extent_head(struct page *page, unsigned long len) +{ + WARN_ON(page->private && page->private == EXTENT_PAGE_PRIVATE && + PageDirty(page)); + set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); +} + /* * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io @@ -2490,8 +2497,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, mark_page_accessed(page0); set_page_extent_mapped(page0); WARN_ON(!PageUptodate(page0)); - set_page_private(page0, EXTENT_PAGE_PRIVATE_FIRST_PAGE | - len << 2); + set_page_extent_head(page0, len); } else { i = 0; } @@ -2505,8 +2511,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, mark_page_accessed(p); if (i == 0) { eb->first_page = p; - set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE | - len << 2); + set_page_extent_head(p, len); } else { set_page_private(p, EXTENT_PAGE_PRIVATE); } @@ -2569,8 +2574,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, if (i == 0) { eb->first_page = p; - set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE | - len << 2); + set_page_extent_head(p, len); } else { set_page_private(p, EXTENT_PAGE_PRIVATE); } @@ -2643,6 +2647,11 @@ int clear_extent_buffer_dirty(struct extent_map_tree *tree, for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); lock_page(page); + if (i == 0) + set_page_extent_head(page, eb->len); + else + set_page_private(page, EXTENT_PAGE_PRIVATE); + /* * if we're on the last page or the first page and the * block isn't aligned on a page boundary, do extra checks @@ -2697,9 +2706,12 @@ int set_extent_buffer_dirty(struct extent_map_tree *tree, */ if (i == 0) { lock_page(page); - set_page_private(page, - EXTENT_PAGE_PRIVATE_FIRST_PAGE | - eb->len << 2); + set_page_extent_head(page, eb->len); + } else if (PagePrivate(page) && + page->private != EXTENT_PAGE_PRIVATE) { + lock_page(page); + set_page_extent_mapped(page); + unlock_page(page); } __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); if (i == 0) -- cgit v1.2.3 From b3a0d8d28c607cb2531a68742afc2b967b1f6083 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 14 Jan 2008 13:22:53 -0500 Subject: Btrfs: Remove verbose WARN_ON Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f3a384ed700..d71aed341ab 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1665,8 +1665,6 @@ void set_page_extent_mapped(struct page *page) void set_page_extent_head(struct page *page, unsigned long len) { - WARN_ON(page->private && page->private == EXTENT_PAGE_PRIVATE && - PageDirty(page)); set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); } -- cgit v1.2.3 From f0c5da1446cc500856a5e31c9a0e2a7bdd30e663 Mon Sep 17 00:00:00 2001 From: Yan Date: Tue, 22 Jan 2008 12:46:56 -0500 Subject: Btrfs: Fix for test_range_bit test_range_bit doesn't properly handle the case: there's a hole at the end of the range and there's no other extent_state after the range. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index d71aed341ab..485cf0719b3 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1336,6 +1336,11 @@ int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end, if (start > end) break; node = rb_next(node); + if (!node) { + if (filled) + bitset = 0; + break; + } } read_unlock_irq(&tree->lock); return bitset; -- cgit v1.2.3 From 5f56406aabdf5444d040c5955effc665b1d0dbaf Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 22 Jan 2008 16:47:59 -0500 Subject: Btrfs: Fix hole insertion corner cases There were a few places that could cause duplicate extent insertion, this adjusts the code that creates holes to avoid it. lookup_extent_map is changed to correctly return all of the extents in a range, even when there are none matching at the start of the range. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 485cf0719b3..010a287fbd7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -204,10 +204,12 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, } static struct rb_node *__tree_search(struct rb_root *root, u64 offset, - struct rb_node **prev_ret) + struct rb_node **prev_ret, + struct rb_node **next_ret) { struct rb_node * n = root->rb_node; struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; struct tree_entry *entry; struct tree_entry *prev_entry = NULL; @@ -223,13 +225,25 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, else return n; } - if (!prev_ret) - return NULL; - while(prev && offset > prev_entry->end) { - prev = rb_next(prev); + + if (prev_ret) { + orig_prev = prev; + while(prev && offset > prev_entry->end) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { prev_entry = rb_entry(prev, struct tree_entry, rb_node); + while(prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; } - *prev_ret = prev; return NULL; } @@ -237,7 +251,7 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) { struct rb_node *prev; struct rb_node *ret; - ret = __tree_search(root, offset, &prev); + ret = __tree_search(root, offset, &prev, NULL); if (!ret) return prev; return ret; @@ -248,7 +262,7 @@ static int tree_delete(struct rb_root *root, u64 offset) struct rb_node *node; struct tree_entry *entry; - node = __tree_search(root, offset, NULL); + node = __tree_search(root, offset, NULL, NULL); if (!node) return -ENOENT; entry = rb_entry(node, struct tree_entry, rb_node); @@ -314,9 +328,21 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, { struct extent_map *em; struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; read_lock_irq(&tree->lock); - rb_node = tree_search(&tree->map, start); + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + if (em->start <= end && em->end >= start) + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + if (em->start <= end && em->end >= start) + goto found; + } if (!rb_node) { em = NULL; goto out; @@ -330,6 +356,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, em = NULL; goto out; } +found: atomic_inc(&em->refs); out: read_unlock_irq(&tree->lock); -- cgit v1.2.3 From d1310b2e0cd98eb1348553e69b73827b436dca7b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 24 Jan 2008 16:13:08 -0500 Subject: Btrfs: Split the extent_map code into two parts There is now extent_map for mapping offsets in the file to disk and extent_io for state tracking, IO submission and extent_bufers. The new extent_map code shifts from [start,end] pairs to [start,len], and pushes the locking out into the caller. This allows a few performance optimizations and is easier to use. A number of extent_map usage bugs were fixed, mostly with failing to remove extent_map entries when changing the file. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 3119 ++----------------------------------------------- 1 file changed, 84 insertions(+), 3035 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 010a287fbd7..268ad8facf6 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1,17 +1,10 @@ -#include -#include -#include -#include +#include #include -#include -#include +#include #include #include -#include -#include #include -#include -#include +#include #include "extent_map.h" /* temporary define until extent_map moves out of btrfs */ @@ -21,27 +14,6 @@ struct kmem_cache *btrfs_cache_create(const char *name, size_t size, unsigned long)); static struct kmem_cache *extent_map_cache; -static struct kmem_cache *extent_state_cache; -static struct kmem_cache *extent_buffer_cache; - -static LIST_HEAD(buffers); -static LIST_HEAD(states); - -static spinlock_t state_lock = SPIN_LOCK_UNLOCKED; -#define BUFFER_LRU_MAX 64 - -struct tree_entry { - u64 start; - u64 end; - int in_tree; - struct rb_node rb_node; -}; - -struct extent_page_data { - struct bio *bio; - struct extent_map_tree *tree; - get_extent_t *get_extent; -}; int __init extent_map_init(void) { @@ -50,72 +22,23 @@ int __init extent_map_init(void) NULL); if (!extent_map_cache) return -ENOMEM; - extent_state_cache = btrfs_cache_create("extent_state", - sizeof(struct extent_state), 0, - NULL); - if (!extent_state_cache) - goto free_map_cache; - extent_buffer_cache = btrfs_cache_create("extent_buffers", - sizeof(struct extent_buffer), 0, - NULL); - if (!extent_buffer_cache) - goto free_state_cache; return 0; - -free_state_cache: - kmem_cache_destroy(extent_state_cache); -free_map_cache: - kmem_cache_destroy(extent_map_cache); - return -ENOMEM; } void extent_map_exit(void) { - struct extent_state *state; - - while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, list); - printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs)); - list_del(&state->list); - kmem_cache_free(extent_state_cache, state); - - } - if (extent_map_cache) kmem_cache_destroy(extent_map_cache); - if (extent_state_cache) - kmem_cache_destroy(extent_state_cache); - if (extent_buffer_cache) - kmem_cache_destroy(extent_buffer_cache); } -void extent_map_tree_init(struct extent_map_tree *tree, - struct address_space *mapping, gfp_t mask) +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) { tree->map.rb_node = NULL; - tree->state.rb_node = NULL; - tree->ops = NULL; - tree->dirty_bytes = 0; - rwlock_init(&tree->lock); - spin_lock_init(&tree->lru_lock); - tree->mapping = mapping; - INIT_LIST_HEAD(&tree->buffer_lru); - tree->lru_size = 0; + tree->last = NULL; + spin_lock_init(&tree->lock); } EXPORT_SYMBOL(extent_map_tree_init); -void extent_map_tree_empty_lru(struct extent_map_tree *tree) -{ - struct extent_buffer *eb; - while(!list_empty(&tree->buffer_lru)) { - eb = list_entry(tree->buffer_lru.next, struct extent_buffer, - lru); - list_del_init(&eb->lru); - free_extent_buffer(eb); - } -} -EXPORT_SYMBOL(extent_map_tree_empty_lru); - struct extent_map *alloc_extent_map(gfp_t mask) { struct extent_map *em; @@ -123,6 +46,7 @@ struct extent_map *alloc_extent_map(gfp_t mask) if (!em || IS_ERR(em)) return em; em->in_tree = 0; + em->flags = 0; atomic_set(&em->refs, 1); return em; } @@ -132,6 +56,7 @@ void free_extent_map(struct extent_map *em) { if (!em) return; + WARN_ON(atomic_read(&em->refs) == 0); if (atomic_dec_and_test(&em->refs)) { WARN_ON(em->in_tree); kmem_cache_free(extent_map_cache, em); @@ -139,64 +64,28 @@ void free_extent_map(struct extent_map *em) } EXPORT_SYMBOL(free_extent_map); - -struct extent_state *alloc_extent_state(gfp_t mask) -{ - struct extent_state *state; - unsigned long flags; - - state = kmem_cache_alloc(extent_state_cache, mask); - if (!state || IS_ERR(state)) - return state; - state->state = 0; - state->in_tree = 0; - state->private = 0; - - spin_lock_irqsave(&state_lock, flags); - list_add(&state->list, &states); - spin_unlock_irqrestore(&state_lock, flags); - - atomic_set(&state->refs, 1); - init_waitqueue_head(&state->wq); - return state; -} -EXPORT_SYMBOL(alloc_extent_state); - -void free_extent_state(struct extent_state *state) -{ - unsigned long flags; - if (!state) - return; - if (atomic_dec_and_test(&state->refs)) { - WARN_ON(state->in_tree); - spin_lock_irqsave(&state_lock, flags); - list_del(&state->list); - spin_unlock_irqrestore(&state_lock, flags); - kmem_cache_free(extent_state_cache, state); - } -} -EXPORT_SYMBOL(free_extent_state); - static struct rb_node *tree_insert(struct rb_root *root, u64 offset, struct rb_node *node) { struct rb_node ** p = &root->rb_node; struct rb_node * parent = NULL; - struct tree_entry *entry; + struct extent_map *entry; while(*p) { parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); + entry = rb_entry(parent, struct extent_map, rb_node); + + WARN_ON(!entry->in_tree); if (offset < entry->start) p = &(*p)->rb_left; - else if (offset > entry->end) + else if (offset >= extent_map_end(entry)) p = &(*p)->rb_right; else return parent; } - entry = rb_entry(node, struct tree_entry, rb_node); + entry = rb_entry(node, struct extent_map, rb_node); entry->in_tree = 1; rb_link_node(node, parent, p); rb_insert_color(node, root); @@ -210,17 +99,19 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct rb_node * n = root->rb_node; struct rb_node *prev = NULL; struct rb_node *orig_prev = NULL; - struct tree_entry *entry; - struct tree_entry *prev_entry = NULL; + struct extent_map *entry; + struct extent_map *prev_entry = NULL; while(n) { - entry = rb_entry(n, struct tree_entry, rb_node); + entry = rb_entry(n, struct extent_map, rb_node); prev = n; prev_entry = entry; + WARN_ON(!entry->in_tree); + if (offset < entry->start) n = n->rb_left; - else if (offset > entry->end) + else if (offset >= extent_map_end(entry)) n = n->rb_right; else return n; @@ -228,19 +119,19 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, if (prev_ret) { orig_prev = prev; - while(prev && offset > prev_entry->end) { + while(prev && offset >= extent_map_end(prev_entry)) { prev = rb_next(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); + prev_entry = rb_entry(prev, struct extent_map, rb_node); } *prev_ret = prev; prev = orig_prev; } if (next_ret) { - prev_entry = rb_entry(prev, struct tree_entry, rb_node); + prev_entry = rb_entry(prev, struct extent_map, rb_node); while(prev && offset < prev_entry->start) { prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); + prev_entry = rb_entry(prev, struct extent_map, rb_node); } *next_ret = prev; } @@ -257,22 +148,26 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) return ret; } -static int tree_delete(struct rb_root *root, u64 offset) +static int mergable_maps(struct extent_map *prev, struct extent_map *next) { - struct rb_node *node; - struct tree_entry *entry; - - node = __tree_search(root, offset, NULL, NULL); - if (!node) - return -ENOENT; - entry = rb_entry(node, struct tree_entry, rb_node); - entry->in_tree = 0; - rb_erase(node, root); + if (extent_map_end(prev) == next->start && + prev->flags == next->flags && + prev->bdev == next->bdev && + ((next->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || + (next->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + (next->block_start == EXTENT_MAP_DELALLOC && + prev->block_start == EXTENT_MAP_DELALLOC) || + (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && + next->block_start == extent_map_block_end(prev)))) { + return 1; + } return 0; } /* - * add_extent_mapping tries a simple backward merge with existing + * add_extent_mapping tries a simple forward/backward merge with existing * mappings. The extent_map struct passed in will be inserted into * the tree directly (no copies made, just a reference taken). */ @@ -280,13 +175,12 @@ int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { int ret = 0; - struct extent_map *prev = NULL; + struct extent_map *merge = NULL; struct rb_node *rb; - write_lock_irq(&tree->lock); - rb = tree_insert(&tree->map, em->end, &em->rb_node); + rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { - prev = rb_entry(rb, struct extent_map, rb_node); + merge = rb_entry(rb, struct extent_map, rb_node); ret = -EEXIST; goto out; } @@ -294,53 +188,60 @@ int add_extent_mapping(struct extent_map_tree *tree, if (em->start != 0) { rb = rb_prev(&em->rb_node); if (rb) - prev = rb_entry(rb, struct extent_map, rb_node); - if (prev && prev->end + 1 == em->start && - ((em->block_start == EXTENT_MAP_HOLE && - prev->block_start == EXTENT_MAP_HOLE) || - (em->block_start == EXTENT_MAP_INLINE && - prev->block_start == EXTENT_MAP_INLINE) || - (em->block_start == EXTENT_MAP_DELALLOC && - prev->block_start == EXTENT_MAP_DELALLOC) || - (em->block_start < EXTENT_MAP_DELALLOC - 1 && - em->block_start == prev->block_end + 1))) { - em->start = prev->start; - em->block_start = prev->block_start; - rb_erase(&prev->rb_node, &tree->map); - prev->in_tree = 0; - free_extent_map(prev); + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); } } + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } + tree->last = em; out: - write_unlock_irq(&tree->lock); return ret; } EXPORT_SYMBOL(add_extent_mapping); +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + /* * lookup_extent_mapping returns the first extent_map struct in the - * tree that intersects the [start, end] (inclusive) range. There may + * tree that intersects the [start, len] range. There may * be additional objects in the tree that intersect, so check the object * returned carefully to make sure you don't need additional lookups. */ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 end) + u64 start, u64 len) { struct extent_map *em; struct rb_node *rb_node; - struct rb_node *prev = NULL; - struct rb_node *next = NULL; + struct rb_node *prev = NULL; struct rb_node *next = NULL; u64 end = range_end(start, len); em = tree->last; if (em && end > em->start && start < extent_map_end(em)) goto found; - read_lock_irq(&tree->lock); rb_node = __tree_search(&tree->map, start, &prev, &next); if (!rb_node && prev) { em = rb_entry(prev, struct extent_map, rb_node); - if (em->start <= end && em->end >= start) + if (end > em->start && start < extent_map_end(em)) goto found; } if (!rb_node && next) { em = rb_entry(next, struct extent_map, rb_node); - if (em->start <= end && em->end >= start) + if (end > em->start && start < extent_map_end(em)) goto found; } if (!rb_node) { @@ -352,14 +253,16 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, goto out; } em = rb_entry(rb_node, struct extent_map, rb_node); - if (em->end < start || em->start > end) { - em = NULL; - goto out; - } + if (end > em->start && start < extent_map_end(em)) + goto found; + + em = NULL; + goto out; + found: atomic_inc(&em->refs); + tree->last = em; out: - read_unlock_irq(&tree->lock); return em; } EXPORT_SYMBOL(lookup_extent_mapping); @@ -370,2866 +273,12 @@ EXPORT_SYMBOL(lookup_extent_mapping); */ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { - int ret; + int ret = 0; - write_lock_irq(&tree->lock); - ret = tree_delete(&tree->map, em->end); - write_unlock_irq(&tree->lock); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + if (tree->last == em) + tree->last = NULL; return ret; } EXPORT_SYMBOL(remove_extent_mapping); - -/* - * utility function to look for merge candidates inside a given range. - * Any extents with matching state are merged together into a single - * extent in the tree. Extents with EXTENT_IO in their state field - * are not merged because the end_io handlers need to be able to do - * operations on them without sleeping (or doing allocations/splits). - * - * This should be called with the tree lock held. - */ -static int merge_state(struct extent_map_tree *tree, - struct extent_state *state) -{ - struct extent_state *other; - struct rb_node *other_node; - - if (state->state & EXTENT_IOBITS) - return 0; - - other_node = rb_prev(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->end == state->start - 1 && - other->state == state->state) { - state->start = other->start; - other->in_tree = 0; - rb_erase(&other->rb_node, &tree->state); - free_extent_state(other); - } - } - other_node = rb_next(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->start == state->end + 1 && - other->state == state->state) { - other->start = state->start; - state->in_tree = 0; - rb_erase(&state->rb_node, &tree->state); - free_extent_state(state); - } - } - return 0; -} - -/* - * insert an extent_state struct into the tree. 'bits' are set on the - * struct before it is inserted. - * - * This may return -EEXIST if the extent is already there, in which case the - * state struct is freed. - * - * The tree lock is not taken internally. This is a utility function and - * probably isn't what you want to call (see set/clear_extent_bit). - */ -static int insert_state(struct extent_map_tree *tree, - struct extent_state *state, u64 start, u64 end, - int bits) -{ - struct rb_node *node; - - if (end < start) { - printk("end < start %Lu %Lu\n", end, start); - WARN_ON(1); - } - if (bits & EXTENT_DIRTY) - tree->dirty_bytes += end - start + 1; - state->state |= bits; - state->start = start; - state->end = end; - node = tree_insert(&tree->state, end, &state->rb_node); - if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); - printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end); - free_extent_state(state); - return -EEXIST; - } - merge_state(tree, state); - return 0; -} - -/* - * split a given extent state struct in two, inserting the preallocated - * struct 'prealloc' as the newly created second half. 'split' indicates an - * offset inside 'orig' where it should be split. - * - * Before calling, - * the tree has 'orig' at [orig->start, orig->end]. After calling, there - * are two extent state structs in the tree: - * prealloc: [orig->start, split - 1] - * orig: [ split, orig->end ] - * - * The tree locks are not taken by this function. They need to be held - * by the caller. - */ -static int split_state(struct extent_map_tree *tree, struct extent_state *orig, - struct extent_state *prealloc, u64 split) -{ - struct rb_node *node; - prealloc->start = orig->start; - prealloc->end = split - 1; - prealloc->state = orig->state; - orig->start = split; - - node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); - if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); - printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end); - free_extent_state(prealloc); - return -EEXIST; - } - return 0; -} - -/* - * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1), or - * forcibly remove the state from the tree (delete == 1). - * - * If no bits are set on the state struct after clearing things, the - * struct is freed and removed from the tree - */ -static int clear_state_bit(struct extent_map_tree *tree, - struct extent_state *state, int bits, int wake, - int delete) -{ - int ret = state->state & bits; - - if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - WARN_ON(range > tree->dirty_bytes); - tree->dirty_bytes -= range; - } - state->state &= ~bits; - if (wake) - wake_up(&state->wq); - if (delete || state->state == 0) { - if (state->in_tree) { - rb_erase(&state->rb_node, &tree->state); - state->in_tree = 0; - free_extent_state(state); - } else { - WARN_ON(1); - } - } else { - merge_state(tree, state); - } - return ret; -} - -/* - * clear some bits on a range in the tree. This may require splitting - * or inserting elements in the tree, so the gfp mask is used to - * indicate which allocations or sleeping are allowed. - * - * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove - * the given range from the tree regardless of state (ie for truncate). - * - * the range [start, end] is inclusive. - * - * This takes the tree lock, and returns < 0 on error, > 0 if any of the - * bits were already set, or zero if none of the bits were already set. - */ -int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, gfp_t mask) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - unsigned long flags; - int err; - int set = 0; - -again: - if (!prealloc && (mask & __GFP_WAIT)) { - prealloc = alloc_extent_state(mask); - if (!prealloc) - return -ENOMEM; - } - - write_lock_irqsave(&tree->lock, flags); - /* - * this search will find the extents that end after - * our range starts - */ - node = tree_search(&tree->state, start); - if (!node) - goto out; - state = rb_entry(node, struct extent_state, rb_node); - if (state->start > end) - goto out; - WARN_ON(state->end < start); - - /* - * | ---- desired range ---- | - * | state | or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip - * bits on second half. - * - * If the extent we found extends past our range, we - * just split and search again. It'll get split again - * the next time though. - * - * If the extent we found is inside our range, we clear - * the desired bit on it. - */ - - if (state->start < start) { - err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - start = state->end + 1; - set |= clear_state_bit(tree, state, bits, - wake, delete); - } else { - start = state->start; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and clear the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); - - if (wake) - wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, - wake, delete); - prealloc = NULL; - goto out; - } - - start = state->end + 1; - set |= clear_state_bit(tree, state, bits, wake, delete); - goto search_again; - -out: - write_unlock_irqrestore(&tree->lock, flags); - if (prealloc) - free_extent_state(prealloc); - - return set; - -search_again: - if (start > end) - goto out; - write_unlock_irqrestore(&tree->lock, flags); - if (mask & __GFP_WAIT) - cond_resched(); - goto again; -} -EXPORT_SYMBOL(clear_extent_bit); - -static int wait_on_state(struct extent_map_tree *tree, - struct extent_state *state) -{ - DEFINE_WAIT(wait); - prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - read_unlock_irq(&tree->lock); - schedule(); - read_lock_irq(&tree->lock); - finish_wait(&state->wq, &wait); - return 0; -} - -/* - * waits for one or more bits to clear on a range in the state tree. - * The range [start, end] is inclusive. - * The tree lock is taken by this function - */ -int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits) -{ - struct extent_state *state; - struct rb_node *node; - - read_lock_irq(&tree->lock); -again: - while (1) { - /* - * this search will find all the extents that end after - * our range starts - */ - node = tree_search(&tree->state, start); - if (!node) - break; - - state = rb_entry(node, struct extent_state, rb_node); - - if (state->start > end) - goto out; - - if (state->state & bits) { - start = state->start; - atomic_inc(&state->refs); - wait_on_state(tree, state); - free_extent_state(state); - goto again; - } - start = state->end + 1; - - if (start > end) - break; - - if (need_resched()) { - read_unlock_irq(&tree->lock); - cond_resched(); - read_lock_irq(&tree->lock); - } - } -out: - read_unlock_irq(&tree->lock); - return 0; -} -EXPORT_SYMBOL(wait_extent_bit); - -static void set_state_bits(struct extent_map_tree *tree, - struct extent_state *state, - int bits) -{ - if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - tree->dirty_bytes += range; - } - state->state |= bits; -} - -/* - * set some bits on a range in the tree. This may require allocations - * or sleeping, so the gfp mask is used to indicate what is allowed. - * - * If 'exclusive' == 1, this will fail with -EEXIST if some part of the - * range already has the desired bits set. The start of the existing - * range is returned in failed_start in this case. - * - * [start, end] is inclusive - * This takes the tree lock. - */ -int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits, - int exclusive, u64 *failed_start, gfp_t mask) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - unsigned long flags; - int err = 0; - int set; - u64 last_start; - u64 last_end; -again: - if (!prealloc && (mask & __GFP_WAIT)) { - prealloc = alloc_extent_state(mask); - if (!prealloc) - return -ENOMEM; - } - - write_lock_irqsave(&tree->lock, flags); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(&tree->state, start); - if (!node) { - err = insert_state(tree, prealloc, start, end, bits); - prealloc = NULL; - BUG_ON(err == -EEXIST); - goto out; - } - - state = rb_entry(node, struct extent_state, rb_node); - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - set = state->state & bits; - if (set && exclusive) { - *failed_start = state->start; - err = -EEXIST; - goto out; - } - set_state_bits(tree, state, bits); - start = state->end + 1; - merge_state(tree, state); - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - set = state->state & bits; - if (exclusive && set) { - *failed_start = start; - err = -EEXIST; - goto out; - } - err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, bits); - start = state->end + 1; - merge_state(tree, state); - } else { - start = state->start; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start -1; - err = insert_state(tree, prealloc, start, this_end, - bits); - prealloc = NULL; - BUG_ON(err == -EEXIST); - if (err) - goto out; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - set = state->state & bits; - if (exclusive && set) { - *failed_start = start; - err = -EEXIST; - goto out; - } - err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); - - set_state_bits(tree, prealloc, bits); - merge_state(tree, prealloc); - prealloc = NULL; - goto out; - } - - goto search_again; - -out: - write_unlock_irqrestore(&tree->lock, flags); - if (prealloc) - free_extent_state(prealloc); - - return err; - -search_again: - if (start > end) - goto out; - write_unlock_irqrestore(&tree->lock, flags); - if (mask & __GFP_WAIT) - cond_resched(); - goto again; -} -EXPORT_SYMBOL(set_extent_bit); - -/* wrappers around set/clear extent bit */ -int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, - mask); -} -EXPORT_SYMBOL(set_extent_dirty); - -int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) -{ - return set_extent_bit(tree, start, end, bits, 0, NULL, - mask); -} -EXPORT_SYMBOL(set_extent_bits); - -int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, bits, 0, 0, mask); -} -EXPORT_SYMBOL(clear_extent_bits); - -int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL, - mask); -} -EXPORT_SYMBOL(set_extent_delalloc); - -int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); -} -EXPORT_SYMBOL(clear_extent_dirty); - -int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, - mask); -} -EXPORT_SYMBOL(set_extent_new); - -int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); -} -EXPORT_SYMBOL(clear_extent_new); - -int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, - mask); -} -EXPORT_SYMBOL(set_extent_uptodate); - -int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); -} -EXPORT_SYMBOL(clear_extent_uptodate); - -int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, - 0, NULL, mask); -} -EXPORT_SYMBOL(set_extent_writeback); - -int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); -} -EXPORT_SYMBOL(clear_extent_writeback); - -int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end) -{ - return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); -} -EXPORT_SYMBOL(wait_on_extent_writeback); - -/* - * locks a range in ascending order, waiting for any locked regions - * it hits on the way. [start,end] are inclusive, and this will sleep. - */ -int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask) -{ - int err; - u64 failed_start; - while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, - &failed_start, mask); - if (err == -EEXIST && (mask & __GFP_WAIT)) { - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); - start = failed_start; - } else { - break; - } - WARN_ON(start > end); - } - return err; -} -EXPORT_SYMBOL(lock_extent); - -int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); -} -EXPORT_SYMBOL(unlock_extent); - -/* - * helper function to set pages and extents in the tree dirty - */ -int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - BUG_ON(!page); - __set_page_dirty_nobuffers(page); - page_cache_release(page); - index++; - } - set_extent_dirty(tree, start, end, GFP_NOFS); - return 0; -} -EXPORT_SYMBOL(set_range_dirty); - -/* - * helper function to set both pages and extents in the tree writeback - */ -int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - BUG_ON(!page); - set_page_writeback(page); - page_cache_release(page); - index++; - } - set_extent_writeback(tree, start, end, GFP_NOFS); - return 0; -} -EXPORT_SYMBOL(set_range_writeback); - -int find_first_extent_bit(struct extent_map_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits) -{ - struct rb_node *node; - struct extent_state *state; - int ret = 1; - - read_lock_irq(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(&tree->state, start); - if (!node || IS_ERR(node)) { - goto out; - } - - while(1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) { - *start_ret = state->start; - *end_ret = state->end; - ret = 0; - break; - } - node = rb_next(node); - if (!node) - break; - } -out: - read_unlock_irq(&tree->lock); - return ret; -} -EXPORT_SYMBOL(find_first_extent_bit); - -u64 find_lock_delalloc_range(struct extent_map_tree *tree, - u64 *start, u64 *end, u64 max_bytes) -{ - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - u64 found = 0; - u64 total_bytes = 0; - - write_lock_irq(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ -search_again: - node = tree_search(&tree->state, cur_start); - if (!node || IS_ERR(node)) { - *end = (u64)-1; - goto out; - } - - while(1) { - state = rb_entry(node, struct extent_state, rb_node); - if (found && state->start != cur_start) { - goto out; - } - if (!(state->state & EXTENT_DELALLOC)) { - if (!found) - *end = state->end; - goto out; - } - if (!found) { - struct extent_state *prev_state; - struct rb_node *prev_node = node; - while(1) { - prev_node = rb_prev(prev_node); - if (!prev_node) - break; - prev_state = rb_entry(prev_node, - struct extent_state, - rb_node); - if (!(prev_state->state & EXTENT_DELALLOC)) - break; - state = prev_state; - node = prev_node; - } - } - if (state->state & EXTENT_LOCKED) { - DEFINE_WAIT(wait); - atomic_inc(&state->refs); - prepare_to_wait(&state->wq, &wait, - TASK_UNINTERRUPTIBLE); - write_unlock_irq(&tree->lock); - schedule(); - write_lock_irq(&tree->lock); - finish_wait(&state->wq, &wait); - free_extent_state(state); - goto search_again; - } - state->state |= EXTENT_LOCKED; - if (!found) - *start = state->start; - found++; - *end = state->end; - cur_start = state->end + 1; - node = rb_next(node); - if (!node) - break; - total_bytes += state->end - state->start + 1; - if (total_bytes >= max_bytes) - break; - } -out: - write_unlock_irq(&tree->lock); - return found; -} - -u64 count_range_bits(struct extent_map_tree *tree, - u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits) -{ - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - u64 total_bytes = 0; - int found = 0; - - if (search_end <= cur_start) { - printk("search_end %Lu start %Lu\n", search_end, cur_start); - WARN_ON(1); - return 0; - } - - write_lock_irq(&tree->lock); - if (cur_start == 0 && bits == EXTENT_DIRTY) { - total_bytes = tree->dirty_bytes; - goto out; - } - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(&tree->state, cur_start); - if (!node || IS_ERR(node)) { - goto out; - } - - while(1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->start > search_end) - break; - if (state->end >= cur_start && (state->state & bits)) { - total_bytes += min(search_end, state->end) + 1 - - max(cur_start, state->start); - if (total_bytes >= max_bytes) - break; - if (!found) { - *start = state->start; - found = 1; - } - } - node = rb_next(node); - if (!node) - break; - } -out: - write_unlock_irq(&tree->lock); - return total_bytes; -} -/* - * helper function to lock both pages and extents in the tree. - * pages must be locked first. - */ -int lock_range(struct extent_map_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - int err; - - while (index <= end_index) { - page = grab_cache_page(tree->mapping, index); - if (!page) { - err = -ENOMEM; - goto failed; - } - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto failed; - } - index++; - } - lock_extent(tree, start, end, GFP_NOFS); - return 0; - -failed: - /* - * we failed above in getting the page at 'index', so we undo here - * up to but not including the page at 'index' - */ - end_index = index; - index = start >> PAGE_CACHE_SHIFT; - while (index < end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - return err; -} -EXPORT_SYMBOL(lock_range); - -/* - * helper function to unlock both pages and extents in the tree. - */ -int unlock_range(struct extent_map_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - unlock_extent(tree, start, end, GFP_NOFS); - return 0; -} -EXPORT_SYMBOL(unlock_range); - -int set_state_private(struct extent_map_tree *tree, u64 start, u64 private) -{ - struct rb_node *node; - struct extent_state *state; - int ret = 0; - - write_lock_irq(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(&tree->state, start); - if (!node || IS_ERR(node)) { - ret = -ENOENT; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - ret = -ENOENT; - goto out; - } - state->private = private; -out: - write_unlock_irq(&tree->lock); - return ret; -} - -int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private) -{ - struct rb_node *node; - struct extent_state *state; - int ret = 0; - - read_lock_irq(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(&tree->state, start); - if (!node || IS_ERR(node)) { - ret = -ENOENT; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - ret = -ENOENT; - goto out; - } - *private = state->private; -out: - read_unlock_irq(&tree->lock); - return ret; -} - -/* - * searches a range in the state tree for a given mask. - * If 'filled' == 1, this returns 1 only if ever extent in the tree - * has the bits set. Otherwise, 1 is returned if any bit in the - * range is found set. - */ -int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end, - int bits, int filled) -{ - struct extent_state *state = NULL; - struct rb_node *node; - int bitset = 0; - - read_lock_irq(&tree->lock); - node = tree_search(&tree->state, start); - while (node && start <= end) { - state = rb_entry(node, struct extent_state, rb_node); - - if (filled && state->start > start) { - bitset = 0; - break; - } - - if (state->start > end) - break; - - if (state->state & bits) { - bitset = 1; - if (!filled) - break; - } else if (filled) { - bitset = 0; - break; - } - start = state->end + 1; - if (start > end) - break; - node = rb_next(node); - if (!node) { - if (filled) - bitset = 0; - break; - } - } - read_unlock_irq(&tree->lock); - return bitset; -} -EXPORT_SYMBOL(test_range_bit); - -/* - * helper function to set a given page up to date if all the - * extents in the tree for that page are up to date - */ -static int check_page_uptodate(struct extent_map_tree *tree, - struct page *page) -{ - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) - SetPageUptodate(page); - return 0; -} - -/* - * helper function to unlock a page if all the extents in the tree - * for that page are unlocked - */ -static int check_page_locked(struct extent_map_tree *tree, - struct page *page) -{ - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) - unlock_page(page); - return 0; -} - -/* - * helper function to end page writeback if all the extents - * in the tree for that page are done with writeback - */ -static int check_page_writeback(struct extent_map_tree *tree, - struct page *page) -{ - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) - end_page_writeback(page); - return 0; -} - -/* lots and lots of room for performance fixes in the end_bio funcs */ - -/* - * after a writepage IO is done, we need to: - * clear the uptodate bits on error - * clear the writeback bits in the extent tree for this IO - * end_page_writeback if the page has no more pending IO - * - * Scheduling is not allowed, so the extent state tree is expected - * to have one and only one object corresponding to this IO. - */ -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) -static void end_bio_extent_writepage(struct bio *bio, int err) -#else -static int end_bio_extent_writepage(struct bio *bio, - unsigned int bytes_done, int err) -#endif -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_map_tree *tree = bio->bi_private; - u64 start; - u64 end; - int whole_page; - -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - if (bio->bi_size) - return 1; -#endif - - do { - struct page *page = bvec->bv_page; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (!uptodate) { - clear_extent_uptodate(tree, start, end, GFP_ATOMIC); - ClearPageUptodate(page); - SetPageError(page); - } - clear_extent_writeback(tree, start, end, GFP_ATOMIC); - - if (whole_page) - end_page_writeback(page); - else - check_page_writeback(tree, page); - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, end); - } while (bvec >= bio->bi_io_vec); - - bio_put(bio); -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - return 0; -#endif -} - -/* - * after a readpage IO is done, we need to: - * clear the uptodate bits on error - * set the uptodate bits if things worked - * set the page up to date if all extents in the tree are uptodate - * clear the lock bit in the extent tree - * unlock the page if there are no other extents locked for it - * - * Scheduling is not allowed, so the extent state tree is expected - * to have one and only one object corresponding to this IO. - */ -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) -static void end_bio_extent_readpage(struct bio *bio, int err) -#else -static int end_bio_extent_readpage(struct bio *bio, - unsigned int bytes_done, int err) -#endif -{ - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_map_tree *tree = bio->bi_private; - u64 start; - u64 end; - int whole_page; - int ret; - -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - if (bio->bi_size) - return 1; -#endif - - do { - struct page *page = bvec->bv_page; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { - ret = tree->ops->readpage_end_io_hook(page, start, end); - if (ret) - uptodate = 0; - } - if (uptodate) { - set_extent_uptodate(tree, start, end, GFP_ATOMIC); - if (whole_page) - SetPageUptodate(page); - else - check_page_uptodate(tree, page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - - unlock_extent(tree, start, end, GFP_ATOMIC); - - if (whole_page) - unlock_page(page); - else - check_page_locked(tree, page); - } while (bvec >= bio->bi_io_vec); - - bio_put(bio); -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - return 0; -#endif -} - -/* - * IO done from prepare_write is pretty simple, we just unlock - * the structs in the extent tree when done, and set the uptodate bits - * as appropriate. - */ -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) -static void end_bio_extent_preparewrite(struct bio *bio, int err) -#else -static int end_bio_extent_preparewrite(struct bio *bio, - unsigned int bytes_done, int err) -#endif -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_map_tree *tree = bio->bi_private; - u64 start; - u64 end; - -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - if (bio->bi_size) - return 1; -#endif - - do { - struct page *page = bvec->bv_page; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (uptodate) { - set_extent_uptodate(tree, start, end, GFP_ATOMIC); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - - unlock_extent(tree, start, end, GFP_ATOMIC); - - } while (bvec >= bio->bi_io_vec); - - bio_put(bio); -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - return 0; -#endif -} - -static struct bio * -extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags) -{ - struct bio *bio; - - bio = bio_alloc(gfp_flags, nr_vecs); - - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); - } - - if (bio) { - bio->bi_bdev = bdev; - bio->bi_sector = first_sector; - } - return bio; -} - -static int submit_one_bio(int rw, struct bio *bio) -{ - u64 maxsector; - int ret = 0; - - bio_get(bio); - - maxsector = bio->bi_bdev->bd_inode->i_size >> 9; - if (maxsector < bio->bi_sector) { - printk("sector too large max %Lu got %llu\n", maxsector, - (unsigned long long)bio->bi_sector); - WARN_ON(1); - } - - submit_bio(rw, bio); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - bio_put(bio); - return ret; -} - -static int submit_extent_page(int rw, struct extent_map_tree *tree, - struct page *page, sector_t sector, - size_t size, unsigned long offset, - struct block_device *bdev, - struct bio **bio_ret, - unsigned long max_pages, - bio_end_io_t end_io_func) -{ - int ret = 0; - struct bio *bio; - int nr; - - if (bio_ret && *bio_ret) { - bio = *bio_ret; - if (bio->bi_sector + (bio->bi_size >> 9) != sector || - bio_add_page(bio, page, size, offset) < size) { - ret = submit_one_bio(rw, bio); - bio = NULL; - } else { - return 0; - } - } - nr = min_t(int, max_pages, bio_get_nr_vecs(bdev)); - bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); - if (!bio) { - printk("failed to allocate bio nr %d\n", nr); - } - bio_add_page(bio, page, size, offset); - bio->bi_end_io = end_io_func; - bio->bi_private = tree; - if (bio_ret) { - *bio_ret = bio; - } else { - ret = submit_one_bio(rw, bio); - } - - return ret; -} - -void set_page_extent_mapped(struct page *page) -{ - if (!PagePrivate(page)) { - SetPagePrivate(page); - WARN_ON(!page->mapping->a_ops->invalidatepage); - set_page_private(page, EXTENT_PAGE_PRIVATE); - page_cache_get(page); - } -} - -void set_page_extent_head(struct page *page, unsigned long len) -{ - set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); -} - -/* - * basic readpage implementation. Locked extent state structs are inserted - * into the tree that are removed when the IO is done (by the end_io - * handlers) - */ -static int __extent_read_full_page(struct extent_map_tree *tree, - struct page *page, - get_extent_t *get_extent, - struct bio **bio) -{ - struct inode *inode = page->mapping->host; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 page_end = start + PAGE_CACHE_SIZE - 1; - u64 end; - u64 cur = start; - u64 extent_offset; - u64 last_byte = i_size_read(inode); - u64 block_start; - u64 cur_end; - sector_t sector; - struct extent_map *em; - struct block_device *bdev; - int ret; - int nr = 0; - size_t page_offset = 0; - size_t iosize; - size_t blocksize = inode->i_sb->s_blocksize; - - set_page_extent_mapped(page); - - end = page_end; - lock_extent(tree, start, end, GFP_NOFS); - - while (cur <= end) { - if (cur >= last_byte) { - char *userpage; - iosize = PAGE_CACHE_SIZE - page_offset; - userpage = kmap_atomic(page, KM_USER0); - memset(userpage + page_offset, 0, iosize); - flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); - set_extent_uptodate(tree, cur, cur + iosize - 1, - GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); - break; - } - em = get_extent(inode, page, page_offset, cur, end, 0); - if (IS_ERR(em) || !em) { - SetPageError(page); - unlock_extent(tree, cur, end, GFP_NOFS); - break; - } - - extent_offset = cur - em->start; - BUG_ON(em->end < cur); - BUG_ON(end < cur); - - iosize = min(em->end - cur, end - cur) + 1; - cur_end = min(em->end, end); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); - sector = (em->block_start + extent_offset) >> 9; - bdev = em->bdev; - block_start = em->block_start; - free_extent_map(em); - em = NULL; - - /* we've found a hole, just zero and go on */ - if (block_start == EXTENT_MAP_HOLE) { - char *userpage; - userpage = kmap_atomic(page, KM_USER0); - memset(userpage + page_offset, 0, iosize); - flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); - - set_extent_uptodate(tree, cur, cur + iosize - 1, - GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); - cur = cur + iosize; - page_offset += iosize; - continue; - } - /* the get_extent function already copied into the page */ - if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); - cur = cur + iosize; - page_offset += iosize; - continue; - } - - ret = 0; - if (tree->ops && tree->ops->readpage_io_hook) { - ret = tree->ops->readpage_io_hook(page, cur, - cur + iosize - 1); - } - if (!ret) { - unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1; - nr -= page->index; - ret = submit_extent_page(READ, tree, page, - sector, iosize, page_offset, - bdev, bio, nr, - end_bio_extent_readpage); - } - if (ret) - SetPageError(page); - cur = cur + iosize; - page_offset += iosize; - nr++; - } - if (!nr) { - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); - } - return 0; -} - -int extent_read_full_page(struct extent_map_tree *tree, struct page *page, - get_extent_t *get_extent) -{ - struct bio *bio = NULL; - int ret; - - ret = __extent_read_full_page(tree, page, get_extent, &bio); - if (bio) - submit_one_bio(READ, bio); - return ret; -} -EXPORT_SYMBOL(extent_read_full_page); - -/* - * the writepage semantics are similar to regular writepage. extent - * records are inserted to lock ranges in the tree, and as dirty areas - * are found, they are marked writeback. Then the lock bits are removed - * and the end_io handler clears the writeback ranges - */ -static int __extent_writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct inode *inode = page->mapping->host; - struct extent_page_data *epd = data; - struct extent_map_tree *tree = epd->tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 delalloc_start; - u64 page_end = start + PAGE_CACHE_SIZE - 1; - u64 end; - u64 cur = start; - u64 extent_offset; - u64 last_byte = i_size_read(inode); - u64 block_start; - u64 iosize; - sector_t sector; - struct extent_map *em; - struct block_device *bdev; - int ret; - int nr = 0; - size_t page_offset = 0; - size_t blocksize; - loff_t i_size = i_size_read(inode); - unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; - u64 nr_delalloc; - u64 delalloc_end; - - WARN_ON(!PageLocked(page)); - if (page->index > end_index) { - clear_extent_dirty(tree, start, page_end, GFP_NOFS); - unlock_page(page); - return 0; - } - - if (page->index == end_index) { - char *userpage; - - size_t offset = i_size & (PAGE_CACHE_SIZE - 1); - - userpage = kmap_atomic(page, KM_USER0); - memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset); - flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); - } - - set_page_extent_mapped(page); - - delalloc_start = start; - delalloc_end = 0; - while(delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start, - &delalloc_end, - 128 * 1024 * 1024); - if (nr_delalloc == 0) { - delalloc_start = delalloc_end + 1; - continue; - } - tree->ops->fill_delalloc(inode, delalloc_start, - delalloc_end); - clear_extent_bit(tree, delalloc_start, - delalloc_end, - EXTENT_LOCKED | EXTENT_DELALLOC, - 1, 0, GFP_NOFS); - delalloc_start = delalloc_end + 1; - } - lock_extent(tree, start, page_end, GFP_NOFS); - - end = page_end; - if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { - printk("found delalloc bits after lock_extent\n"); - } - - if (last_byte <= start) { - clear_extent_dirty(tree, start, page_end, GFP_NOFS); - goto done; - } - - set_extent_uptodate(tree, start, page_end, GFP_NOFS); - blocksize = inode->i_sb->s_blocksize; - - while (cur <= end) { - if (cur >= last_byte) { - clear_extent_dirty(tree, cur, page_end, GFP_NOFS); - break; - } - em = epd->get_extent(inode, page, page_offset, cur, end, 1); - if (IS_ERR(em) || !em) { - SetPageError(page); - break; - } - - extent_offset = cur - em->start; - BUG_ON(em->end < cur); - BUG_ON(end < cur); - iosize = min(em->end - cur, end - cur) + 1; - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); - sector = (em->block_start + extent_offset) >> 9; - bdev = em->bdev; - block_start = em->block_start; - free_extent_map(em); - em = NULL; - - if (block_start == EXTENT_MAP_HOLE || - block_start == EXTENT_MAP_INLINE) { - clear_extent_dirty(tree, cur, - cur + iosize - 1, GFP_NOFS); - cur = cur + iosize; - page_offset += iosize; - continue; - } - - /* leave this out until we have a page_mkwrite call */ - if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0)) { - cur = cur + iosize; - page_offset += iosize; - continue; - } - clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); - if (tree->ops && tree->ops->writepage_io_hook) { - ret = tree->ops->writepage_io_hook(page, cur, - cur + iosize - 1); - } else { - ret = 0; - } - if (ret) - SetPageError(page); - else { - unsigned long max_nr = end_index + 1; - set_range_writeback(tree, cur, cur + iosize - 1); - if (!PageWriteback(page)) { - printk("warning page %lu not writeback, " - "cur %llu end %llu\n", page->index, - (unsigned long long)cur, - (unsigned long long)end); - } - - ret = submit_extent_page(WRITE, tree, page, sector, - iosize, page_offset, bdev, - &epd->bio, max_nr, - end_bio_extent_writepage); - if (ret) - SetPageError(page); - } - cur = cur + iosize; - page_offset += iosize; - nr++; - } -done: - if (nr == 0) { - /* make sure the mapping tag for page dirty gets cleared */ - set_page_writeback(page); - end_page_writeback(page); - } - unlock_extent(tree, start, page_end, GFP_NOFS); - unlock_page(page); - return 0; -} - -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - -/* Taken directly from 2.6.23 for 2.6.18 back port */ -typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, - void *data); - -/** - * write_cache_pages - walk the list of dirty pages of the given address space - * and write all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * If a page is already under I/O, write_cache_pages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. - */ -static int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) -{ - struct backing_dev_info *bdi = mapping->backing_dev_info; - int ret = 0; - int done = 0; - struct pagevec pvec; - int nr_pages; - pgoff_t index; - pgoff_t end; /* Inclusive */ - int scanned = 0; - int range_whole = 0; - - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - - pagevec_init(&pvec, 0); - if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ - end = -1; - } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - scanned = 1; - } -retry: - while (!done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - unsigned i; - - scanned = 1; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping - */ - lock_page(page); - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - continue; - } - - if (!wbc->range_cyclic && page->index > end) { - done = 1; - unlock_page(page); - continue; - } - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - continue; - } - - ret = (*writepage)(page, wbc, data); - - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { - unlock_page(page); - ret = 0; - } - if (ret || (--(wbc->nr_to_write) <= 0)) - done = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; - } - } - pagevec_release(&pvec); - cond_resched(); - } - if (!scanned && !done) { - /* - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - scanned = 1; - index = 0; - goto retry; - } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; - return ret; -} -#endif - -int extent_write_full_page(struct extent_map_tree *tree, struct page *page, - get_extent_t *get_extent, - struct writeback_control *wbc) -{ - int ret; - struct address_space *mapping = page->mapping; - struct extent_page_data epd = { - .bio = NULL, - .tree = tree, - .get_extent = get_extent, - }; - struct writeback_control wbc_writepages = { - .bdi = wbc->bdi, - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = 64, - .range_start = page_offset(page) + PAGE_CACHE_SIZE, - .range_end = (loff_t)-1, - }; - - - ret = __extent_writepage(page, wbc, &epd); - - write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd); - if (epd.bio) { - submit_one_bio(WRITE, epd.bio); - } - return ret; -} -EXPORT_SYMBOL(extent_write_full_page); - - -int extent_writepages(struct extent_map_tree *tree, - struct address_space *mapping, - get_extent_t *get_extent, - struct writeback_control *wbc) -{ - int ret = 0; - struct extent_page_data epd = { - .bio = NULL, - .tree = tree, - .get_extent = get_extent, - }; - - ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd); - if (epd.bio) { - submit_one_bio(WRITE, epd.bio); - } - return ret; -} -EXPORT_SYMBOL(extent_writepages); - -int extent_readpages(struct extent_map_tree *tree, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages, - get_extent_t get_extent) -{ - struct bio *bio = NULL; - unsigned page_idx; - struct pagevec pvec; - - pagevec_init(&pvec, 0); - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, lru); - - prefetchw(&page->flags); - list_del(&page->lru); - /* - * what we want to do here is call add_to_page_cache_lru, - * but that isn't exported, so we reproduce it here - */ - if (!add_to_page_cache(page, mapping, - page->index, GFP_KERNEL)) { - - /* open coding of lru_cache_add, also not exported */ - page_cache_get(page); - if (!pagevec_add(&pvec, page)) - __pagevec_lru_add(&pvec); - __extent_read_full_page(tree, page, get_extent, &bio); - } - page_cache_release(page); - } - if (pagevec_count(&pvec)) - __pagevec_lru_add(&pvec); - BUG_ON(!list_empty(pages)); - if (bio) - submit_one_bio(READ, bio); - return 0; -} -EXPORT_SYMBOL(extent_readpages); - -/* - * basic invalidatepage code, this waits on any locked or writeback - * ranges corresponding to the page, and then deletes any extent state - * records from the tree - */ -int extent_invalidatepage(struct extent_map_tree *tree, - struct page *page, unsigned long offset) -{ - u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); - u64 end = start + PAGE_CACHE_SIZE - 1; - size_t blocksize = page->mapping->host->i_sb->s_blocksize; - - start += (offset + blocksize -1) & ~(blocksize - 1); - if (start > end) - return 0; - - lock_extent(tree, start, end, GFP_NOFS); - wait_on_extent_writeback(tree, start, end); - clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, - 1, 1, GFP_NOFS); - return 0; -} -EXPORT_SYMBOL(extent_invalidatepage); - -/* - * simple commit_write call, set_range_dirty is used to mark both - * the pages and the extent records as dirty - */ -int extent_commit_write(struct extent_map_tree *tree, - struct inode *inode, struct page *page, - unsigned from, unsigned to) -{ - loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - - set_page_extent_mapped(page); - set_page_dirty(page); - - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - return 0; -} -EXPORT_SYMBOL(extent_commit_write); - -int extent_prepare_write(struct extent_map_tree *tree, - struct inode *inode, struct page *page, - unsigned from, unsigned to, get_extent_t *get_extent) -{ - u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 page_end = page_start + PAGE_CACHE_SIZE - 1; - u64 block_start; - u64 orig_block_start; - u64 block_end; - u64 cur_end; - struct extent_map *em; - unsigned blocksize = 1 << inode->i_blkbits; - size_t page_offset = 0; - size_t block_off_start; - size_t block_off_end; - int err = 0; - int iocount = 0; - int ret = 0; - int isnew; - - set_page_extent_mapped(page); - - block_start = (page_start + from) & ~((u64)blocksize - 1); - block_end = (page_start + to - 1) | (blocksize - 1); - orig_block_start = block_start; - - lock_extent(tree, page_start, page_end, GFP_NOFS); - while(block_start <= block_end) { - em = get_extent(inode, page, page_offset, block_start, - block_end, 1); - if (IS_ERR(em) || !em) { - goto err; - } - cur_end = min(block_end, em->end); - block_off_start = block_start & (PAGE_CACHE_SIZE - 1); - block_off_end = block_off_start + blocksize; - isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); - - if (!PageUptodate(page) && isnew && - (block_off_end > to || block_off_start < from)) { - void *kaddr; - - kaddr = kmap_atomic(page, KM_USER0); - if (block_off_end > to) - memset(kaddr + to, 0, block_off_end - to); - if (block_off_start < from) - memset(kaddr + block_off_start, 0, - from - block_off_start); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - } - if ((em->block_start != EXTENT_MAP_HOLE && - em->block_start != EXTENT_MAP_INLINE) && - !isnew && !PageUptodate(page) && - (block_off_end > to || block_off_start < from) && - !test_range_bit(tree, block_start, cur_end, - EXTENT_UPTODATE, 1)) { - u64 sector; - u64 extent_offset = block_start - em->start; - size_t iosize; - sector = (em->block_start + extent_offset) >> 9; - iosize = (cur_end - block_start + blocksize) & - ~((u64)blocksize - 1); - /* - * we've already got the extent locked, but we - * need to split the state such that our end_bio - * handler can clear the lock. - */ - set_extent_bit(tree, block_start, - block_start + iosize - 1, - EXTENT_LOCKED, 0, NULL, GFP_NOFS); - ret = submit_extent_page(READ, tree, page, - sector, iosize, page_offset, em->bdev, - NULL, 1, - end_bio_extent_preparewrite); - iocount++; - block_start = block_start + iosize; - } else { - set_extent_uptodate(tree, block_start, cur_end, - GFP_NOFS); - unlock_extent(tree, block_start, cur_end, GFP_NOFS); - block_start = cur_end + 1; - } - page_offset = block_start & (PAGE_CACHE_SIZE - 1); - free_extent_map(em); - } - if (iocount) { - wait_extent_bit(tree, orig_block_start, - block_end, EXTENT_LOCKED); - } - check_page_uptodate(tree, page); -err: - /* FIXME, zero out newly allocated blocks on error */ - return err; -} -EXPORT_SYMBOL(extent_prepare_write); - -/* - * a helper for releasepage. As long as there are no locked extents - * in the range corresponding to the page, both state records and extent - * map records are removed - */ -int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page) -{ - struct extent_map *em; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - u64 orig_start = start; - int ret = 1; - - while (start <= end) { - em = lookup_extent_mapping(tree, start, end); - if (!em || IS_ERR(em)) - break; - if (!test_range_bit(tree, em->start, em->end, - EXTENT_LOCKED, 0)) { - remove_extent_mapping(tree, em); - /* once for the rb tree */ - free_extent_map(em); - } - start = em->end + 1; - /* once for us */ - free_extent_map(em); - } - if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0)) - ret = 0; - else - clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE, - 1, 1, GFP_NOFS); - return ret; -} -EXPORT_SYMBOL(try_release_extent_mapping); - -sector_t extent_bmap(struct address_space *mapping, sector_t iblock, - get_extent_t *get_extent) -{ - struct inode *inode = mapping->host; - u64 start = iblock << inode->i_blkbits; - u64 end = start + (1 << inode->i_blkbits) - 1; - sector_t sector = 0; - struct extent_map *em; - - em = get_extent(inode, NULL, 0, start, end, 0); - if (!em || IS_ERR(em)) - return 0; - - if (em->block_start == EXTENT_MAP_INLINE || - em->block_start == EXTENT_MAP_HOLE) - goto out; - - sector = (em->block_start + start - em->start) >> inode->i_blkbits; -out: - free_extent_map(em); - return sector; -} - -static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb) -{ - if (list_empty(&eb->lru)) { - extent_buffer_get(eb); - list_add(&eb->lru, &tree->buffer_lru); - tree->lru_size++; - if (tree->lru_size >= BUFFER_LRU_MAX) { - struct extent_buffer *rm; - rm = list_entry(tree->buffer_lru.prev, - struct extent_buffer, lru); - tree->lru_size--; - list_del_init(&rm->lru); - free_extent_buffer(rm); - } - } else - list_move(&eb->lru, &tree->buffer_lru); - return 0; -} -static struct extent_buffer *find_lru(struct extent_map_tree *tree, - u64 start, unsigned long len) -{ - struct list_head *lru = &tree->buffer_lru; - struct list_head *cur = lru->next; - struct extent_buffer *eb; - - if (list_empty(lru)) - return NULL; - - do { - eb = list_entry(cur, struct extent_buffer, lru); - if (eb->start == start && eb->len == len) { - extent_buffer_get(eb); - return eb; - } - cur = cur->next; - } while (cur != lru); - return NULL; -} - -static inline unsigned long num_extent_pages(u64 start, u64 len) -{ - return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT); -} - -static inline struct page *extent_buffer_page(struct extent_buffer *eb, - unsigned long i) -{ - struct page *p; - struct address_space *mapping; - - if (i == 0) - return eb->first_page; - i += eb->start >> PAGE_CACHE_SHIFT; - mapping = eb->first_page->mapping; - read_lock_irq(&mapping->tree_lock); - p = radix_tree_lookup(&mapping->page_tree, i); - read_unlock_irq(&mapping->tree_lock); - return p; -} - -static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree, - u64 start, - unsigned long len, - gfp_t mask) -{ - struct extent_buffer *eb = NULL; - - spin_lock(&tree->lru_lock); - eb = find_lru(tree, start, len); - spin_unlock(&tree->lru_lock); - if (eb) { - return eb; - } - - eb = kmem_cache_zalloc(extent_buffer_cache, mask); - INIT_LIST_HEAD(&eb->lru); - eb->start = start; - eb->len = len; - atomic_set(&eb->refs, 1); - - return eb; -} - -static void __free_extent_buffer(struct extent_buffer *eb) -{ - kmem_cache_free(extent_buffer_cache, eb); -} - -struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, - u64 start, unsigned long len, - struct page *page0, - gfp_t mask) -{ - unsigned long num_pages = num_extent_pages(start, len); - unsigned long i; - unsigned long index = start >> PAGE_CACHE_SHIFT; - struct extent_buffer *eb; - struct page *p; - struct address_space *mapping = tree->mapping; - int uptodate = 1; - - eb = __alloc_extent_buffer(tree, start, len, mask); - if (!eb || IS_ERR(eb)) - return NULL; - - if (eb->flags & EXTENT_BUFFER_FILLED) - goto lru_add; - - if (page0) { - eb->first_page = page0; - i = 1; - index++; - page_cache_get(page0); - mark_page_accessed(page0); - set_page_extent_mapped(page0); - WARN_ON(!PageUptodate(page0)); - set_page_extent_head(page0, len); - } else { - i = 0; - } - for (; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); - if (!p) { - WARN_ON(1); - goto fail; - } - set_page_extent_mapped(p); - mark_page_accessed(p); - if (i == 0) { - eb->first_page = p; - set_page_extent_head(p, len); - } else { - set_page_private(p, EXTENT_PAGE_PRIVATE); - } - if (!PageUptodate(p)) - uptodate = 0; - unlock_page(p); - } - if (uptodate) - eb->flags |= EXTENT_UPTODATE; - eb->flags |= EXTENT_BUFFER_FILLED; - -lru_add: - spin_lock(&tree->lru_lock); - add_lru(tree, eb); - spin_unlock(&tree->lru_lock); - return eb; - -fail: - spin_lock(&tree->lru_lock); - list_del_init(&eb->lru); - spin_unlock(&tree->lru_lock); - if (!atomic_dec_and_test(&eb->refs)) - return NULL; - for (index = 1; index < i; index++) { - page_cache_release(extent_buffer_page(eb, index)); - } - if (i > 0) - page_cache_release(extent_buffer_page(eb, 0)); - __free_extent_buffer(eb); - return NULL; -} -EXPORT_SYMBOL(alloc_extent_buffer); - -struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, - u64 start, unsigned long len, - gfp_t mask) -{ - unsigned long num_pages = num_extent_pages(start, len); - unsigned long i; - unsigned long index = start >> PAGE_CACHE_SHIFT; - struct extent_buffer *eb; - struct page *p; - struct address_space *mapping = tree->mapping; - int uptodate = 1; - - eb = __alloc_extent_buffer(tree, start, len, mask); - if (!eb || IS_ERR(eb)) - return NULL; - - if (eb->flags & EXTENT_BUFFER_FILLED) - goto lru_add; - - for (i = 0; i < num_pages; i++, index++) { - p = find_lock_page(mapping, index); - if (!p) { - goto fail; - } - set_page_extent_mapped(p); - mark_page_accessed(p); - - if (i == 0) { - eb->first_page = p; - set_page_extent_head(p, len); - } else { - set_page_private(p, EXTENT_PAGE_PRIVATE); - } - - if (!PageUptodate(p)) - uptodate = 0; - unlock_page(p); - } - if (uptodate) - eb->flags |= EXTENT_UPTODATE; - eb->flags |= EXTENT_BUFFER_FILLED; - -lru_add: - spin_lock(&tree->lru_lock); - add_lru(tree, eb); - spin_unlock(&tree->lru_lock); - return eb; -fail: - spin_lock(&tree->lru_lock); - list_del_init(&eb->lru); - spin_unlock(&tree->lru_lock); - if (!atomic_dec_and_test(&eb->refs)) - return NULL; - for (index = 1; index < i; index++) { - page_cache_release(extent_buffer_page(eb, index)); - } - if (i > 0) - page_cache_release(extent_buffer_page(eb, 0)); - __free_extent_buffer(eb); - return NULL; -} -EXPORT_SYMBOL(find_extent_buffer); - -void free_extent_buffer(struct extent_buffer *eb) -{ - unsigned long i; - unsigned long num_pages; - - if (!eb) - return; - - if (!atomic_dec_and_test(&eb->refs)) - return; - - WARN_ON(!list_empty(&eb->lru)); - num_pages = num_extent_pages(eb->start, eb->len); - - for (i = 1; i < num_pages; i++) { - page_cache_release(extent_buffer_page(eb, i)); - } - page_cache_release(extent_buffer_page(eb, 0)); - __free_extent_buffer(eb); -} -EXPORT_SYMBOL(free_extent_buffer); - -int clear_extent_buffer_dirty(struct extent_map_tree *tree, - struct extent_buffer *eb) -{ - int set; - unsigned long i; - unsigned long num_pages; - struct page *page; - - u64 start = eb->start; - u64 end = start + eb->len - 1; - - set = clear_extent_dirty(tree, start, end, GFP_NOFS); - num_pages = num_extent_pages(eb->start, eb->len); - - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - lock_page(page); - if (i == 0) - set_page_extent_head(page, eb->len); - else - set_page_private(page, EXTENT_PAGE_PRIVATE); - - /* - * if we're on the last page or the first page and the - * block isn't aligned on a page boundary, do extra checks - * to make sure we don't clean page that is partially dirty - */ - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - start = (u64)page->index << PAGE_CACHE_SHIFT; - end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, - EXTENT_DIRTY, 0)) { - unlock_page(page); - continue; - } - } - clear_page_dirty_for_io(page); - write_lock_irq(&page->mapping->tree_lock); - if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - } - write_unlock_irq(&page->mapping->tree_lock); - unlock_page(page); - } - return 0; -} -EXPORT_SYMBOL(clear_extent_buffer_dirty); - -int wait_on_extent_buffer_writeback(struct extent_map_tree *tree, - struct extent_buffer *eb) -{ - return wait_on_extent_writeback(tree, eb->start, - eb->start + eb->len - 1); -} -EXPORT_SYMBOL(wait_on_extent_buffer_writeback); - -int set_extent_buffer_dirty(struct extent_map_tree *tree, - struct extent_buffer *eb) -{ - unsigned long i; - unsigned long num_pages; - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - struct page *page = extent_buffer_page(eb, i); - /* writepage may need to do something special for the - * first page, we have to make sure page->private is - * properly set. releasepage may drop page->private - * on us if the page isn't already dirty. - */ - if (i == 0) { - lock_page(page); - set_page_extent_head(page, eb->len); - } else if (PagePrivate(page) && - page->private != EXTENT_PAGE_PRIVATE) { - lock_page(page); - set_page_extent_mapped(page); - unlock_page(page); - } - __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); - if (i == 0) - unlock_page(page); - } - return set_extent_dirty(tree, eb->start, - eb->start + eb->len - 1, GFP_NOFS); -} -EXPORT_SYMBOL(set_extent_buffer_dirty); - -int set_extent_buffer_uptodate(struct extent_map_tree *tree, - struct extent_buffer *eb) -{ - unsigned long i; - struct page *page; - unsigned long num_pages; - - num_pages = num_extent_pages(eb->start, eb->len); - - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - GFP_NOFS); - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - check_page_uptodate(tree, page); - continue; - } - SetPageUptodate(page); - } - return 0; -} -EXPORT_SYMBOL(set_extent_buffer_uptodate); - -int extent_buffer_uptodate(struct extent_map_tree *tree, - struct extent_buffer *eb) -{ - if (eb->flags & EXTENT_UPTODATE) - return 1; - return test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1); -} -EXPORT_SYMBOL(extent_buffer_uptodate); - -int read_extent_buffer_pages(struct extent_map_tree *tree, - struct extent_buffer *eb, - u64 start, - int wait) -{ - unsigned long i; - unsigned long start_i; - struct page *page; - int err; - int ret = 0; - unsigned long num_pages; - - if (eb->flags & EXTENT_UPTODATE) - return 0; - - if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1)) { - return 0; - } - - if (start) { - WARN_ON(start < eb->start); - start_i = (start >> PAGE_CACHE_SHIFT) - - (eb->start >> PAGE_CACHE_SHIFT); - } else { - start_i = 0; - } - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (PageUptodate(page)) { - continue; - } - if (!wait) { - if (TestSetPageLocked(page)) { - continue; - } - } else { - lock_page(page); - } - if (!PageUptodate(page)) { - err = page->mapping->a_ops->readpage(NULL, page); - if (err) { - ret = err; - } - } else { - unlock_page(page); - } - } - - if (ret || !wait) { - return ret; - } - - for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - wait_on_page_locked(page); - if (!PageUptodate(page)) { - ret = -EIO; - } - } - if (!ret) - eb->flags |= EXTENT_UPTODATE; - return ret; -} -EXPORT_SYMBOL(read_extent_buffer_pages); - -void read_extent_buffer(struct extent_buffer *eb, void *dstv, - unsigned long start, - unsigned long len) -{ - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - char *dst = (char *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - unsigned long num_pages = num_extent_pages(eb->start, eb->len); - - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); - - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - - while(len > 0) { - page = extent_buffer_page(eb, i); - if (!PageUptodate(page)) { - printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len); - WARN_ON(1); - } - WARN_ON(!PageUptodate(page)); - - cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER1); - memcpy(dst, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER1); - - dst += cur; - len -= cur; - offset = 0; - i++; - } -} -EXPORT_SYMBOL(read_extent_buffer); - -int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km) -{ - size_t offset = start & (PAGE_CACHE_SIZE - 1); - char *kaddr; - struct page *p; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - unsigned long end_i = (start_offset + start + min_len - 1) >> - PAGE_CACHE_SHIFT; - - if (i != end_i) - return -EINVAL; - - if (i == 0) { - offset = start_offset; - *map_start = 0; - } else { - offset = 0; - *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; - } - if (start + min_len > eb->len) { -printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len); - WARN_ON(1); - } - - p = extent_buffer_page(eb, i); - WARN_ON(!PageUptodate(p)); - kaddr = kmap_atomic(p, km); - *token = kaddr; - *map = kaddr + offset; - *map_len = PAGE_CACHE_SIZE - offset; - return 0; -} -EXPORT_SYMBOL(map_private_extent_buffer); - -int map_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, - char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km) -{ - int err; - int save = 0; - if (eb->map_token) { - unmap_extent_buffer(eb, eb->map_token, km); - eb->map_token = NULL; - save = 1; - } - err = map_private_extent_buffer(eb, start, min_len, token, map, - map_start, map_len, km); - if (!err && save) { - eb->map_token = *token; - eb->kaddr = *map; - eb->map_start = *map_start; - eb->map_len = *map_len; - } - return err; -} -EXPORT_SYMBOL(map_extent_buffer); - -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) -{ - kunmap_atomic(token, km); -} -EXPORT_SYMBOL(unmap_extent_buffer); - -int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, - unsigned long start, - unsigned long len) -{ - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - char *ptr = (char *)ptrv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - int ret = 0; - - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); - - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - - while(len > 0) { - page = extent_buffer_page(eb, i); - WARN_ON(!PageUptodate(page)); - - cur = min(len, (PAGE_CACHE_SIZE - offset)); - - kaddr = kmap_atomic(page, KM_USER0); - ret = memcmp(ptr, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER0); - if (ret) - break; - - ptr += cur; - len -= cur; - offset = 0; - i++; - } - return ret; -} -EXPORT_SYMBOL(memcmp_extent_buffer); - -void write_extent_buffer(struct extent_buffer *eb, const void *srcv, - unsigned long start, unsigned long len) -{ - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - char *src = (char *)srcv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); - - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - - while(len > 0) { - page = extent_buffer_page(eb, i); - WARN_ON(!PageUptodate(page)); - - cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER1); - memcpy(kaddr + offset, src, cur); - kunmap_atomic(kaddr, KM_USER1); - - src += cur; - len -= cur; - offset = 0; - i++; - } -} -EXPORT_SYMBOL(write_extent_buffer); - -void memset_extent_buffer(struct extent_buffer *eb, char c, - unsigned long start, unsigned long len) -{ - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); - - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - - while(len > 0) { - page = extent_buffer_page(eb, i); - WARN_ON(!PageUptodate(page)); - - cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, c, cur); - kunmap_atomic(kaddr, KM_USER0); - - len -= cur; - offset = 0; - i++; - } -} -EXPORT_SYMBOL(memset_extent_buffer); - -void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, - unsigned long dst_offset, unsigned long src_offset, - unsigned long len) -{ - u64 dst_len = dst->len; - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; - - WARN_ON(src->len != dst_len); - - offset = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - - while(len > 0) { - page = extent_buffer_page(dst, i); - WARN_ON(!PageUptodate(page)); - - cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); - - kaddr = kmap_atomic(page, KM_USER0); - read_extent_buffer(src, kaddr + offset, src_offset, cur); - kunmap_atomic(kaddr, KM_USER0); - - src_offset += cur; - len -= cur; - offset = 0; - i++; - } -} -EXPORT_SYMBOL(copy_extent_buffer); - -static void move_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); - if (dst_page == src_page) { - memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); - } else { - char *src_kaddr = kmap_atomic(src_page, KM_USER1); - char *p = dst_kaddr + dst_off + len; - char *s = src_kaddr + src_off + len; - - while (len--) - *--p = *--s; - - kunmap_atomic(src_kaddr, KM_USER1); - } - kunmap_atomic(dst_kaddr, KM_USER0); -} - -static void copy_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); - char *src_kaddr; - - if (dst_page != src_page) - src_kaddr = kmap_atomic(src_page, KM_USER1); - else - src_kaddr = dst_kaddr; - - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); - kunmap_atomic(dst_kaddr, KM_USER0); - if (dst_page != src_page) - kunmap_atomic(src_kaddr, KM_USER1); -} - -void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len) -{ - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long dst_i; - unsigned long src_i; - - if (src_offset + len > dst->len) { - printk("memmove bogus src_offset %lu move len %lu len %lu\n", - src_offset, len, dst->len); - BUG_ON(1); - } - if (dst_offset + len > dst->len) { - printk("memmove bogus dst_offset %lu move len %lu len %lu\n", - dst_offset, len, dst->len); - BUG_ON(1); - } - - while(len > 0) { - dst_off_in_page = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - src_off_in_page = (start_offset + src_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - - dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; - src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; - - cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - - src_off_in_page)); - cur = min_t(unsigned long, cur, - (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); - - copy_pages(extent_buffer_page(dst, dst_i), - extent_buffer_page(dst, src_i), - dst_off_in_page, src_off_in_page, cur); - - src_offset += cur; - dst_offset += cur; - len -= cur; - } -} -EXPORT_SYMBOL(memcpy_extent_buffer); - -void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len) -{ - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; - unsigned long dst_end = dst_offset + len - 1; - unsigned long src_end = src_offset + len - 1; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long dst_i; - unsigned long src_i; - - if (src_offset + len > dst->len) { - printk("memmove bogus src_offset %lu move len %lu len %lu\n", - src_offset, len, dst->len); - BUG_ON(1); - } - if (dst_offset + len > dst->len) { - printk("memmove bogus dst_offset %lu move len %lu len %lu\n", - dst_offset, len, dst->len); - BUG_ON(1); - } - if (dst_offset < src_offset) { - memcpy_extent_buffer(dst, dst_offset, src_offset, len); - return; - } - while(len > 0) { - dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; - src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; - - dst_off_in_page = (start_offset + dst_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - src_off_in_page = (start_offset + src_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - - cur = min_t(unsigned long, len, src_off_in_page + 1); - cur = min(cur, dst_off_in_page + 1); - move_pages(extent_buffer_page(dst, dst_i), - extent_buffer_page(dst, src_i), - dst_off_in_page - cur + 1, - src_off_in_page - cur + 1, cur); - - dst_end -= cur; - src_end -= cur; - len -= cur; - } -} -EXPORT_SYMBOL(memmove_extent_buffer); -- cgit v1.2.3 From 306929f364b993581c91596230807fa1c022268a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Jun 2008 10:21:04 -0400 Subject: btrfs: fix strange indentation in lookup_extent_mapping Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 268ad8facf6..ba46f7911d9 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -231,7 +231,13 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, { struct extent_map *em; struct rb_node *rb_node; - struct rb_node *prev = NULL; struct rb_node *next = NULL; u64 end = range_end(start, len); em = tree->last; if (em && end > em->start && start < extent_map_end(em)) goto found; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + u64 end = range_end(start, len); + + em = tree->last; + if (em && end > em->start && start < extent_map_end(em)) + goto found; rb_node = __tree_search(&tree->map, start, &prev, &next); if (!rb_node && prev) { -- cgit v1.2.3 From 9d2423c5c3fbb0f110ac0b6cdc5a8e4d64729483 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jun 2008 21:52:17 -0400 Subject: Btrfs: kerneldoc comments for extent_map.c Add kerneldoc comments for all exported functions. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 61 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index ba46f7911d9..f5a04eb9a2a 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -31,6 +31,14 @@ void extent_map_exit(void) kmem_cache_destroy(extent_map_cache); } +/** + * extent_map_tree_init - initialize extent map tree + * @tree: tree to initialize + * @mask: flags for memory allocations during tree operations + * + * Initialize the extent tree @tree. Should be called for each new inode + * or other user of the extent_map interface. + */ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) { tree->map.rb_node = NULL; @@ -39,6 +47,14 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) } EXPORT_SYMBOL(extent_map_tree_init); +/** + * alloc_extent_map - allocate new extent map structure + * @mask: memory allocation flags + * + * Allocate a new extent_map structure. The new structure is + * returned with a reference count of one and needs to be + * freed using free_extent_map() + */ struct extent_map *alloc_extent_map(gfp_t mask) { struct extent_map *em; @@ -52,6 +68,13 @@ struct extent_map *alloc_extent_map(gfp_t mask) } EXPORT_SYMBOL(alloc_extent_map); +/** + * free_extent_map - drop reference count of an extent_map + * @em: extent map beeing releasead + * + * Drops the reference out on @em by one and free the structure + * if the reference count hits zero. + */ void free_extent_map(struct extent_map *em) { if (!em) @@ -166,10 +189,15 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) return 0; } -/* - * add_extent_mapping tries a simple forward/backward merge with existing - * mappings. The extent_map struct passed in will be inserted into - * the tree directly (no copies made, just a reference taken). +/** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in + * @em: map to insert + * + * Insert @em into @tree or perform a simple forward/backward merge with + * existing mappings. The extent_map struct passed in will be inserted + * into the tree directly, with an additional reference taken, or a + * reference dropped if the merge attempt was sucessfull. */ int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) @@ -220,11 +248,16 @@ static u64 range_end(u64 start, u64 len) return start + len; } -/* - * lookup_extent_mapping returns the first extent_map struct in the - * tree that intersects the [start, len] range. There may - * be additional objects in the tree that intersect, so check the object - * returned carefully to make sure you don't need additional lookups. +/** + * lookup_extent_mapping - lookup extent_map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. There may be additional objects in the tree that + * intersect, so check the object returned carefully to make sure that no + * additional lookups are needed. */ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) @@ -273,9 +306,13 @@ out: } EXPORT_SYMBOL(lookup_extent_mapping); -/* - * removes an extent_map struct from the tree. No reference counts are - * dropped, and no checks are done to see if the range is in use +/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed + * + * Removes @em from @tree. No reference counts are dropped, and no checks + * are done to see if the range is in use */ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { -- cgit v1.2.3 From e6dcd2dc9c489108648e2ed543315dd134d50a9a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 17 Jul 2008 12:53:50 -0400 Subject: Btrfs: New data=ordered implementation The old data=ordered code would force commit to wait until all the data extents from the transaction were fully on disk. This introduced large latencies into the commit and stalled new writers in the transaction for a long time. The new code changes the way data allocations and extents work: * When delayed allocation is filled, data extents are reserved, and the extent bit EXTENT_ORDERED is set on the entire range of the extent. A struct btrfs_ordered_extent is allocated an inserted into a per-inode rbtree to track the pending extents. * As each page is written EXTENT_ORDERED is cleared on the bytes corresponding to that page. * When all of the bytes corresponding to a single struct btrfs_ordered_extent are written, The previously reserved extent is inserted into the FS btree and into the extent allocation trees. The checksums for the file data are also updated. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f5a04eb9a2a..81123277c2b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -206,10 +206,11 @@ int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *merge = NULL; struct rb_node *rb; + BUG_ON(spin_trylock(&tree->lock)); rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { - merge = rb_entry(rb, struct extent_map, rb_node); ret = -EEXIST; + free_extent_map(merge); goto out; } atomic_inc(&em->refs); @@ -268,6 +269,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, struct rb_node *next = NULL; u64 end = range_end(start, len); + BUG_ON(spin_trylock(&tree->lock)); em = tree->last; if (em && end > em->start && start < extent_map_end(em)) goto found; @@ -318,6 +320,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { int ret = 0; + BUG_ON(spin_trylock(&tree->lock)); rb_erase(&em->rb_node, &tree->map); em->in_tree = 0; if (tree->last == em) -- cgit v1.2.3 From 7f3c74fb831fa19bafe087e817c0a5ff3883f1ea Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 18 Jul 2008 12:01:11 -0400 Subject: Btrfs: Keep extent mappings in ram until pending ordered extents are done It was possible for stale mappings from disk to be used instead of the new pending ordered extent. This adds a flag to the extent map struct to keep it pinned until the pending ordered extent is actually on disk. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 81123277c2b..71b1ac15535 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -173,6 +173,9 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) static int mergable_maps(struct extent_map *prev, struct extent_map *next) { + if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -320,6 +323,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { int ret = 0; + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); BUG_ON(spin_trylock(&tree->lock)); rb_erase(&em->rb_node, &tree->map); em->in_tree = 0; -- cgit v1.2.3 From f421950f86bf96a11fef932e167ab2e70d4c43a0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 22 Jul 2008 11:18:09 -0400 Subject: Btrfs: Fix some data=ordered related data corruptions Stress testing was showing data checksum errors, most of which were caused by a lookup bug in the extent_map tree. The tree was caching the last pointer returned, and searches would check the last pointer first. But, search callers also expect the search to return the very first matching extent in the range, which wasn't always true with the last pointer usage. For now, the code to cache the last return value is just removed. It is easy to fix, but I think lookups are rare enough that it isn't required anymore. This commit also replaces do_sync_mapping_range with a local copy of the related functions. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 71b1ac15535..8a502ee2f23 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -42,7 +42,6 @@ void extent_map_exit(void) void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) { tree->map.rb_node = NULL; - tree->last = NULL; spin_lock_init(&tree->lock); } EXPORT_SYMBOL(extent_map_tree_init); @@ -239,7 +238,6 @@ int add_extent_mapping(struct extent_map_tree *tree, merge->in_tree = 0; free_extent_map(merge); } - tree->last = em; out: return ret; } @@ -273,10 +271,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 end = range_end(start, len); BUG_ON(spin_trylock(&tree->lock)); - em = tree->last; - if (em && end > em->start && start < extent_map_end(em)) - goto found; - rb_node = __tree_search(&tree->map, start, &prev, &next); if (!rb_node && prev) { em = rb_entry(prev, struct extent_map, rb_node); @@ -305,7 +299,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, found: atomic_inc(&em->refs); - tree->last = em; out: return em; } @@ -327,8 +320,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) BUG_ON(spin_trylock(&tree->lock)); rb_erase(&em->rb_node, &tree->map); em->in_tree = 0; - if (tree->last == em) - tree->last = NULL; return ret; } EXPORT_SYMBOL(remove_extent_mapping); -- cgit v1.2.3 From 64f26f745084872b916cd1bef6054e21b15c5784 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 24 Jul 2008 10:09:43 -0400 Subject: Btrfs: Use assert_spin_locked instead of spin_trylock On UP systems spin_trylock always succeeds Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 8a502ee2f23..954b047639a 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -208,7 +208,7 @@ int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *merge = NULL; struct rb_node *rb; - BUG_ON(spin_trylock(&tree->lock)); + assert_spin_locked(&tree->lock); rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { ret = -EEXIST; @@ -270,7 +270,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, struct rb_node *next = NULL; u64 end = range_end(start, len); - BUG_ON(spin_trylock(&tree->lock)); + assert_spin_locked(&tree->lock); rb_node = __tree_search(&tree->map, start, &prev, &next); if (!rb_node && prev) { em = rb_entry(prev, struct extent_map, rb_node); @@ -317,7 +317,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) int ret = 0; WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - BUG_ON(spin_trylock(&tree->lock)); + assert_spin_locked(&tree->lock); rb_erase(&em->rb_node, &tree->map); em->in_tree = 0; return ret; -- cgit v1.2.3 From 7c2fe32a238eb12422beca5cbd5194a594baa559 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 20 Aug 2008 08:51:50 -0400 Subject: Btrfs: Fix add_extent_mapping to check for duplicates across the whole range add_extent_mapping was allowing the insertion of overlapping extents. This never used to happen because it only inserted the extents from disk and those were never overlapping. But, with the data=ordered code, the disk and memory representations of the file are not the same. add_extent_mapping needs to ensure a new extent does not overlap before it inserts. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 954b047639a..78ced11d18c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -207,7 +207,14 @@ int add_extent_mapping(struct extent_map_tree *tree, int ret = 0; struct extent_map *merge = NULL; struct rb_node *rb; + struct extent_map *exist; + exist = lookup_extent_mapping(tree, em->start, em->len); + if (exist) { + free_extent_map(exist); + ret = -EEXIST; + goto out; + } assert_spin_locked(&tree->lock); rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { -- cgit v1.2.3 From d352ac68148b69937d39ca5d48bcc4478e118dbf Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 29 Sep 2008 15:18:18 -0400 Subject: Btrfs: add and improve comments This improves the comments at the top of many functions. It didn't dive into the guts of functions because I was trying to avoid merging problems with the new allocator and back reference work. extent-tree.c and volumes.c were both skipped, and there is definitely more work todo in cleaning and commenting the code. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 78ced11d18c..74b2a29880d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -114,6 +114,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, return NULL; } +/* + * search through the tree for an extent_map with a given offset. If + * it can't be found, try to find some neighboring extents + */ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct rb_node **prev_ret, struct rb_node **next_ret) @@ -160,6 +164,10 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } +/* + * look for an offset in the tree, and if it can't be found, return + * the first offset we can find smaller than 'offset'. + */ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) { struct rb_node *prev; @@ -170,6 +178,7 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) return ret; } +/* check to see if two extent_map structs are adjacent and safe to merge */ static int mergable_maps(struct extent_map *prev, struct extent_map *next) { if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) @@ -250,6 +259,7 @@ out: } EXPORT_SYMBOL(add_extent_mapping); +/* simple helper to do math around the end of an extent, handling wrap */ static u64 range_end(u64 start, u64 len) { if (start + len < start) -- cgit v1.2.3 From c8b978188c9a0fd3d535c13debd19d522b726f1f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 29 Oct 2008 14:49:59 -0400 Subject: Btrfs: Add zlib compression support This is a large change for adding compression on reading and writing, both for inline and regular extents. It does some fairly large surgery to the writeback paths. Compression is off by default and enabled by mount -o compress. Even when the -o compress mount option is not used, it is possible to read compressed extents off the disk. If compression for a given set of pages fails to make them smaller, the file is flagged to avoid future compression attempts later. * While finding delalloc extents, the pages are locked before being sent down to the delalloc handler. This allows the delalloc handler to do complex things such as cleaning the pages, marking them writeback and starting IO on their behalf. * Inline extents are inserted at delalloc time now. This allows us to compress the data before inserting the inline extent, and it allows us to insert an inline extent that spans multiple pages. * All of the in-memory extent representations (extent_map.c, ordered-data.c etc) are changed to record both an in-memory size and an on disk size, as well as a flag for compression. From a disk format point of view, the extent pointers in the file are changed to record the on disk size of a given extent and some encoding flags. Space in the disk format is allocated for compression encoding, as well as encryption and a generic 'other' field. Neither the encryption or the 'other' field are currently used. In order to limit the amount of data read for a single random read in the file, the size of a compressed extent is limited to 128k. This is a software only limit, the disk format supports u64 sized compressed extents. In order to limit the ram consumed while processing extents, the uncompressed size of a compressed extent is limited to 256k. This is a software only limit and will be subject to tuning later. Checksumming is still done on compressed extents, and it is done on the uncompressed version of the data. This way additional encodings can be layered on without having to figure out which encoding to checksum. Compression happens at delalloc time, which is basically singled threaded because it is usually done by a single pdflush thread. This makes it tricky to spread the compression load across all the cpus on the box. We'll have to look at parallel pdflush walks of dirty inodes at a later time. Decompression is hooked into readpages and it does spread across CPUs nicely. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 74b2a29880d..fd3ebfb8c3c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -184,6 +184,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) return 0; + /* + * don't merge compressed extents, we need to know their + * actual size + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -239,6 +246,7 @@ int add_extent_mapping(struct extent_map_tree *tree, if (rb && mergable_maps(merge, em)) { em->start = merge->start; em->len += merge->len; + em->block_len += merge->block_len; em->block_start = merge->block_start; merge->in_tree = 0; rb_erase(&merge->rb_node, &tree->map); @@ -250,6 +258,7 @@ int add_extent_mapping(struct extent_map_tree *tree, merge = rb_entry(rb, struct extent_map, rb_node); if (rb && mergable_maps(em, merge)) { em->len += merge->len; + em->block_len += merge->len; rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; free_extent_map(merge); -- cgit v1.2.3 From d397712bcc6a759a560fd247e6053ecae091f958 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 5 Jan 2009 21:25:51 -0500 Subject: Btrfs: Fix checkpatch.pl warnings There were many, most are fixed now. struct-funcs.c generates some warnings but these are bogus. Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index fd3ebfb8c3c..4a83e33ada3 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -89,11 +89,11 @@ EXPORT_SYMBOL(free_extent_map); static struct rb_node *tree_insert(struct rb_root *root, u64 offset, struct rb_node *node) { - struct rb_node ** p = &root->rb_node; - struct rb_node * parent = NULL; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; struct extent_map *entry; - while(*p) { + while (*p) { parent = *p; entry = rb_entry(parent, struct extent_map, rb_node); @@ -122,13 +122,13 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct rb_node **prev_ret, struct rb_node **next_ret) { - struct rb_node * n = root->rb_node; + struct rb_node *n = root->rb_node; struct rb_node *prev = NULL; struct rb_node *orig_prev = NULL; struct extent_map *entry; struct extent_map *prev_entry = NULL; - while(n) { + while (n) { entry = rb_entry(n, struct extent_map, rb_node); prev = n; prev_entry = entry; @@ -145,7 +145,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, if (prev_ret) { orig_prev = prev; - while(prev && offset >= extent_map_end(prev_entry)) { + while (prev && offset >= extent_map_end(prev_entry)) { prev = rb_next(prev); prev_entry = rb_entry(prev, struct extent_map, rb_node); } @@ -155,7 +155,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, if (next_ret) { prev_entry = rb_entry(prev, struct extent_map, rb_node); - while(prev && offset < prev_entry->start) { + while (prev && offset < prev_entry->start) { prev = rb_prev(prev); prev_entry = rb_entry(prev, struct extent_map, rb_node); } -- cgit v1.2.3