From 0b86a832a1f38abec695864ec2eaedc9d2383f1b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:01:56 -0400
Subject: Btrfs: Add support for multiple devices per filesystem

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 852 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 852 insertions(+)
 create mode 100644 fs/btrfs/volumes.c

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 00000000000..90a8d45dc6d
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,852 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+
+struct map_lookup {
+	struct btrfs_device *dev;
+	u64 physical;
+};
+
+/*
+ * this uses a pretty simple search, the expectation is that it is
+ * called very infrequently and that a given device has a small number
+ * of extents
+ */
+static int find_free_dev_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_device *device,
+				struct btrfs_path *path,
+				u64 num_bytes, u64 *start)
+{
+	struct btrfs_key key;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *dev_extent = NULL;
+	u64 hole_size = 0;
+	u64 last_byte = 0;
+	u64 search_start = 0;
+	u64 search_end = device->total_bytes;
+	int ret;
+	int slot = 0;
+	int start_found;
+	struct extent_buffer *l;
+
+	start_found = 0;
+	path->reada = 2;
+
+	/* FIXME use last free of some kind */
+
+	key.objectid = device->devid;
+	key.offset = search_start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+	ret = btrfs_previous_item(root, path, 0, key.type);
+	if (ret < 0)
+		goto error;
+	l = path->nodes[0];
+	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+	while (1) {
+		l = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(l)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto error;
+no_more_items:
+			if (!start_found) {
+				if (search_start >= search_end) {
+					ret = -ENOSPC;
+					goto error;
+				}
+				*start = search_start;
+				start_found = 1;
+				goto check_pending;
+			}
+			*start = last_byte > search_start ?
+				last_byte : search_start;
+			if (search_end <= *start) {
+				ret = -ENOSPC;
+				goto error;
+			}
+			goto check_pending;
+		}
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (key.objectid < device->devid)
+			goto next;
+
+		if (key.objectid > device->devid)
+			goto no_more_items;
+
+		if (key.offset >= search_start && key.offset > last_byte &&
+		    start_found) {
+			if (last_byte < search_start)
+				last_byte = search_start;
+			hole_size = key.offset - last_byte;
+			if (key.offset > last_byte &&
+			    hole_size >= num_bytes) {
+				*start = last_byte;
+				goto check_pending;
+			}
+		}
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
+			goto next;
+		}
+
+		start_found = 1;
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+next:
+		path->slots[0]++;
+		cond_resched();
+	}
+check_pending:
+	/* we have to make sure we didn't find an extent that has already
+	 * been allocated by the map tree or the original allocation
+	 */
+	btrfs_release_path(root, path);
+	BUG_ON(*start < search_start);
+
+	if (*start + num_bytes >= search_end) {
+		ret = -ENOSPC;
+		goto error;
+	}
+	/* TODO: check for pending inserts here (not implemented yet) */
+	return 0;
+
+error:
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+			   struct btrfs_device *device,
+			   u64 owner, u64 num_bytes, u64 *start)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *extent;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = find_free_dev_extent(trans, device, path, num_bytes, start);
+	if (ret)
+		goto err;
+
+	key.objectid = device->devid;
+	key.offset = *start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(*extent));
+	BUG_ON(ret);
+
+	leaf = path->nodes[0];
+	extent = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_dev_extent);
+	btrfs_set_dev_extent_owner(leaf, extent, owner);
+	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+	btrfs_mark_buffer_dirty(leaf);
+err:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int find_next_chunk(struct btrfs_root *root, u64 *objectid)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	key.objectid = (u64)-1;
+	key.offset = (u64)-1;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+
+	BUG_ON(ret == 0);
+
+	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
+	if (ret) {
+		*objectid = 0;
+	} else {
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		*objectid = found_key.objectid + found_key.offset;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static struct btrfs_device *next_device(struct list_head *head,
+					struct list_head *last)
+{
+	struct list_head *next = last->next;
+	struct btrfs_device *dev;
+
+	if (list_empty(head))
+		return NULL;
+
+	if (next == head)
+		next = next->next;
+
+	dev = list_entry(next, struct btrfs_device, dev_list);
+	return dev;
+}
+
+static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path,
+			   u64 *objectid)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+
+	BUG_ON(ret == 0);
+
+	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
+				  BTRFS_DEV_ITEM_KEY);
+	if (ret) {
+		*objectid = 1;
+	} else {
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		*objectid = found_key.offset + 1;
+	}
+	ret = 0;
+error:
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+/*
+ * the device information is stored in the chunk root
+ * the btrfs_device struct should be fully filled in
+ */
+int btrfs_add_device(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_device *device)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_dev_item *dev_item;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	unsigned long ptr;
+	u64 free_devid;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = find_next_devid(root, path, &free_devid);
+	if (ret)
+		goto out;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = free_devid;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(*dev_item) + device->name_len);
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+	btrfs_set_device_id(leaf, dev_item, device->devid);
+	btrfs_set_device_type(leaf, dev_item, device->type);
+	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+	btrfs_set_device_rdev(leaf, dev_item, device->rdev);
+	btrfs_set_device_partition(leaf, dev_item, device->partition);
+	btrfs_set_device_name_len(leaf, dev_item, device->name_len);
+	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+
+	ptr = (unsigned long)btrfs_device_name(dev_item);
+	write_extent_buffer(leaf, device->name, ptr, device->name_len);
+
+	ptr = (unsigned long)btrfs_device_uuid(dev_item);
+	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+	ret = 0;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+int btrfs_update_device(struct btrfs_trans_handle *trans,
+			struct btrfs_device *device)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	struct btrfs_dev_item *dev_item;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	root = device->dev_root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = device->devid;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+	btrfs_set_device_id(leaf, dev_item, device->devid);
+	btrfs_set_device_type(leaf, dev_item, device->type);
+	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+	btrfs_set_device_rdev(leaf, dev_item, device->rdev);
+	btrfs_set_device_partition(leaf, dev_item, device->partition);
+	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+	btrfs_mark_buffer_dirty(leaf);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct btrfs_key *key,
+			   struct btrfs_chunk *chunk, int item_size)
+{
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_disk_key disk_key;
+	u32 array_size;
+	u8 *ptr;
+
+	array_size = btrfs_super_sys_array_size(super_copy);
+	if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+		return -EFBIG;
+
+	ptr = super_copy->sys_chunk_array + array_size;
+	btrfs_cpu_key_to_disk(&disk_key, key);
+	memcpy(ptr, &disk_key, sizeof(disk_key));
+	ptr += sizeof(disk_key);
+	memcpy(ptr, chunk, item_size);
+	item_size += sizeof(disk_key);
+	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+	return 0;
+}
+
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *extent_root, u64 *start,
+		      u64 *num_bytes, u32 type)
+{
+	u64 dev_offset;
+	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	struct btrfs_stripe *stripes;
+	struct btrfs_device *device = NULL;
+	struct btrfs_chunk *chunk;
+	struct list_head *dev_list = &extent_root->fs_info->devices;
+	struct list_head *last_dev = extent_root->fs_info->last_device;
+	struct extent_map_tree *em_tree;
+	struct map_lookup *map;
+	struct extent_map *em;
+	u64 physical;
+	u64 calc_size = 1024 * 1024 * 1024;
+	int num_stripes;
+	int ret;
+	int index = 0;
+	struct btrfs_key key;
+
+
+	ret = find_next_chunk(chunk_root, &key.objectid);
+	if (ret)
+		return ret;
+
+	num_stripes = 1;
+	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
+	if (!chunk)
+		return -ENOMEM;
+
+	stripes = &chunk->stripe;
+
+	*num_bytes = calc_size;
+	while(index < num_stripes) {
+		device = next_device(dev_list, last_dev);
+		BUG_ON(!device);
+		last_dev = &device->dev_list;
+		extent_root->fs_info->last_device = last_dev;
+
+		ret = btrfs_alloc_dev_extent(trans, device,
+					     key.objectid,
+					     calc_size, &dev_offset);
+		BUG_ON(ret);
+
+		device->bytes_used += calc_size;
+		ret = btrfs_update_device(trans, device);
+		BUG_ON(ret);
+
+		btrfs_set_stack_stripe_devid(stripes + index, device->devid);
+		btrfs_set_stack_stripe_offset(stripes + index, dev_offset);
+		physical = dev_offset;
+		index++;
+	}
+
+	/* key.objectid was set above */
+	key.offset = *num_bytes;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+	btrfs_set_stack_chunk_stripe_len(chunk, 64 * 1024);
+	btrfs_set_stack_chunk_type(chunk, type);
+	btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
+	btrfs_set_stack_chunk_io_align(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_io_width(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+
+	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
+				btrfs_chunk_item_size(num_stripes));
+	BUG_ON(ret);
+	*start = key.objectid;
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em)
+		return -ENOMEM;
+	map = kmalloc(sizeof(*map), GFP_NOFS);
+	if (!map) {
+		free_extent_map(em);
+		return -ENOMEM;
+	}
+
+	em->bdev = (struct block_device *)map;
+	em->start = key.objectid;
+	em->len = key.offset;
+	em->block_start = 0;
+
+	map->physical = physical;
+	map->dev = device;
+
+	if (!map->dev) {
+		kfree(map);
+		free_extent_map(em);
+		return -EIO;
+	}
+	kfree(chunk);
+
+	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	spin_lock(&em_tree->lock);
+	ret = add_extent_mapping(em_tree, em);
+	BUG_ON(ret);
+	spin_unlock(&em_tree->lock);
+	free_extent_map(em);
+	return ret;
+}
+
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
+{
+	extent_map_tree_init(&tree->map_tree, GFP_NOFS);
+}
+
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
+{
+	struct extent_map *em;
+
+	while(1) {
+		spin_lock(&tree->map_tree.lock);
+		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
+		if (em)
+			remove_extent_mapping(&tree->map_tree, em);
+		spin_unlock(&tree->map_tree.lock);
+		if (!em)
+			break;
+		kfree(em->bdev);
+		/* once for us */
+		free_extent_map(em);
+		/* once for the tree */
+		free_extent_map(em);
+	}
+}
+
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
+		    u64 logical, u64 *phys, u64 *length,
+		    struct btrfs_device **dev)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	u64 offset;
+
+
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, *length);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	offset = logical - em->start;
+	*phys = map->physical + offset;
+	*length = em->len - offset;
+	*dev = map->dev;
+	free_extent_map(em);
+	spin_unlock(&em_tree->lock);
+	return 0;
+}
+
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
+{
+	struct btrfs_mapping_tree *map_tree;
+	struct btrfs_device *dev;
+	u64 logical = bio->bi_sector << 9;
+	u64 physical;
+	u64 length = 0;
+	u64 map_length;
+	struct bio_vec *bvec;
+	int i;
+	int ret;
+
+	bio_for_each_segment(bvec, bio, i) {
+		length += bvec->bv_len;
+	}
+	map_tree = &root->fs_info->mapping_tree;
+	map_length = length;
+	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
+	BUG_ON(map_length < length);
+	bio->bi_sector = physical >> 9;
+	bio->bi_bdev = dev->bdev;
+	submit_bio(rw, bio);
+	return 0;
+}
+
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid)
+{
+	struct btrfs_device *dev;
+	struct list_head *cur = root->fs_info->devices.next;
+	struct list_head *head = &root->fs_info->devices;
+
+	while(cur != head) {
+		dev = list_entry(cur, struct btrfs_device, dev_list);
+		if (dev->devid == devid)
+			return dev;
+		cur = cur->next;
+	}
+	return NULL;
+}
+
+static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+			  struct extent_buffer *leaf,
+			  struct btrfs_chunk *chunk)
+{
+	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	struct map_lookup *map;
+	struct extent_map *em;
+	u64 logical;
+	u64 length;
+	u64 devid;
+	int ret;
+
+	logical = key->objectid;
+	length = key->offset;
+	spin_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+
+	/* already mapped? */
+	if (em && em->start <= logical && em->start + em->len > logical) {
+		free_extent_map(em);
+		spin_unlock(&map_tree->map_tree.lock);
+		return 0;
+	} else if (em) {
+		free_extent_map(em);
+	}
+	spin_unlock(&map_tree->map_tree.lock);
+
+	map = kzalloc(sizeof(*map), GFP_NOFS);
+	if (!map)
+		return -ENOMEM;
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em)
+		return -ENOMEM;
+	map = kmalloc(sizeof(*map), GFP_NOFS);
+	if (!map) {
+		free_extent_map(em);
+		return -ENOMEM;
+	}
+
+	em->bdev = (struct block_device *)map;
+	em->start = logical;
+	em->len = length;
+	em->block_start = 0;
+
+	map->physical = btrfs_stripe_offset_nr(leaf, chunk, 0);
+	devid = btrfs_stripe_devid_nr(leaf, chunk, 0);
+	map->dev = btrfs_find_device(root, devid);
+	if (!map->dev) {
+		kfree(map);
+		free_extent_map(em);
+		return -EIO;
+	}
+
+	spin_lock(&map_tree->map_tree.lock);
+	ret = add_extent_mapping(&map_tree->map_tree, em);
+	BUG_ON(ret);
+	spin_unlock(&map_tree->map_tree.lock);
+	free_extent_map(em);
+
+	return 0;
+}
+
+static int fill_device_from_item(struct extent_buffer *leaf,
+				 struct btrfs_dev_item *dev_item,
+				 struct btrfs_device *device)
+{
+	unsigned long ptr;
+	char *name;
+
+	device->devid = btrfs_device_id(leaf, dev_item);
+	device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+	device->type = btrfs_device_type(leaf, dev_item);
+	device->io_align = btrfs_device_io_align(leaf, dev_item);
+	device->io_width = btrfs_device_io_width(leaf, dev_item);
+	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+	device->rdev = btrfs_device_rdev(leaf, dev_item);
+	device->partition = btrfs_device_partition(leaf, dev_item);
+	device->name_len = btrfs_device_name_len(leaf, dev_item);
+
+	ptr = (unsigned long)btrfs_device_uuid(dev_item);
+	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
+
+	name = kmalloc(device->name_len + 1, GFP_NOFS);
+	if (!name)
+		return -ENOMEM;
+	device->name = name;
+	ptr = (unsigned long)btrfs_device_name(dev_item);
+	read_extent_buffer(leaf, name, ptr, device->name_len);
+	name[device->name_len] = '\0';
+	return 0;
+}
+
+static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
+			struct extent_buffer *leaf,
+			struct btrfs_dev_item *dev_item)
+{
+	struct btrfs_device *device;
+	u64 devid;
+	int ret;
+
+	devid = btrfs_device_id(leaf, dev_item);
+	if (btrfs_find_device(root, devid))
+		return 0;
+
+	device = kmalloc(sizeof(*device), GFP_NOFS);
+	if (!device)
+		return -ENOMEM;
+
+	fill_device_from_item(leaf, dev_item, device);
+	device->dev_root = root->fs_info->dev_root;
+	device->bdev = root->fs_info->sb->s_bdev;
+	list_add(&device->dev_list, &root->fs_info->devices);
+	memcpy(&device->dev_key, key, sizeof(*key));
+	ret = 0;
+#if 0
+	ret = btrfs_open_device(device);
+	if (ret) {
+		kfree(device);
+	}
+#endif
+	return ret;
+}
+
+int btrfs_read_sys_array(struct btrfs_root *root)
+{
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct extent_buffer *sb = root->fs_info->sb_buffer;
+	struct btrfs_disk_key *disk_key;
+	struct btrfs_dev_item *dev_item;
+	struct btrfs_chunk *chunk;
+	struct btrfs_key key;
+	u32 num_stripes;
+	u32 array_size;
+	u32 len = 0;
+	u8 *ptr;
+	unsigned long sb_ptr;
+	u32 cur;
+	int ret;
+	int dev_only = 1;
+
+	array_size = btrfs_super_sys_array_size(super_copy);
+
+	/*
+	 * we do this loop twice, once for the device items and
+	 * once for all of the chunks.  This way there are device
+	 * structs filled in for every chunk
+	 */
+again:
+	ptr = super_copy->sys_chunk_array;
+	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
+	cur = 0;
+
+	while (cur < array_size) {
+		disk_key = (struct btrfs_disk_key *)ptr;
+		btrfs_disk_key_to_cpu(&key, disk_key);
+
+		len = sizeof(*disk_key);
+		ptr += len;
+		sb_ptr += len;
+		cur += len;
+
+		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID &&
+		    key.type == BTRFS_DEV_ITEM_KEY) {
+			dev_item = (struct btrfs_dev_item *)sb_ptr;
+			if (dev_only) {
+				ret = read_one_dev(root, &key, sb, dev_item);
+				BUG_ON(ret);
+			}
+			len = sizeof(*dev_item);
+			len += btrfs_device_name_len(sb, dev_item);
+		} else if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+
+			chunk = (struct btrfs_chunk *)sb_ptr;
+			if (!dev_only) {
+				ret = read_one_chunk(root, &key, sb, chunk);
+				BUG_ON(ret);
+			}
+			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+			len = btrfs_chunk_item_size(num_stripes);
+		} else {
+			BUG();
+		}
+		ptr += len;
+		sb_ptr += len;
+		cur += len;
+	}
+	if (dev_only == 1) {
+		dev_only = 0;
+		goto again;
+	}
+	return 0;
+}
+
+int btrfs_read_chunk_tree(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	int ret;
+	int slot;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/* first we search for all of the device items, and then we
+	 * read in all of the chunk items.  This way we can create chunk
+	 * mappings that reference all of the devices that are found
+	 */
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.offset = 0;
+	key.type = 0;
+again:
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	while(1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto error;
+			break;
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
+				break;
+			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+				struct btrfs_dev_item *dev_item;
+				dev_item = btrfs_item_ptr(leaf, slot,
+						  struct btrfs_dev_item);
+				ret = read_one_dev(root, &found_key, leaf,
+						   dev_item);
+				BUG_ON(ret);
+			}
+		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
+			struct btrfs_chunk *chunk;
+			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+			ret = read_one_chunk(root, &found_key, leaf, chunk);
+		}
+		path->slots[0]++;
+	}
+	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+		key.objectid = 0;
+		btrfs_release_path(root, path);
+		goto again;
+	}
+
+	btrfs_free_path(path);
+	ret = 0;
+error:
+	return ret;
+}
+
-- 
cgit v1.2.3


From 6324fbf334f4586325057197da7752f4ffa409d3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:01:59 -0400
Subject: Btrfs: Dynamic chunk and block group allocation

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 87 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 34 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 90a8d45dc6d..a52a13f365d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -131,7 +131,7 @@ check_pending:
 	btrfs_release_path(root, path);
 	BUG_ON(*start < search_start);
 
-	if (*start + num_bytes >= search_end) {
+	if (*start + num_bytes > search_end) {
 		ret = -ENOSPC;
 		goto error;
 	}
@@ -159,8 +159,9 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	ret = find_free_dev_extent(trans, device, path, num_bytes, start);
-	if (ret)
+	if (ret) {
 		goto err;
+	}
 
 	key.objectid = device->devid;
 	key.offset = *start;
@@ -214,22 +215,6 @@ error:
 	return ret;
 }
 
-static struct btrfs_device *next_device(struct list_head *head,
-					struct list_head *last)
-{
-	struct list_head *next = last->next;
-	struct btrfs_device *dev;
-
-	if (list_empty(head))
-		return NULL;
-
-	if (next == head)
-		next = next->next;
-
-	dev = list_entry(next, struct btrfs_device, dev_list);
-	return dev;
-}
-
 static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path,
 			   u64 *objectid)
 {
@@ -397,31 +382,63 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 *start,
-		      u64 *num_bytes, u32 type)
+		      u64 *num_bytes, u64 type)
 {
 	u64 dev_offset;
 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
 	struct btrfs_stripe *stripes;
 	struct btrfs_device *device = NULL;
 	struct btrfs_chunk *chunk;
+	struct list_head private_devs;
 	struct list_head *dev_list = &extent_root->fs_info->devices;
-	struct list_head *last_dev = extent_root->fs_info->last_device;
+	struct list_head *cur;
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
 	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
-	int num_stripes;
+	u64 avail;
+	u64 max_avail = 0;
+	int num_stripes = 1;
+	int looped = 0;
 	int ret;
-	int index = 0;
+	int index;
 	struct btrfs_key key;
 
+	if (list_empty(dev_list))
+		return -ENOSPC;
+again:
+	INIT_LIST_HEAD(&private_devs);
+	cur = dev_list->next;
+	index = 0;
+	/* build a private list of devices we will allocate from */
+	while(index < num_stripes) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		avail = device->total_bytes - device->bytes_used;
+		cur = cur->next;
+		if (avail > max_avail)
+			max_avail = avail;
+		if (avail >= calc_size) {
+			list_move_tail(&device->dev_list, &private_devs);
+			index++;
+		}
+		if (cur == dev_list)
+			break;
+	}
+	if (index < num_stripes) {
+		list_splice(&private_devs, dev_list);
+		if (!looped && max_avail > 0) {
+			looped = 1;
+			calc_size = max_avail;
+			goto again;
+		}
+		return -ENOSPC;
+	}
 
 	ret = find_next_chunk(chunk_root, &key.objectid);
 	if (ret)
 		return ret;
 
-	num_stripes = 1;
 	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
 	if (!chunk)
 		return -ENOMEM;
@@ -429,11 +446,12 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	stripes = &chunk->stripe;
 
 	*num_bytes = calc_size;
+	index = 0;
 	while(index < num_stripes) {
-		device = next_device(dev_list, last_dev);
-		BUG_ON(!device);
-		last_dev = &device->dev_list;
-		extent_root->fs_info->last_device = last_dev;
+		BUG_ON(list_empty(&private_devs));
+		cur = private_devs.next;
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		list_move_tail(&device->dev_list, dev_list);
 
 		ret = btrfs_alloc_dev_extent(trans, device,
 					     key.objectid,
@@ -449,6 +467,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		physical = dev_offset;
 		index++;
 	}
+	BUG_ON(!list_empty(&private_devs));
 
 	/* key.objectid was set above */
 	key.offset = *num_bytes;
@@ -692,17 +711,17 @@ static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
 	int ret;
 
 	devid = btrfs_device_id(leaf, dev_item);
-	if (btrfs_find_device(root, devid))
-		return 0;
-
-	device = kmalloc(sizeof(*device), GFP_NOFS);
-	if (!device)
-		return -ENOMEM;
+	device = btrfs_find_device(root, devid);
+	if (!device) {
+		device = kmalloc(sizeof(*device), GFP_NOFS);
+		if (!device)
+			return -ENOMEM;
+		list_add(&device->dev_list, &root->fs_info->devices);
+	}
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = root->fs_info->sb->s_bdev;
-	list_add(&device->dev_list, &root->fs_info->devices);
 	memcpy(&device->dev_key, key, sizeof(*key));
 	ret = 0;
 #if 0
-- 
cgit v1.2.3


From 0d81ba5dbedef0c3970d6aa318aa84920943e6e3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:02:07 -0400
Subject: Btrfs: Move device information into the super block so it can be
 scanned

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 61 ++++++++++++++----------------------------------------
 1 file changed, 15 insertions(+), 46 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a52a13f365d..ae22d01ecf5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -278,7 +278,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	key.offset = free_devid;
 
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
-				      sizeof(*dev_item) + device->name_len);
+				      sizeof(*dev_item));
 	if (ret)
 		goto out;
 
@@ -290,15 +290,9 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
-	btrfs_set_device_rdev(leaf, dev_item, device->rdev);
-	btrfs_set_device_partition(leaf, dev_item, device->partition);
-	btrfs_set_device_name_len(leaf, dev_item, device->name_len);
 	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
 
-	ptr = (unsigned long)btrfs_device_name(dev_item);
-	write_extent_buffer(leaf, device->name, ptr, device->name_len);
-
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
@@ -345,8 +339,6 @@ int btrfs_update_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
-	btrfs_set_device_rdev(leaf, dev_item, device->rdev);
-	btrfs_set_device_partition(leaf, dev_item, device->partition);
 	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
 	btrfs_mark_buffer_dirty(leaf);
@@ -676,7 +668,6 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 				 struct btrfs_device *device)
 {
 	unsigned long ptr;
-	char *name;
 
 	device->devid = btrfs_device_id(leaf, dev_item);
 	device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
@@ -685,24 +676,14 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 	device->io_align = btrfs_device_io_align(leaf, dev_item);
 	device->io_width = btrfs_device_io_width(leaf, dev_item);
 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
-	device->rdev = btrfs_device_rdev(leaf, dev_item);
-	device->partition = btrfs_device_partition(leaf, dev_item);
-	device->name_len = btrfs_device_name_len(leaf, dev_item);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
 
-	name = kmalloc(device->name_len + 1, GFP_NOFS);
-	if (!name)
-		return -ENOMEM;
-	device->name = name;
-	ptr = (unsigned long)btrfs_device_name(dev_item);
-	read_extent_buffer(leaf, name, ptr, device->name_len);
-	name[device->name_len] = '\0';
 	return 0;
 }
 
-static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
+static int read_one_dev(struct btrfs_root *root,
 			struct extent_buffer *leaf,
 			struct btrfs_dev_item *dev_item)
 {
@@ -722,7 +703,6 @@ static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = root->fs_info->sb->s_bdev;
-	memcpy(&device->dev_key, key, sizeof(*key));
 	ret = 0;
 #if 0
 	ret = btrfs_open_device(device);
@@ -733,12 +713,20 @@ static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
 	return ret;
 }
 
+int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
+{
+	struct btrfs_dev_item *dev_item;
+	/* feed the super block's dev_item to read_one_dev() as an offset */
+	dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
+						     dev_item);
+	return read_one_dev(root, buf, dev_item); /* its result is ours */
+}
+
 int btrfs_read_sys_array(struct btrfs_root *root)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 	struct extent_buffer *sb = root->fs_info->sb_buffer;
 	struct btrfs_disk_key *disk_key;
-	struct btrfs_dev_item *dev_item;
 	struct btrfs_chunk *chunk;
 	struct btrfs_key key;
 	u32 num_stripes;
@@ -748,7 +736,6 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	unsigned long sb_ptr;
 	u32 cur;
 	int ret;
-	int dev_only = 1;
 
 	array_size = btrfs_super_sys_array_size(super_copy);
 
@@ -757,7 +744,6 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	 * once for all of the chunks.  This way there are device
 	 * structs filled in for every chunk
 	 */
-again:
 	ptr = super_copy->sys_chunk_array;
 	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
 	cur = 0;
@@ -771,22 +757,10 @@ again:
 		sb_ptr += len;
 		cur += len;
 
-		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID &&
-		    key.type == BTRFS_DEV_ITEM_KEY) {
-			dev_item = (struct btrfs_dev_item *)sb_ptr;
-			if (dev_only) {
-				ret = read_one_dev(root, &key, sb, dev_item);
-				BUG_ON(ret);
-			}
-			len = sizeof(*dev_item);
-			len += btrfs_device_name_len(sb, dev_item);
-		} else if (key.type == BTRFS_CHUNK_ITEM_KEY) {
-
+		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
 			chunk = (struct btrfs_chunk *)sb_ptr;
-			if (!dev_only) {
-				ret = read_one_chunk(root, &key, sb, chunk);
-				BUG_ON(ret);
-			}
+			ret = read_one_chunk(root, &key, sb, chunk);
+			BUG_ON(ret);
 			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
 			len = btrfs_chunk_item_size(num_stripes);
 		} else {
@@ -796,10 +770,6 @@ again:
 		sb_ptr += len;
 		cur += len;
 	}
-	if (dev_only == 1) {
-		dev_only = 0;
-		goto again;
-	}
 	return 0;
 }
 
@@ -846,8 +816,7 @@ again:
 				struct btrfs_dev_item *dev_item;
 				dev_item = btrfs_item_ptr(leaf, slot,
 						  struct btrfs_dev_item);
-				ret = read_one_dev(root, &found_key, leaf,
-						   dev_item);
+				ret = read_one_dev(root, leaf, dev_item);
 				BUG_ON(ret);
 			}
 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
-- 
cgit v1.2.3


From 239b14b32dc39232ebf9cce29ff77c4c564355fd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:02:07 -0400
Subject: Btrfs: Bring back mount -o ssd optimizations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ae22d01ecf5..16fb6bbe6e2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -578,6 +578,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
+	if (map_length < length) {
+		printk("mapping failed logical %Lu bio len %Lu physical %Lu "
+		       "len %Lu\n", logical, length, physical, map_length);
+		BUG();
+	}
 	BUG_ON(map_length < length);
 	bio->bi_sector = physical >> 9;
 	bio->bi_bdev = dev->bdev;
-- 
cgit v1.2.3


From 8a4b83cc8bd75fca29ac68615896d9e92820e7c2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:02:07 -0400
Subject: Btrfs: Add support for device scanning and detection ioctls

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 222 insertions(+), 14 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 16fb6bbe6e2..263f01cc3db 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
  */
 #include <linux/sched.h>
 #include <linux/bio.h>
+#include <linux/buffer_head.h>
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
@@ -28,6 +29,215 @@ struct map_lookup {
 	struct btrfs_device *dev;
 	u64 physical;
 };
+static DEFINE_MUTEX(uuid_mutex);
+static LIST_HEAD(fs_uuids);
+
+/* free every scanned device and its name, closing open bdevs; returns 0 */
+int btrfs_cleanup_fs_uuids(void)
+{
+	struct btrfs_fs_devices *fs_devices;
+	struct list_head *uuid_cur;
+	struct btrfs_device *dev;
+
+	list_for_each(uuid_cur, &fs_uuids) { /* list heads stay allocated */
+		fs_devices = list_entry(uuid_cur, struct btrfs_fs_devices,
+					list);
+		while(!list_empty(&fs_devices->devices)) {
+			dev = list_entry(fs_devices->devices.next,
+					 struct btrfs_device, dev_list);
+			printk("uuid cleanup finds %s\n", dev->name);
+			if (dev->bdev) {
+				printk("closing\n");
+				close_bdev_excl(dev->bdev);
+			}
+			list_del(&dev->dev_list);
+			kfree(dev->name); /* kstrdup() in device_list_add */
+			kfree(dev);
+		}
+	}
+	return 0;
+}
+
+/* Look up the device with id @devid on @head; NULL when it is absent. */
+static struct btrfs_device *__find_device(struct list_head *head, u64 devid)
+{
+	struct btrfs_device *candidate;
+
+	list_for_each_entry(candidate, head, dev_list) {
+		if (candidate->devid != devid)
+			continue;
+		return candidate;
+	}
+	return NULL;
+}
+
+/* Return the fs_uuids entry whose fsid equals @fsid, or NULL if none does. */
+static struct btrfs_fs_devices *find_fsid(u8 *fsid)
+{
+	struct btrfs_fs_devices *known;
+
+	list_for_each_entry(known, &fs_uuids, list) {
+		if (memcmp(known->fsid, fsid, BTRFS_FSID_SIZE))
+			continue;
+		return known;
+	}
+	return NULL;
+}
+
+static int device_list_add(const char *path,
+			   struct btrfs_super_block *disk_super,
+			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+{
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *fs_devices;
+	u64 found_transid = btrfs_super_generation(disk_super);
+	/* remember devid at path under disk_super's fsid; 0 or -ENOMEM */
+	fs_devices = find_fsid(disk_super->fsid);
+	if (!fs_devices) {	/* first device seen for this fsid */
+		fs_devices = kmalloc(sizeof(*fs_devices), GFP_NOFS);
+		if (!fs_devices)
+			return -ENOMEM;
+		INIT_LIST_HEAD(&fs_devices->devices);
+		list_add(&fs_devices->list, &fs_uuids);
+		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
+		fs_devices->latest_devid = devid;
+		fs_devices->latest_trans = found_transid;
+		fs_devices->lowest_devid = (u64)-1; /* max, lowered below */
+		fs_devices->num_devices = 0;
+		device = NULL;
+	} else {
+		device = __find_device(&fs_devices->devices, devid);
+	}
+	if (!device) {	/* devid not on the list yet */
+		device = kzalloc(sizeof(*device), GFP_NOFS);
+		if (!device) {
+			/*
+			 * we can safely leave the fs_devices entry around
+			 * (it stays linked on fs_uuids, possibly empty)
+			 */
+			return -ENOMEM;
+		}
+		device->devid = devid;
+		device->name = kstrdup(path, GFP_NOFS); /* private copy */
+		if (!device->name) {
+			kfree(device);
+			return -ENOMEM;
+		}
+		list_add(&device->dev_list, &fs_devices->devices);
+		fs_devices->num_devices++;
+	}
+	/* track the newest generation seen and the smallest devid */
+	if (found_transid > fs_devices->latest_trans) {
+		fs_devices->latest_devid = devid;
+		fs_devices->latest_trans = found_transid;
+	}
+	if (fs_devices->lowest_devid > devid) {
+		fs_devices->lowest_devid = devid;
+		printk("lowest devid now %Lu\n", devid);
+	}
+	*fs_devices_ret = fs_devices;	/* written on success only */
+	return 0;
+}
+
+/* Close every block device still held open for @fs_devices; returns 0. */
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct btrfs_device *entry;
+
+	/* serialize against device scanning, which also takes uuid_mutex */
+	mutex_lock(&uuid_mutex);
+	list_for_each_entry(entry, &fs_devices->devices, dev_list) {
+		if (entry->bdev) {
+			close_bdev_excl(entry->bdev);
+			printk("close devices closes %s\n", entry->name);
+		}
+		/* drop the handle whether or not one was open */
+		entry->bdev = NULL;
+	}
+	mutex_unlock(&uuid_mutex);
+	return 0;
+}
+
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+		       int flags, void *holder)
+{
+	struct block_device *bdev;
+	struct list_head *head = &fs_devices->devices;
+	struct list_head *cur;
+	struct btrfs_device *device;
+	int ret;
+	/* open each listed device by name; 0 or the first open error */
+	mutex_lock(&uuid_mutex);
+	list_for_each(cur, head) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		bdev = open_bdev_excl(device->name, flags, holder);
+printk("opening %s devid %Lu\n", device->name, device->devid);
+		if (IS_ERR(bdev)) {
+			printk("open %s failed\n", device->name);
+			ret = PTR_ERR(bdev);
+			goto fail;
+		}
+		if (device->devid == fs_devices->latest_devid)
+			fs_devices->latest_bdev = bdev; /* newest generation */
+		if (device->devid == fs_devices->lowest_devid) {
+			fs_devices->lowest_bdev = bdev;
+printk("lowest bdev %s\n", device->name);
+		}
+		device->bdev = bdev;
+	}
+	mutex_unlock(&uuid_mutex);
+	return 0;
+fail:
+	mutex_unlock(&uuid_mutex);	/* btrfs_close_devices() retakes it */
+	btrfs_close_devices(fs_devices); /* closes what was opened so far */
+	return ret;
+}
+
+int btrfs_scan_one_device(const char *path, int flags, void *holder,
+			  struct btrfs_fs_devices **fs_devices_ret)
+{
+	struct btrfs_super_block *disk_super;
+	struct block_device *bdev;
+	struct buffer_head *bh;
+	int ret;
+	u64 devid;
+	/* probe path for a btrfs super block and record the device it names */
+	mutex_lock(&uuid_mutex);
+	/* uuid_mutex is held until return, covering device_list_add() */
+	printk("scan one opens %s\n", path);
+	bdev = open_bdev_excl(path, flags, holder);
+
+	if (IS_ERR(bdev)) {
+		printk("open failed\n");
+		ret = PTR_ERR(bdev);
+		goto error;
+	}
+	/* the super block is fetched as a single 4096 byte buffer */
+	ret = set_blocksize(bdev, 4096);
+	if (ret)
+		goto error_close;
+	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	if (!bh) {
+		ret = -EIO;
+		goto error_close;
+	}
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+	    sizeof(disk_super->magic))) {
+		printk("no btrfs found on %s\n", path);
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+	devid = le64_to_cpu(disk_super->dev_item.devid);
+	printk("found device %Lu on %s\n", devid, path);
+	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+	/* success falls through too, so the probe never keeps bdev open */
+error_brelse:
+	brelse(bh);
+error_close:
+	close_bdev_excl(bdev);
+	printk("scan one closes bdev %s\n", path);
+error:
+	mutex_unlock(&uuid_mutex);
+	return ret; /* device_list_add()'s result, or the probe's error */
+}
 
 /*
  * this uses a pretty simple search, the expectation is that it is
@@ -56,6 +266,10 @@ static int find_free_dev_extent(struct btrfs_trans_handle *trans,
 
 	/* FIXME use last free of some kind */
 
+	/* we don't want to overwrite the superblock on the drive,
+	 * so we make sure to start at an offset of at least 1MB
+	 */
+	search_start = max((u64)1024 * 1024, search_start);
 	key.objectid = device->devid;
 	key.offset = search_start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
@@ -285,6 +499,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
 
+	device->devid = free_devid;
 	btrfs_set_device_id(leaf, dev_item, device->devid);
 	btrfs_set_device_type(leaf, dev_item, device->type);
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
@@ -382,7 +597,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct btrfs_device *device = NULL;
 	struct btrfs_chunk *chunk;
 	struct list_head private_devs;
-	struct list_head *dev_list = &extent_root->fs_info->devices;
+	struct list_head *dev_list = &extent_root->fs_info->fs_devices->devices;
 	struct list_head *cur;
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
@@ -449,7 +664,7 @@ again:
 					     key.objectid,
 					     calc_size, &dev_offset);
 		BUG_ON(ret);
-
+printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
 		device->bytes_used += calc_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
@@ -592,17 +807,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 
 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid)
 {
-	struct btrfs_device *dev;
-	struct list_head *cur = root->fs_info->devices.next;
-	struct list_head *head = &root->fs_info->devices;
+	struct list_head *head = &root->fs_info->fs_devices->devices;
 
-	while(cur != head) {
-		dev = list_entry(cur, struct btrfs_device, dev_list);
-		if (dev->devid == devid)
-			return dev;
-		cur = cur->next;
-	}
-	return NULL;
+	return __find_device(head, devid);
 }
 
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
@@ -699,15 +906,16 @@ static int read_one_dev(struct btrfs_root *root,
 	devid = btrfs_device_id(leaf, dev_item);
 	device = btrfs_find_device(root, devid);
 	if (!device) {
+		printk("warning devid %Lu not found already\n", devid);
 		device = kmalloc(sizeof(*device), GFP_NOFS);
 		if (!device)
 			return -ENOMEM;
-		list_add(&device->dev_list, &root->fs_info->devices);
+		list_add(&device->dev_list,
+			 &root->fs_info->fs_devices->devices);
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
-	device->bdev = root->fs_info->sb->s_bdev;
 	ret = 0;
 #if 0
 	ret = btrfs_open_device(device);
-- 
cgit v1.2.3


From 593060d756e0c2382d59cf00d5f3b9e3a336c408 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 25 Mar 2008 16:50:33 -0400
Subject: Btrfs: Implement raid0 when multiple devices are present

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 130 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 100 insertions(+), 30 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 263f01cc3db..d8fce32a3bb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
+#include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
@@ -25,10 +26,24 @@
 #include "print-tree.h"
 #include "volumes.h"
 
-struct map_lookup {
+struct stripe {
 	struct btrfs_device *dev;
 	u64 physical;
 };
+
+struct map_lookup {
+	u64 type;
+	int io_align;
+	int io_width;
+	int stripe_len;
+	int sector_size;
+	int num_stripes;
+	struct stripe stripes[];
+};
+
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+			    (sizeof(struct stripe) * (n)))
+
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 
@@ -592,6 +607,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      u64 *num_bytes, u64 type)
 {
 	u64 dev_offset;
+	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
 	struct btrfs_stripe *stripes;
 	struct btrfs_device *device = NULL;
@@ -610,10 +626,18 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int looped = 0;
 	int ret;
 	int index;
+	int stripe_len = 64 * 1024;
 	struct btrfs_key key;
 
 	if (list_empty(dev_list))
 		return -ENOSPC;
+
+	if (type & BTRFS_BLOCK_GROUP_RAID0)
+		num_stripes = btrfs_super_num_devices(&info->super_copy);
+	if (type & BTRFS_BLOCK_GROUP_DATA)
+		stripe_len = 64 * 1024;
+	if (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
+		stripe_len = 32 * 1024;
 again:
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
@@ -650,9 +674,15 @@ again:
 	if (!chunk)
 		return -ENOMEM;
 
+	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+	if (!map) {
+		kfree(chunk);
+		return -ENOMEM;
+	}
+
 	stripes = &chunk->stripe;
 
-	*num_bytes = calc_size;
+	*num_bytes = calc_size * num_stripes;
 	index = 0;
 	while(index < num_stripes) {
 		BUG_ON(list_empty(&private_devs));
@@ -669,6 +699,8 @@ printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
 
+		map->stripes[index].dev = device;
+		map->stripes[index].physical = dev_offset;
 		btrfs_set_stack_stripe_devid(stripes + index, device->devid);
 		btrfs_set_stack_stripe_offset(stripes + index, dev_offset);
 		physical = dev_offset;
@@ -680,12 +712,18 @@ printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
 	key.offset = *num_bytes;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
-	btrfs_set_stack_chunk_stripe_len(chunk, 64 * 1024);
+	btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
 	btrfs_set_stack_chunk_type(chunk, type);
 	btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
-	btrfs_set_stack_chunk_io_align(chunk, extent_root->sectorsize);
-	btrfs_set_stack_chunk_io_width(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_io_align(chunk, stripe_len);
+	btrfs_set_stack_chunk_io_width(chunk, stripe_len);
 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+	map->sector_size = extent_root->sectorsize;
+	map->stripe_len = stripe_len;
+	map->io_align = stripe_len;
+	map->io_width = stripe_len;
+	map->type = type;
+	map->num_stripes = num_stripes;
 
 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
 				btrfs_chunk_item_size(num_stripes));
@@ -695,25 +733,11 @@ printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
-	map = kmalloc(sizeof(*map), GFP_NOFS);
-	if (!map) {
-		free_extent_map(em);
-		return -ENOMEM;
-	}
-
 	em->bdev = (struct block_device *)map;
 	em->start = key.objectid;
 	em->len = key.offset;
 	em->block_start = 0;
 
-	map->physical = physical;
-	map->dev = device;
-
-	if (!map->dev) {
-		kfree(map);
-		free_extent_map(em);
-		return -EIO;
-	}
 	kfree(chunk);
 
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
@@ -758,6 +782,9 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 	struct map_lookup *map;
 	struct extent_map_tree *em_tree = &map_tree->map_tree;
 	u64 offset;
+	u64 stripe_offset;
+	u64 stripe_nr;
+	int stripe_index;
 
 
 	spin_lock(&em_tree->lock);
@@ -767,9 +794,40 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 	BUG_ON(em->start > logical || em->start + em->len < logical);
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
-	*phys = map->physical + offset;
-	*length = em->len - offset;
-	*dev = map->dev;
+
+	stripe_nr = offset;
+	/*
+	 * stripe_nr counts the total number of stripes we have to stride
+	 * to get to this block
+	 */
+	do_div(stripe_nr, map->stripe_len);
+
+	stripe_offset = stripe_nr * map->stripe_len;
+	BUG_ON(offset < stripe_offset);
+
+	/* stripe_offset is the offset of this block in its stripe*/
+	stripe_offset = offset - stripe_offset;
+
+	/*
+	 * after this do_div call, stripe_nr is the number of stripes
+	 * on this device we have to walk to find the data, and
+	 * stripe_index is the number of our device in the stripe array
+	 */
+	stripe_index = do_div(stripe_nr, map->num_stripes);
+
+	BUG_ON(stripe_index >= map->num_stripes);
+
+	*phys = map->stripes[stripe_index].physical + stripe_offset +
+		stripe_nr * map->stripe_len;
+
+	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+		/* we limit the length of each bio to what fits in a stripe */
+		*length = min_t(u64, em->len - offset,
+			      map->stripe_len - stripe_offset);
+	} else {
+		*length = em->len - offset;
+	}
+	*dev = map->stripes[stripe_index].dev;
 	free_extent_map(em);
 	spin_unlock(&em_tree->lock);
 	return 0;
@@ -822,7 +880,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	u64 logical;
 	u64 length;
 	u64 devid;
+	int num_stripes;
 	int ret;
+	int i;
 
 	logical = key->objectid;
 	length = key->offset;
@@ -846,7 +906,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
-	map = kmalloc(sizeof(*map), GFP_NOFS);
+	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
 		free_extent_map(em);
 		return -ENOMEM;
@@ -857,13 +918,22 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	em->len = length;
 	em->block_start = 0;
 
-	map->physical = btrfs_stripe_offset_nr(leaf, chunk, 0);
-	devid = btrfs_stripe_devid_nr(leaf, chunk, 0);
-	map->dev = btrfs_find_device(root, devid);
-	if (!map->dev) {
-		kfree(map);
-		free_extent_map(em);
-		return -EIO;
+	map->num_stripes = num_stripes;
+	map->io_width = btrfs_chunk_io_width(leaf, chunk);
+	map->io_align = btrfs_chunk_io_align(leaf, chunk);
+	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
+	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+	map->type = btrfs_chunk_type(leaf, chunk);
+	for (i = 0; i < num_stripes; i++) {
+		map->stripes[i].physical =
+			btrfs_stripe_offset_nr(leaf, chunk, i);
+		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+		map->stripes[i].dev = btrfs_find_device(root, devid);
+		if (!map->stripes[i].dev) {
+			kfree(map);
+			free_extent_map(em);
+			return -EIO;
+		}
 	}
 
 	spin_lock(&map_tree->map_tree.lock);
-- 
cgit v1.2.3


From e58ca0203d32869a01540a293df40ddc480dc378 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 1 Apr 2008 11:21:34 -0400
Subject: Fix btrfs_fill_super to return -EINVAL when no FS found

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d8fce32a3bb..169be0f7285 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -237,7 +237,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 	    sizeof(disk_super->magic))) {
 		printk("no btrfs found on %s\n", path);
-		ret = -ENOENT;
+		ret = -EINVAL;
 		goto error_brelse;
 	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
-- 
cgit v1.2.3


From 8790d502e4401a4a3a4175b83a3a47e8d595c771 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Apr 2008 16:29:03 -0400
Subject: Btrfs: Add support for mirroring across drives

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 154 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 126 insertions(+), 28 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 169be0f7285..bc3c0b97588 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -31,6 +31,13 @@ struct stripe {
 	u64 physical;
 };
 
+struct multi_bio {
+	atomic_t stripes;
+	bio_end_io_t *end_io;
+	void *private;
+	int error;
+};
+
 struct map_lookup {
 	u64 type;
 	int io_align;
@@ -632,12 +639,12 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	if (list_empty(dev_list))
 		return -ENOSPC;
 
-	if (type & BTRFS_BLOCK_GROUP_RAID0)
+	if (type & (BTRFS_BLOCK_GROUP_RAID0))
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
-	if (type & BTRFS_BLOCK_GROUP_DATA)
-		stripe_len = 64 * 1024;
-	if (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
-		stripe_len = 32 * 1024;
+	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
+		num_stripes = min_t(u64, 2,
+				  btrfs_super_num_devices(&info->super_copy));
+	}
 again:
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
@@ -682,7 +689,11 @@ again:
 
 	stripes = &chunk->stripe;
 
-	*num_bytes = calc_size * num_stripes;
+	if (type & BTRFS_BLOCK_GROUP_RAID1)
+		*num_bytes = calc_size;
+	else
+		*num_bytes = calc_size * num_stripes;
+
 	index = 0;
 	while(index < num_stripes) {
 		BUG_ON(list_empty(&private_devs));
@@ -694,7 +705,7 @@ again:
 					     key.objectid,
 					     calc_size, &dev_offset);
 		BUG_ON(ret);
-printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
+printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, calc_size, device->devid, type);
 		device->bytes_used += calc_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
@@ -774,9 +785,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 	}
 }
 
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
-		    u64 logical, u64 *phys, u64 *length,
-		    struct btrfs_device **dev)
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+		    int dev_nr, u64 logical, u64 *phys, u64 *length,
+		    struct btrfs_device **dev, int *total_devs)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -808,19 +819,39 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;
 
-	/*
-	 * after this do_div call, stripe_nr is the number of stripes
-	 * on this device we have to walk to find the data, and
-	 * stripe_index is the number of our device in the stripe array
-	 */
-	stripe_index = do_div(stripe_nr, map->num_stripes);
-
+	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+		stripe_index = dev_nr;
+		if (rw & (1 << BIO_RW))
+			*total_devs = map->num_stripes;
+		else {
+			int i;
+			u64 least = (u64)-1;
+			struct btrfs_device *cur;
+
+			for (i = 0; i < map->num_stripes; i++) {
+				cur = map->stripes[i].dev;
+				spin_lock(&cur->io_lock);
+				if (cur->total_ios < least) {
+					least = cur->total_ios;
+					stripe_index = i;
+				}
+				spin_unlock(&cur->io_lock);
+			}
+			*total_devs = 1;
+		}
+	} else {
+		/*
+		 * after this do_div call, stripe_nr is the number of stripes
+		 * on this device we have to walk to find the data, and
+		 * stripe_index is the number of our device in the stripe array
+		 */
+		stripe_index = do_div(stripe_nr, map->num_stripes);
+	}
 	BUG_ON(stripe_index >= map->num_stripes);
-
 	*phys = map->stripes[stripe_index].physical + stripe_offset +
 		stripe_nr * map->stripe_len;
 
-	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1)) {
 		/* we limit the length of each bio to what fits in a stripe */
 		*length = min_t(u64, em->len - offset,
 			      map->stripe_len - stripe_offset);
@@ -833,33 +864,98 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 	return 0;
 }
 
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_multi_stripe(struct bio *bio, int err)
+#else
+static int end_bio_multi_stripe(struct bio *bio,
+				   unsigned int bytes_done, int err)
+#endif
+{
+	struct multi_bio *multi = bio->bi_private;
+	/* completion handler btrfs_map_bio() installs on each stripe's bio */
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	if (bio->bi_size)	/* presumably a partial completion - confirm */
+		return 1;
+#endif
+	if (err)
+		multi->error = err;
+	/* the last stripe to finish completes the I/O for the caller */
+	if (atomic_dec_and_test(&multi->stripes)) {
+		bio->bi_private = multi->private;
+		bio->bi_end_io = multi->end_io;
+		/* surface an error recorded by any other stripe */
+		if (!err && multi->error)
+			err = multi->error;
+		kfree(multi);
+		/* NOTE(review): bio may be a clone here, not first_bio - confirm */
+		bio_endio(bio, err);
+	} else {
+		bio_put(bio);	/* others still pending, just drop this bio */
+	}
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	return 0;
+#endif
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 {
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
+	struct bio *first_bio = bio;
 	u64 logical = bio->bi_sector << 9;
 	u64 physical;
 	u64 length = 0;
 	u64 map_length;
 	struct bio_vec *bvec;
+	struct multi_bio *multi = NULL;
 	int i;
 	int ret;
+	int dev_nr = 0;
+	int total_devs = 1;
 
 	bio_for_each_segment(bvec, bio, i) {
 		length += bvec->bv_len;
 	}
+
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
-	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
-	if (map_length < length) {
-		printk("mapping failed logical %Lu bio len %Lu physical %Lu "
-		       "len %Lu\n", logical, length, physical, map_length);
-		BUG();
+	while(dev_nr < total_devs) {
+		ret = btrfs_map_block(map_tree, rw, dev_nr, logical,
+				      &physical, &map_length, &dev,
+				      &total_devs);
+		if (map_length < length) {
+			printk("mapping failed logical %Lu bio len %Lu physical %Lu "
+			       "len %Lu\n", logical, length, physical, map_length);
+			BUG();
+		}
+		BUG_ON(map_length < length);
+		if (total_devs > 1) {
+			if (!multi) {
+				multi = kmalloc(sizeof(*multi), GFP_NOFS);
+				atomic_set(&multi->stripes, 1);
+				multi->end_io = bio->bi_end_io;
+				multi->private = first_bio->bi_private;
+				multi->error = 0;
+			} else {
+				atomic_inc(&multi->stripes);
+			}
+			if (dev_nr < total_devs - 1) {
+				bio = bio_clone(first_bio, GFP_NOFS);
+				BUG_ON(!bio);
+			} else {
+				bio = first_bio;
+			}
+			bio->bi_private = multi;
+			bio->bi_end_io = end_bio_multi_stripe;
+		}
+		bio->bi_sector = physical >> 9;
+		bio->bi_bdev = dev->bdev;
+		spin_lock(&dev->io_lock);
+		dev->total_ios++;
+		spin_unlock(&dev->io_lock);
+		submit_bio(rw, bio);
+		dev_nr++;
 	}
-	BUG_ON(map_length < length);
-	bio->bi_sector = physical >> 9;
-	bio->bi_bdev = dev->bdev;
-	submit_bio(rw, bio);
 	return 0;
 }
 
@@ -982,6 +1078,8 @@ static int read_one_dev(struct btrfs_root *root,
 			return -ENOMEM;
 		list_add(&device->dev_list,
 			 &root->fs_info->fs_devices->devices);
+		device->total_ios = 0;
+		spin_lock_init(&device->io_lock);
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
-- 
cgit v1.2.3


From 611f0e00a27fe0e5a571194a12443ecdc99a43ef Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Apr 2008 16:29:03 -0400
Subject: Btrfs: Add support for duplicate blocks on a single spindle

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bc3c0b97588..b9294e3c05f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -627,6 +627,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct extent_map *em;
 	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
+	u64 min_free = calc_size;
 	u64 avail;
 	u64 max_avail = 0;
 	int num_stripes = 1;
@@ -641,6 +642,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID0))
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
+	if (type & (BTRFS_BLOCK_GROUP_DUP))
+		num_stripes = 2;
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
 				  btrfs_super_num_devices(&info->super_copy));
@@ -649,16 +652,23 @@ again:
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
 	index = 0;
+
+	if (type & BTRFS_BLOCK_GROUP_DUP)
+		min_free = calc_size * 2;
+
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
+
 		avail = device->total_bytes - device->bytes_used;
 		cur = cur->next;
 		if (avail > max_avail)
 			max_avail = avail;
-		if (avail >= calc_size) {
+		if (avail >= min_free) {
 			list_move_tail(&device->dev_list, &private_devs);
 			index++;
+			if (type & BTRFS_BLOCK_GROUP_DUP)
+				index++;
 		}
 		if (cur == dev_list)
 			break;
@@ -689,17 +699,22 @@ again:
 
 	stripes = &chunk->stripe;
 
-	if (type & BTRFS_BLOCK_GROUP_RAID1)
+	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
 		*num_bytes = calc_size;
 	else
 		*num_bytes = calc_size * num_stripes;
 
 	index = 0;
+printk("new chunk type %Lu start %Lu size %Lu\n", type, key.objectid, *num_bytes);
 	while(index < num_stripes) {
 		BUG_ON(list_empty(&private_devs));
 		cur = private_devs.next;
 		device = list_entry(cur, struct btrfs_device, dev_list);
-		list_move_tail(&device->dev_list, dev_list);
+
+		/* loop over this device again if we're doing a dup group */
+		if (!(type & BTRFS_BLOCK_GROUP_DUP) ||
+		    (index == num_stripes - 1))
+			list_move_tail(&device->dev_list, dev_list);
 
 		ret = btrfs_alloc_dev_extent(trans, device,
 					     key.objectid,
@@ -839,6 +854,14 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			}
 			*total_devs = 1;
 		}
+	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+		if (rw == WRITE) {
+			*total_devs = map->num_stripes;
+			stripe_index = dev_nr;
+		} else {
+			stripe_index = 0;
+			*total_devs = 1;
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -851,7 +874,8 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	*phys = map->stripes[stripe_index].physical + stripe_offset +
 		stripe_nr * map->stripe_len;
 
-	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1)) {
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+			 BTRFS_BLOCK_GROUP_DUP)) {
 		/* we limit the length of each bio to what fits in a stripe */
 		*length = min_t(u64, em->len - offset,
 			      map->stripe_len - stripe_offset);
-- 
cgit v1.2.3


From cea9e4452ebaf18dd0951e90dc84d82a5dee40b2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Apr 2008 16:28:12 -0400
Subject: Change btrfs_map_block to return a structure with mappings for all
 stripes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 135 +++++++++++++++++++++++++++++------------------------
 1 file changed, 75 insertions(+), 60 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b9294e3c05f..008d3640e8c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -26,18 +26,6 @@
 #include "print-tree.h"
 #include "volumes.h"
 
-struct stripe {
-	struct btrfs_device *dev;
-	u64 physical;
-};
-
-struct multi_bio {
-	atomic_t stripes;
-	bio_end_io_t *end_io;
-	void *private;
-	int error;
-};
-
 struct map_lookup {
 	u64 type;
 	int io_align;
@@ -45,11 +33,11 @@ struct map_lookup {
 	int stripe_len;
 	int sector_size;
 	int num_stripes;
-	struct stripe stripes[];
+	struct btrfs_bio_stripe stripes[];
 };
 
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
-			    (sizeof(struct stripe) * (n)))
+			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
@@ -801,8 +789,8 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 }
 
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
-		    int dev_nr, u64 logical, u64 *phys, u64 *length,
-		    struct btrfs_device **dev, int *total_devs)
+		    u64 logical, u64 *length,
+		    struct btrfs_multi_bio **multi_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -810,8 +798,21 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	u64 offset;
 	u64 stripe_offset;
 	u64 stripe_nr;
+	int stripes_allocated = 8;
 	int stripe_index;
+	int i;
+	struct btrfs_multi_bio *multi = NULL;
 
+	if (multi_ret && !(rw & (1 << BIO_RW))) {
+		stripes_allocated = 1;
+	}
+again:
+	if (multi_ret) {
+		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
+				GFP_NOFS);
+		if (!multi)
+			return -ENOMEM;
+	}
 
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
@@ -821,6 +822,17 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
+	/* if our multi bio struct is too small, back off and try again */
+	if (multi_ret && (rw & (1 << BIO_RW)) &&
+	    stripes_allocated < map->num_stripes &&
+	    ((map->type & BTRFS_BLOCK_GROUP_RAID1) ||
+	     (map->type & BTRFS_BLOCK_GROUP_DUP))) {
+		stripes_allocated = map->num_stripes;
+		spin_unlock(&em_tree->lock);
+		free_extent_map(em);
+		kfree(multi);
+		goto again;
+	}
 	stripe_nr = offset;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
@@ -834,10 +846,22 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;
 
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+			 BTRFS_BLOCK_GROUP_DUP)) {
+		/* we limit the length of each bio to what fits in a stripe */
+		*length = min_t(u64, em->len - offset,
+			      map->stripe_len - stripe_offset);
+	} else {
+		*length = em->len - offset;
+	}
+	if (!multi_ret)
+		goto out;
+
+	multi->num_stripes = 1;
+	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		stripe_index = dev_nr;
 		if (rw & (1 << BIO_RW))
-			*total_devs = map->num_stripes;
+			multi->num_stripes = map->num_stripes;
 		else {
 			int i;
 			u64 least = (u64)-1;
@@ -852,16 +876,10 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 				}
 				spin_unlock(&cur->io_lock);
 			}
-			*total_devs = 1;
 		}
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		if (rw == WRITE) {
-			*total_devs = map->num_stripes;
-			stripe_index = dev_nr;
-		} else {
-			stripe_index = 0;
-			*total_devs = 1;
-		}
+		if (rw & (1 << BIO_RW))
+			multi->num_stripes = map->num_stripes;
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -871,18 +889,17 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		stripe_index = do_div(stripe_nr, map->num_stripes);
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
-	*phys = map->stripes[stripe_index].physical + stripe_offset +
-		stripe_nr * map->stripe_len;
-
-	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
-			 BTRFS_BLOCK_GROUP_DUP)) {
-		/* we limit the length of each bio to what fits in a stripe */
-		*length = min_t(u64, em->len - offset,
-			      map->stripe_len - stripe_offset);
-	} else {
-		*length = em->len - offset;
+	BUG_ON(stripe_index != 0 && multi->num_stripes > 1);
+
+	for (i = 0; i < multi->num_stripes; i++) {
+		multi->stripes[i].physical =
+			map->stripes[stripe_index].physical + stripe_offset +
+			stripe_nr * map->stripe_len;
+		multi->stripes[i].dev = map->stripes[stripe_index].dev;
+		stripe_index++;
 	}
-	*dev = map->stripes[stripe_index].dev;
+	*multi_ret = multi;
+out:
 	free_extent_map(em);
 	spin_unlock(&em_tree->lock);
 	return 0;
@@ -895,7 +912,7 @@ static int end_bio_multi_stripe(struct bio *bio,
 				   unsigned int bytes_done, int err)
 #endif
 {
-	struct multi_bio *multi = bio->bi_private;
+	struct btrfs_multi_bio *multi = bio->bi_private;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
@@ -904,7 +921,7 @@ static int end_bio_multi_stripe(struct bio *bio,
 	if (err)
 		multi->error = err;
 
-	if (atomic_dec_and_test(&multi->stripes)) {
+	if (atomic_dec_and_test(&multi->stripes_pending)) {
 		bio->bi_private = multi->private;
 		bio->bi_end_io = multi->end_io;
 
@@ -927,11 +944,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 	struct btrfs_device *dev;
 	struct bio *first_bio = bio;
 	u64 logical = bio->bi_sector << 9;
-	u64 physical;
 	u64 length = 0;
 	u64 map_length;
 	struct bio_vec *bvec;
-	struct multi_bio *multi = NULL;
+	struct btrfs_multi_bio *multi = NULL;
 	int i;
 	int ret;
 	int dev_nr = 0;
@@ -943,26 +959,22 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
+
+	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi);
+	BUG_ON(ret);
+
+	total_devs = multi->num_stripes;
+	if (map_length < length) {
+		printk("mapping failed logical %Lu bio len %Lu "
+		       "len %Lu\n", logical, length, map_length);
+		BUG();
+	}
+	multi->end_io = first_bio->bi_end_io;
+	multi->private = first_bio->bi_private;
+	atomic_set(&multi->stripes_pending, multi->num_stripes);
+
 	while(dev_nr < total_devs) {
-		ret = btrfs_map_block(map_tree, rw, dev_nr, logical,
-				      &physical, &map_length, &dev,
-				      &total_devs);
-		if (map_length < length) {
-			printk("mapping failed logical %Lu bio len %Lu physical %Lu "
-			       "len %Lu\n", logical, length, physical, map_length);
-			BUG();
-		}
-		BUG_ON(map_length < length);
 		if (total_devs > 1) {
-			if (!multi) {
-				multi = kmalloc(sizeof(*multi), GFP_NOFS);
-				atomic_set(&multi->stripes, 1);
-				multi->end_io = bio->bi_end_io;
-				multi->private = first_bio->bi_private;
-				multi->error = 0;
-			} else {
-				atomic_inc(&multi->stripes);
-			}
 			if (dev_nr < total_devs - 1) {
 				bio = bio_clone(first_bio, GFP_NOFS);
 				BUG_ON(!bio);
@@ -972,7 +984,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 			bio->bi_private = multi;
 			bio->bi_end_io = end_bio_multi_stripe;
 		}
-		bio->bi_sector = physical >> 9;
+		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
+		dev = multi->stripes[dev_nr].dev;
 		bio->bi_bdev = dev->bdev;
 		spin_lock(&dev->io_lock);
 		dev->total_ios++;
@@ -980,6 +993,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 		submit_bio(rw, bio);
 		dev_nr++;
 	}
+	if (total_devs == 1)
+		kfree(multi);
 	return 0;
 }
 
-- 
cgit v1.2.3


From f188591e987e21b6f7f8864c66a02858b95b530e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Apr 2008 16:28:12 -0400
Subject: Btrfs: Retry metadata reads in the face of checksum failures

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 008d3640e8c..3b927f69832 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -788,9 +788,31 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 	}
 }
 
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	int ret;
+
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
+		ret = map->num_stripes;
+	else
+		ret = 1;
+	free_extent_map(em);
+	spin_unlock(&em_tree->lock);
+	return ret;
+}
+
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
-		    struct btrfs_multi_bio **multi_ret)
+		    struct btrfs_multi_bio **multi_ret, int mirror_num)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -822,6 +844,9 @@ again:
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
+	if (mirror_num > map->num_stripes)
+		mirror_num = 0;
+
 	/* if our multi bio struct is too small, back off and try again */
 	if (multi_ret && (rw & (1 << BIO_RW)) &&
 	    stripes_allocated < map->num_stripes &&
@@ -862,7 +887,9 @@ again:
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
 		if (rw & (1 << BIO_RW))
 			multi->num_stripes = map->num_stripes;
-		else {
+		else if (mirror_num) {
+			stripe_index = mirror_num - 1;
+		} else {
 			int i;
 			u64 least = (u64)-1;
 			struct btrfs_device *cur;
@@ -880,6 +907,8 @@ again:
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
 			multi->num_stripes = map->num_stripes;
+		else if (mirror_num)
+			stripe_index = mirror_num - 1;
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -938,7 +967,8 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 }
 
-int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+		  int mirror_num)
 {
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
@@ -960,7 +990,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 
-	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi);
+	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
+			      mirror_num);
 	BUG_ON(ret);
 
 	total_devs = multi->num_stripes;
-- 
cgit v1.2.3


From f29844623de29a12358d7fba35d0959465b64adf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 10 Apr 2008 16:19:33 -0400
Subject: Btrfs: Write out all super blocks on commit, and bring back proper
 barrier support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3b927f69832..07d43553141 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,7 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		device->devid = devid;
+		device->barriers = 1;
 		device->name = kstrdup(path, GFP_NOFS);
 		if (!device->name) {
 			kfree(device);
@@ -208,6 +209,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 	struct buffer_head *bh;
 	int ret;
 	u64 devid;
+	u64 transid;
 
 	mutex_lock(&uuid_mutex);
 
@@ -236,14 +238,14 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 		goto error_brelse;
 	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
-	printk("found device %Lu on %s\n", devid, path);
+	transid = btrfs_super_generation(disk_super);
+	printk("found device %Lu transid %Lu on %s\n", devid, transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
 error_brelse:
 	brelse(bh);
 error_close:
 	close_bdev_excl(bdev);
-	printk("scan one closes bdev %s\n", path);
 error:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -1143,7 +1145,7 @@ static int read_one_dev(struct btrfs_root *root,
 	device = btrfs_find_device(root, devid);
 	if (!device) {
 		printk("warning devid %Lu not found already\n", devid);
-		device = kmalloc(sizeof(*device), GFP_NOFS);
+		device = kzalloc(sizeof(*device), GFP_NOFS);
 		if (!device)
 			return -ENOMEM;
 		list_add(&device->dev_list,
-- 
cgit v1.2.3


From 73f61b2a6459df982cb4faf0e5cf5ac8b153aaff Mon Sep 17 00:00:00 2001
From: Miguel <miguel.filipe@gmail.com>
Date: Fri, 11 Apr 2008 15:50:59 -0400
Subject: Btrfs: bio_endio support for linux 2.6.23 and older.

bio_endio() changed its prototype in Linux 2.6.24; support older kernels
by using the older prototype.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 07d43553141..82bc6cfc110 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -960,7 +960,11 @@ static int end_bio_multi_stripe(struct bio *bio,
 			err = multi->error;
 		kfree(multi);
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+		bio_endio(bio, bio->bi_size, err);
+#else
 		bio_endio(bio, err);
+#endif
 	} else {
 		bio_put(bio);
 	}
-- 
cgit v1.2.3


From b248a4152956cbae1b23f4c70ef5b51d6ea2ecfb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 14 Apr 2008 09:48:18 -0400
Subject: Btrfs: A few updates for 2.6.18 and versions older than 2.6.25

This includes fixing a missing spinlock init call that caused oops on mount
for most kernels other than 2.6.25.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 82bc6cfc110..f81519f0e4a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -126,6 +126,7 @@ static int device_list_add(const char *path,
 		}
 		device->devid = devid;
 		device->barriers = 1;
+		spin_lock_init(&device->io_lock);
 		device->name = kstrdup(path, GFP_NOFS);
 		if (!device->name) {
 			kfree(device);
@@ -759,8 +760,8 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, c
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
 	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
-	BUG_ON(ret);
 	spin_unlock(&em_tree->lock);
+	BUG_ON(ret);
 	free_extent_map(em);
 	return ret;
 }
@@ -799,6 +800,7 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, len);
+	spin_unlock(&em_tree->lock);
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -808,7 +810,6 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	else
 		ret = 1;
 	free_extent_map(em);
-	spin_unlock(&em_tree->lock);
 	return ret;
 }
 
@@ -840,6 +841,7 @@ again:
 
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
+	spin_unlock(&em_tree->lock);
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -855,7 +857,6 @@ again:
 	    ((map->type & BTRFS_BLOCK_GROUP_RAID1) ||
 	     (map->type & BTRFS_BLOCK_GROUP_DUP))) {
 		stripes_allocated = map->num_stripes;
-		spin_unlock(&em_tree->lock);
 		free_extent_map(em);
 		kfree(multi);
 		goto again;
@@ -932,7 +933,6 @@ again:
 	*multi_ret = multi;
 out:
 	free_extent_map(em);
-	spin_unlock(&em_tree->lock);
 	return 0;
 }
 
@@ -1060,16 +1060,15 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	length = key->offset;
 	spin_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+	spin_unlock(&map_tree->map_tree.lock);
 
 	/* already mapped? */
 	if (em && em->start <= logical && em->start + em->len > logical) {
 		free_extent_map(em);
-		spin_unlock(&map_tree->map_tree.lock);
 		return 0;
 	} else if (em) {
 		free_extent_map(em);
 	}
-	spin_unlock(&map_tree->map_tree.lock);
 
 	map = kzalloc(sizeof(*map), GFP_NOFS);
 	if (!map)
@@ -1110,8 +1109,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 
 	spin_lock(&map_tree->map_tree.lock);
 	ret = add_extent_mapping(&map_tree->map_tree, em);
-	BUG_ON(ret);
 	spin_unlock(&map_tree->map_tree.lock);
+	BUG_ON(ret);
 	free_extent_map(em);
 
 	return 0;
@@ -1154,7 +1153,7 @@ static int read_one_dev(struct btrfs_root *root,
 			return -ENOMEM;
 		list_add(&device->dev_list,
 			 &root->fs_info->fs_devices->devices);
-		device->total_ios = 0;
+		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 	}
 
-- 
cgit v1.2.3


From e17cade25ff8074101d653557a78df09c16ca276 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 15 Apr 2008 15:41:47 -0400
Subject: Btrfs: Add chunk uuids and update multi-device back references

Block headers now store the chunk tree uuid

Chunk items record the device uuid for each stripe

Device extent items record better back refs to the chunk tree

Block groups record better back refs to the chunk tree

The chunk tree format has also changed.  The objectid of BTRFS_CHUNK_ITEM_KEY
used to be the logical offset of the chunk.  Now it is a chunk tree id,
with the logical offset being stored in the offset field of the key.

This allows a single chunk tree to record multiple logical address spaces,
upping the number of bytes indexed by a chunk tree from 2^64 to
2^128.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 76 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 50 insertions(+), 26 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f81519f0e4a..23ebd95b25e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -180,7 +180,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		bdev = open_bdev_excl(device->name, flags, holder);
-printk("opening %s devid %Lu\n", device->name, device->devid);
+
 		if (IS_ERR(bdev)) {
 			printk("open %s failed\n", device->name);
 			ret = PTR_ERR(bdev);
@@ -190,7 +190,6 @@ printk("opening %s devid %Lu\n", device->name, device->devid);
 			fs_devices->latest_bdev = bdev;
 		if (device->devid == fs_devices->lowest_devid) {
 			fs_devices->lowest_bdev = bdev;
-printk("lowest bdev %s\n", device->name);
 		}
 		device->bdev = bdev;
 	}
@@ -372,7 +371,9 @@ error:
 
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
-			   u64 owner, u64 num_bytes, u64 *start)
+			   u64 chunk_tree, u64 chunk_objectid,
+			   u64 chunk_offset,
+			   u64 num_bytes, u64 *start)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -400,7 +401,14 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	extent = btrfs_item_ptr(leaf, path->slots[0],
 				struct btrfs_dev_extent);
-	btrfs_set_dev_extent_owner(leaf, extent, owner);
+	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
+	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
+	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
+		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
+		    BTRFS_UUID_SIZE);
+
 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
 	btrfs_mark_buffer_dirty(leaf);
 err:
@@ -408,17 +416,18 @@ err:
 	return ret;
 }
 
-static int find_next_chunk(struct btrfs_root *root, u64 *objectid)
+static int find_next_chunk(struct btrfs_root *root, u64 objectid, u64 *offset)
 {
 	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
+	struct btrfs_chunk *chunk;
 	struct btrfs_key found_key;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	key.objectid = (u64)-1;
+	key.objectid = objectid;
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
@@ -430,11 +439,18 @@ static int find_next_chunk(struct btrfs_root *root, u64 *objectid)
 
 	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
 	if (ret) {
-		*objectid = 0;
+		*offset = 0;
 	} else {
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 				      path->slots[0]);
-		*objectid = found_key.objectid + found_key.offset;
+		if (found_key.objectid != objectid)
+			*offset = 0;
+		else {
+			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					       struct btrfs_chunk);
+			*offset = found_key.offset +
+				btrfs_chunk_length(path->nodes[0], chunk);
+		}
 	}
 	ret = 0;
 error:
@@ -520,9 +536,12 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
 	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+	btrfs_set_device_group(leaf, dev_item, 0);
+	btrfs_set_device_seek_speed(leaf, dev_item, 0);
+	btrfs_set_device_bandwidth(leaf, dev_item, 0);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
-	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
+	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 	ret = 0;
 
@@ -674,7 +693,10 @@ again:
 		return -ENOSPC;
 	}
 
-	ret = find_next_chunk(chunk_root, &key.objectid);
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+			      &key.offset);
 	if (ret)
 		return ret;
 
@@ -696,8 +718,9 @@ again:
 		*num_bytes = calc_size * num_stripes;
 
 	index = 0;
-printk("new chunk type %Lu start %Lu size %Lu\n", type, key.objectid, *num_bytes);
+printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 	while(index < num_stripes) {
+		struct btrfs_stripe *stripe;
 		BUG_ON(list_empty(&private_devs));
 		cur = private_devs.next;
 		device = list_entry(cur, struct btrfs_device, dev_list);
@@ -708,26 +731,28 @@ printk("new chunk type %Lu start %Lu size %Lu\n", type, key.objectid, *num_bytes
 			list_move_tail(&device->dev_list, dev_list);
 
 		ret = btrfs_alloc_dev_extent(trans, device,
-					     key.objectid,
-					     calc_size, &dev_offset);
+			     info->chunk_root->root_key.objectid,
+			     BTRFS_FIRST_CHUNK_TREE_OBJECTID, key.offset,
+			     calc_size, &dev_offset);
 		BUG_ON(ret);
-printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, calc_size, device->devid, type);
+printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, calc_size, device->devid, type);
 		device->bytes_used += calc_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
 
 		map->stripes[index].dev = device;
 		map->stripes[index].physical = dev_offset;
-		btrfs_set_stack_stripe_devid(stripes + index, device->devid);
-		btrfs_set_stack_stripe_offset(stripes + index, dev_offset);
+		stripe = stripes + index;
+		btrfs_set_stack_stripe_devid(stripe, device->devid);
+		btrfs_set_stack_stripe_offset(stripe, dev_offset);
+		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
 		physical = dev_offset;
 		index++;
 	}
 	BUG_ON(!list_empty(&private_devs));
 
-	/* key.objectid was set above */
-	key.offset = *num_bytes;
-	key.type = BTRFS_CHUNK_ITEM_KEY;
+	/* key was set above */
+	btrfs_set_stack_chunk_length(chunk, *num_bytes);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
 	btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
 	btrfs_set_stack_chunk_type(chunk, type);
@@ -745,14 +770,14 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, c
 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
 				btrfs_chunk_item_size(num_stripes));
 	BUG_ON(ret);
-	*start = key.objectid;
+	*start = key.offset;;
 
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
 	em->bdev = (struct block_device *)map;
-	em->start = key.objectid;
-	em->len = key.offset;
+	em->start = key.offset;
+	em->len = *num_bytes;
 	em->block_start = 0;
 
 	kfree(chunk);
@@ -1056,8 +1081,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	int ret;
 	int i;
 
-	logical = key->objectid;
-	length = key->offset;
+	logical = key->offset;
+	length = btrfs_chunk_length(leaf, chunk);
 	spin_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
 	spin_unlock(&map_tree->map_tree.lock);
@@ -1131,7 +1156,7 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
-	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
+	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 
 	return 0;
 }
@@ -1143,7 +1168,6 @@ static int read_one_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
-
 	devid = btrfs_device_id(leaf, dev_item);
 	device = btrfs_find_device(root, devid);
 	if (!device) {
-- 
cgit v1.2.3


From 321aecc65671ae8136bd2ca6879b56f0221f8ac8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Apr 2008 10:49:51 -0400
Subject: Btrfs: Add RAID10 support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 46 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 23ebd95b25e..e6417a573d4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -33,6 +33,7 @@ struct map_lookup {
 	int stripe_len;
 	int sector_size;
 	int num_stripes;
+	int sub_stripes;
 	struct btrfs_bio_stripe stripes[];
 };
 
@@ -641,6 +642,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 avail;
 	u64 max_avail = 0;
 	int num_stripes = 1;
+	int sub_stripes = 0;
 	int looped = 0;
 	int ret;
 	int index;
@@ -658,6 +660,13 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		num_stripes = min_t(u64, 2,
 				  btrfs_super_num_devices(&info->super_copy));
 	}
+	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		if (num_stripes < 4)
+			return -ENOSPC;
+		num_stripes &= ~(u32)1;
+		sub_stripes = 2;
+	}
 again:
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
@@ -714,6 +723,8 @@ again:
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
 		*num_bytes = calc_size;
+	else if (type & BTRFS_BLOCK_GROUP_RAID10)
+		*num_bytes = calc_size * num_stripes / sub_stripes;
 	else
 		*num_bytes = calc_size * num_stripes;
 
@@ -760,12 +771,14 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, cal
 	btrfs_set_stack_chunk_io_align(chunk, stripe_len);
 	btrfs_set_stack_chunk_io_width(chunk, stripe_len);
 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
 	map->sector_size = extent_root->sectorsize;
 	map->stripe_len = stripe_len;
 	map->io_align = stripe_len;
 	map->io_width = stripe_len;
 	map->type = type;
 	map->num_stripes = num_stripes;
+	map->sub_stripes = sub_stripes;
 
 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
 				btrfs_chunk_item_size(num_stripes));
@@ -832,6 +845,8 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	map = (struct map_lookup *)em->bdev;
 	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
 		ret = map->num_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+		ret = map->sub_stripes;
 	else
 		ret = 1;
 	free_extent_map(em);
@@ -849,6 +864,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	u64 stripe_offset;
 	u64 stripe_nr;
 	int stripes_allocated = 8;
+	int stripes_required = 1;
 	int stripe_index;
 	int i;
 	struct btrfs_multi_bio *multi = NULL;
@@ -877,10 +893,16 @@ again:
 		mirror_num = 0;
 
 	/* if our multi bio struct is too small, back off and try again */
-	if (multi_ret && (rw & (1 << BIO_RW)) &&
-	    stripes_allocated < map->num_stripes &&
-	    ((map->type & BTRFS_BLOCK_GROUP_RAID1) ||
-	     (map->type & BTRFS_BLOCK_GROUP_DUP))) {
+	if (rw & (1 << BIO_RW)) {
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+				 BTRFS_BLOCK_GROUP_DUP)) {
+			stripes_required = map->num_stripes;
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+			stripes_required = map->sub_stripes;
+		}
+	}
+	if (multi_ret && rw == WRITE &&
+	    stripes_allocated < stripes_required) {
 		stripes_allocated = map->num_stripes;
 		free_extent_map(em);
 		kfree(multi);
@@ -900,6 +922,7 @@ again:
 	stripe_offset = offset - stripe_offset;
 
 	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+			 BTRFS_BLOCK_GROUP_RAID10 |
 			 BTRFS_BLOCK_GROUP_DUP)) {
 		/* we limit the length of each bio to what fits in a stripe */
 		*length = min_t(u64, em->len - offset,
@@ -937,6 +960,19 @@ again:
 			multi->num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+		int factor = map->num_stripes / map->sub_stripes;
+		int orig_stripe_nr = stripe_nr;
+
+		stripe_index = do_div(stripe_nr, factor);
+		stripe_index *= map->sub_stripes;
+
+		if (rw & (1 << BIO_RW))
+			multi->num_stripes = map->sub_stripes;
+		else if (mirror_num)
+			stripe_index += mirror_num - 1;
+		else
+			stripe_index += orig_stripe_nr % map->sub_stripes;
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -946,7 +982,6 @@ again:
 		stripe_index = do_div(stripe_nr, map->num_stripes);
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
-	BUG_ON(stripe_index != 0 && multi->num_stripes > 1);
 
 	for (i = 0; i < multi->num_stripes; i++) {
 		multi->stripes[i].physical =
@@ -1120,6 +1155,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
 	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
 	map->type = btrfs_chunk_type(leaf, chunk);
+	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
 	for (i = 0; i < num_stripes; i++) {
 		map->stripes[i].physical =
 			btrfs_stripe_offset_nr(leaf, chunk, i);
-- 
cgit v1.2.3


From 3b951516ed703af0f6d82053937655ad69b60864 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Apr 2008 11:29:12 -0400
Subject: Btrfs: Use the extent map cache to find the logical disk block during
 data retries

The data read retry code needs to find the logical disk block before it
can resubmit new bios.  However, that lookup must not take the fs_mutex,
because doing so would deadlock with a number of different callers.

This changes the retry code to use the extent map cache instead, but
that requires the extent map cache to contain the extent we're looking for.
This is a problem because btrfs_drop_extent_cache drops the entire
extent instead of only the small range it is invalidating.

The bulk of the code in this patch changes btrfs_drop_extent_cache to
invalidate only a portion of the extent cache, and changes btrfs_get_extent
to deal with the results.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e6417a573d4..0e658c1d821 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -883,6 +883,9 @@ again:
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
 	spin_unlock(&em_tree->lock);
+	if (!em) {
+		printk("unable to find logical %Lu\n", logical);
+	}
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
-- 
cgit v1.2.3


From 7bf3b490dfa57bdef5892dffa5021469a0cd2b7f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Apr 2008 11:58:30 -0400
Subject: Btrfs: Avoid 64 bit div for RAID10

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0e658c1d821..c11b6fd408a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -724,7 +724,7 @@ again:
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
 		*num_bytes = calc_size;
 	else if (type & BTRFS_BLOCK_GROUP_RAID10)
-		*num_bytes = calc_size * num_stripes / sub_stripes;
+		*num_bytes = calc_size * (num_stripes / sub_stripes);
 	else
 		*num_bytes = calc_size * num_stripes;
 
-- 
cgit v1.2.3


From a443755f1ca3e190e12e3a845ddecb3ee1782512 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 10:29:38 -0400
Subject: Btrfs: Check device uuids along with devids

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c11b6fd408a..cdf0019cca2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -69,15 +69,18 @@ int btrfs_cleanup_fs_uuids(void)
 	return 0;
 }
 
-static struct btrfs_device *__find_device(struct list_head *head, u64 devid)
+static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
+					  u8 *uuid)
 {
 	struct btrfs_device *dev;
 	struct list_head *cur;
 
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
-		if (dev->devid == devid)
+		if (dev->devid == devid &&
+		    !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE)) {
 			return dev;
+		}
 	}
 	return NULL;
 }
@@ -117,7 +120,8 @@ static int device_list_add(const char *path,
 		fs_devices->num_devices = 0;
 		device = NULL;
 	} else {
-		device = __find_device(&fs_devices->devices, devid);
+		device = __find_device(&fs_devices->devices, devid,
+				       disk_super->dev_item.uuid);
 	}
 	if (!device) {
 		device = kzalloc(sizeof(*device), GFP_NOFS);
@@ -126,6 +130,8 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		device->devid = devid;
+		memcpy(device->uuid, disk_super->dev_item.uuid,
+		       BTRFS_UUID_SIZE);
 		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		device->name = kstrdup(path, GFP_NOFS);
@@ -1098,11 +1104,12 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	return 0;
 }
 
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid)
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+				       u8 *uuid)
 {
 	struct list_head *head = &root->fs_info->fs_devices->devices;
 
-	return __find_device(head, devid);
+	return __find_device(head, devid, uuid);
 }
 
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
@@ -1115,6 +1122,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	u64 logical;
 	u64 length;
 	u64 devid;
+	u8 uuid[BTRFS_UUID_SIZE];
 	int num_stripes;
 	int ret;
 	int i;
@@ -1163,7 +1171,10 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		map->stripes[i].physical =
 			btrfs_stripe_offset_nr(leaf, chunk, i);
 		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
-		map->stripes[i].dev = btrfs_find_device(root, devid);
+		read_extent_buffer(leaf, uuid, (unsigned long)
+				   btrfs_stripe_dev_uuid_nr(chunk, i),
+				   BTRFS_UUID_SIZE);
+		map->stripes[i].dev = btrfs_find_device(root, devid, uuid);
 		if (!map->stripes[i].dev) {
 			kfree(map);
 			free_extent_map(em);
@@ -1207,8 +1218,13 @@ static int read_one_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
+	u8 dev_uuid[BTRFS_UUID_SIZE];
+
 	devid = btrfs_device_id(leaf, dev_item);
-	device = btrfs_find_device(root, devid);
+	read_extent_buffer(leaf, dev_uuid,
+			   (unsigned long)btrfs_device_uuid(dev_item),
+			   BTRFS_UUID_SIZE);
+	device = btrfs_find_device(root, devid, dev_uuid);
 	if (!device) {
 		printk("warning devid %Lu not found already\n", devid);
 		device = kzalloc(sizeof(*device), GFP_NOFS);
-- 
cgit v1.2.3


From 7ae9c09d8f001eb19ee2ba219dc5c3d4f6d60614 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 10:29:49 -0400
Subject: Btrfs: Add support for labels in the super block

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cdf0019cca2..93aa36e2436 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -57,9 +57,7 @@ int btrfs_cleanup_fs_uuids(void)
 			devices_cur = fs_devices->devices.next;
 			dev = list_entry(devices_cur, struct btrfs_device,
 					 dev_list);
-			printk("uuid cleanup finds %s\n", dev->name);
 			if (dev->bdev) {
-				printk("closing\n");
 				close_bdev_excl(dev->bdev);
 			}
 			list_del(&dev->dev_list);
@@ -149,7 +147,6 @@ static int device_list_add(const char *path,
 	}
 	if (fs_devices->lowest_devid > devid) {
 		fs_devices->lowest_devid = devid;
-		printk("lowest devid now %Lu\n", devid);
 	}
 	*fs_devices_ret = fs_devices;
 	return 0;
@@ -166,7 +163,6 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev) {
 			close_bdev_excl(device->bdev);
-			printk("close devices closes %s\n", device->name);
 		}
 		device->bdev = NULL;
 	}
@@ -220,11 +216,9 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 
 	mutex_lock(&uuid_mutex);
 
-	printk("scan one opens %s\n", path);
 	bdev = open_bdev_excl(path, flags, holder);
 
 	if (IS_ERR(bdev)) {
-		printk("open failed\n");
 		ret = PTR_ERR(bdev);
 		goto error;
 	}
@@ -240,13 +234,20 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 	disk_super = (struct btrfs_super_block *)bh->b_data;
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 	    sizeof(disk_super->magic))) {
-		printk("no btrfs found on %s\n", path);
 		ret = -EINVAL;
 		goto error_brelse;
 	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
 	transid = btrfs_super_generation(disk_super);
-	printk("found device %Lu transid %Lu on %s\n", devid, transid, path);
+	if (disk_super->label[0])
+		printk("device label %s ", disk_super->label);
+	else {
+		/* FIXME, make a real uuid parser */
+		printk("device fsid %llx-%llx ",
+		       *(unsigned long long *)disk_super->fsid,
+		       *(unsigned long long *)(disk_super->fsid + 8));
+	}
+	printk("devid %Lu transid %Lu %s\n", devid, transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
 error_brelse:
-- 
cgit v1.2.3


From 9b3f68b90674419add8be1c0aa740dcdf04f44cc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 10:29:51 -0400
Subject: Btrfs: Calculate appropriate chunk sizes for both small and large
 filesystems

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 61 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 93aa36e2436..e3ddd7fb8ed 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -627,6 +627,27 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static u64 div_factor(u64 num, int factor)
+{
+	if (factor == 10)
+		return num;
+	num *= factor;
+	do_div(num, 10);
+	return num;
+}
+
+static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
+			       int sub_stripes)
+{
+	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
+		return calc_size;
+	else if (type & BTRFS_BLOCK_GROUP_RAID10)
+		return calc_size * (num_stripes / sub_stripes);
+	else
+		return calc_size * num_stripes;
+}
+
+
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 *start,
 		      u64 *num_bytes, u64 type)
@@ -643,11 +664,14 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
+	int min_chunk_size = 8 * 1024 * 1024;
 	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
-	u64 min_free = calc_size;
+	u64 max_chunk_size = calc_size;
+	u64 min_free;
 	u64 avail;
 	u64 max_avail = 0;
+	u64 percent_max;
 	int num_stripes = 1;
 	int sub_stripes = 0;
 	int looped = 0;
@@ -666,6 +690,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
 				  btrfs_super_num_devices(&info->super_copy));
+		if (num_stripes < 2)
+			return -ENOSPC;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
@@ -674,13 +700,45 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		num_stripes &= ~(u32)1;
 		sub_stripes = 2;
 	}
+
+	if (type & BTRFS_BLOCK_GROUP_DATA) {
+		max_chunk_size = 10 * calc_size;
+		min_chunk_size = 256 * 1024 * 1024;
+	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+		max_chunk_size = 4 * calc_size;
+		min_chunk_size = 64 * 1024 * 1024;
+	} else {
+		min_chunk_size = 32 * 1024 * 1024;
+	}
+
+	/* we don't want a chunk larger than 10% of the FS */
+	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
+	max_chunk_size = min(percent_max, max_chunk_size);
+
+	if (calc_size * num_stripes > max_chunk_size) {
+		calc_size = max_chunk_size;
+		do_div(calc_size, num_stripes);
+		do_div(calc_size, stripe_len);
+		calc_size *= stripe_len;
+	}
+	/* we don't want tiny stripes */
+	*num_bytes = chunk_bytes_by_type(type, calc_size,
+					 num_stripes, sub_stripes);
+	calc_size = max_t(u64, chunk_bytes_by_type(type, min_chunk_size,
+		          num_stripes, sub_stripes), calc_size);
+
 again:
+	do_div(calc_size, stripe_len);
+	calc_size *= stripe_len;
+
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
 	index = 0;
 
 	if (type & BTRFS_BLOCK_GROUP_DUP)
 		min_free = calc_size * 2;
+	else
+		min_free = calc_size;
 
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
@@ -727,13 +785,9 @@ again:
 	}
 
 	stripes = &chunk->stripe;
+	*num_bytes = chunk_bytes_by_type(type, calc_size,
+					 num_stripes, sub_stripes);
 
-	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
-		*num_bytes = calc_size;
-	else if (type & BTRFS_BLOCK_GROUP_RAID10)
-		*num_bytes = calc_size * (num_stripes / sub_stripes);
-	else
-		*num_bytes = calc_size * num_stripes;
 
 	index = 0;
 printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
-- 
cgit v1.2.3


From a40a90a0420abd5ff86a0917facd3293ebb6a9b6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 11:55:51 -0400
Subject: Btrfs: Fix chunk allocation when some devices don't have enough room
 for stripes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e3ddd7fb8ed..fe5b00986d2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -664,7 +664,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
-	int min_chunk_size = 8 * 1024 * 1024;
+	int min_stripe_size = 1 * 1024 * 1024;
 	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
 	u64 max_chunk_size = calc_size;
@@ -673,6 +673,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 max_avail = 0;
 	u64 percent_max;
 	int num_stripes = 1;
+	int min_stripes = 1;
 	int sub_stripes = 0;
 	int looped = 0;
 	int ret;
@@ -683,15 +684,20 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	if (list_empty(dev_list))
 		return -ENOSPC;
 
-	if (type & (BTRFS_BLOCK_GROUP_RAID0))
+	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
-	if (type & (BTRFS_BLOCK_GROUP_DUP))
+		min_stripes = 2;
+	}
+	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
 		num_stripes = 2;
+		min_stripes = 2;
+	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
 				  btrfs_super_num_devices(&info->super_copy));
 		if (num_stripes < 2)
 			return -ENOSPC;
+		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
@@ -699,22 +705,26 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			return -ENOSPC;
 		num_stripes &= ~(u32)1;
 		sub_stripes = 2;
+		min_stripes = 4;
 	}
 
 	if (type & BTRFS_BLOCK_GROUP_DATA) {
 		max_chunk_size = 10 * calc_size;
-		min_chunk_size = 256 * 1024 * 1024;
+		min_stripe_size = 64 * 1024 * 1024;
 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
 		max_chunk_size = 4 * calc_size;
-		min_chunk_size = 64 * 1024 * 1024;
-	} else {
-		min_chunk_size = 32 * 1024 * 1024;
+		min_stripe_size = 32 * 1024 * 1024;
+	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		calc_size = 8 * 1024 * 1024;
+		max_chunk_size = calc_size * 2;
+		min_stripe_size = 1 * 1024 * 1024;
 	}
 
 	/* we don't want a chunk larger than 10% of the FS */
 	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
 	max_chunk_size = min(percent_max, max_chunk_size);
 
+again:
 	if (calc_size * num_stripes > max_chunk_size) {
 		calc_size = max_chunk_size;
 		do_div(calc_size, num_stripes);
@@ -722,12 +732,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		calc_size *= stripe_len;
 	}
 	/* we don't want tiny stripes */
-	*num_bytes = chunk_bytes_by_type(type, calc_size,
-					 num_stripes, sub_stripes);
-	calc_size = max_t(u64, chunk_bytes_by_type(type, min_chunk_size,
-		          num_stripes, sub_stripes), calc_size);
+	calc_size = max_t(u64, min_stripe_size, calc_size);
 
-again:
 	do_div(calc_size, stripe_len);
 	calc_size *= stripe_len;
 
@@ -746,19 +752,27 @@ again:
 
 		avail = device->total_bytes - device->bytes_used;
 		cur = cur->next;
-		if (avail > max_avail)
-			max_avail = avail;
 		if (avail >= min_free) {
 			list_move_tail(&device->dev_list, &private_devs);
 			index++;
 			if (type & BTRFS_BLOCK_GROUP_DUP)
 				index++;
-		}
+		} else if (avail > max_avail)
+			max_avail = avail;
 		if (cur == dev_list)
 			break;
 	}
 	if (index < num_stripes) {
 		list_splice(&private_devs, dev_list);
+		if (index >= min_stripes) {
+			num_stripes = index;
+			if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+				num_stripes /= sub_stripes;
+				num_stripes *= sub_stripes;
+			}
+			looped = 1;
+			goto again;
+		}
 		if (!looped && max_avail > 0) {
 			looped = 1;
 			calc_size = max_avail;
@@ -766,7 +780,6 @@ again:
 		}
 		return -ENOSPC;
 	}
-
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-- 
cgit v1.2.3


From ad5bd91ecee2b687071da00094564fd075f934ee Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 21 Apr 2008 08:28:10 -0400
Subject: Btrfs: Add 1MB to the min_free in alloc_chunk

This properly reflects the first 1MB we skip at the start of the device

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fe5b00986d2..4fc92cf813d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -746,6 +746,9 @@ again:
 	else
 		min_free = calc_size;
 
+	/* we add 1MB because we never use the first 1MB of the device */
+	min_free += 1024 * 1024;
+
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
-- 
cgit v1.2.3


From f2d8d74d7874f8f81222363cd6459a365796e35a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 21 Apr 2008 10:03:05 -0400
Subject: Btrfs: Make an unplug function that doesn't unplug every spindle

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 79 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 22 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4fc92cf813d..46024070650 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 #include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
@@ -930,9 +931,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	return ret;
 }
 
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
-		    u64 logical, u64 *length,
-		    struct btrfs_multi_bio **multi_ret, int mirror_num)
+static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+			     u64 logical, u64 *length,
+			     struct btrfs_multi_bio **multi_ret,
+			     int mirror_num, struct page *unplug_page)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -944,6 +946,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int stripes_required = 1;
 	int stripe_index;
 	int i;
+	int num_stripes;
 	struct btrfs_multi_bio *multi = NULL;
 
 	if (multi_ret && !(rw & (1 << BIO_RW))) {
@@ -960,10 +963,14 @@ again:
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
 	spin_unlock(&em_tree->lock);
+
+	if (!em && unplug_page)
+		return 0;
+
 	if (!em) {
 		printk("unable to find logical %Lu\n", logical);
+		BUG();
 	}
-	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
 	map = (struct map_lookup *)em->bdev;
@@ -1010,14 +1017,15 @@ again:
 	} else {
 		*length = em->len - offset;
 	}
-	if (!multi_ret)
+
+	if (!multi_ret && !unplug_page)
 		goto out;
 
-	multi->num_stripes = 1;
+	num_stripes = 1;
 	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		if (rw & (1 << BIO_RW))
-			multi->num_stripes = map->num_stripes;
+		if (unplug_page || (rw & (1 << BIO_RW)))
+			num_stripes = map->num_stripes;
 		else if (mirror_num) {
 			stripe_index = mirror_num - 1;
 		} else {
@@ -1037,7 +1045,7 @@ again:
 		}
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
-			multi->num_stripes = map->num_stripes;
+			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
@@ -1047,8 +1055,8 @@ again:
 		stripe_index = do_div(stripe_nr, factor);
 		stripe_index *= map->sub_stripes;
 
-		if (rw & (1 << BIO_RW))
-			multi->num_stripes = map->sub_stripes;
+		if (unplug_page || (rw & (1 << BIO_RW)))
+			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
 		else
@@ -1063,19 +1071,50 @@ again:
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
 
-	for (i = 0; i < multi->num_stripes; i++) {
-		multi->stripes[i].physical =
-			map->stripes[stripe_index].physical + stripe_offset +
-			stripe_nr * map->stripe_len;
-		multi->stripes[i].dev = map->stripes[stripe_index].dev;
+	for (i = 0; i < num_stripes; i++) {
+		if (unplug_page) {
+			struct btrfs_device *device;
+			struct backing_dev_info *bdi;
+
+			device = map->stripes[stripe_index].dev;
+			bdi = blk_get_backing_dev_info(device->bdev);
+			if (bdi->unplug_io_fn) {
+				bdi->unplug_io_fn(bdi, unplug_page);
+			}
+		} else {
+			multi->stripes[i].physical =
+				map->stripes[stripe_index].physical +
+				stripe_offset + stripe_nr * map->stripe_len;
+			multi->stripes[i].dev = map->stripes[stripe_index].dev;
+		}
 		stripe_index++;
 	}
-	*multi_ret = multi;
+	if (multi_ret) {
+		*multi_ret = multi;
+		multi->num_stripes = num_stripes;
+	}
 out:
 	free_extent_map(em);
 	return 0;
 }
 
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+		      u64 logical, u64 *length,
+		      struct btrfs_multi_bio **multi_ret, int mirror_num)
+{
+	return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
+				 mirror_num, NULL);
+}
+
+int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
+		      u64 logical, struct page *page)
+{
+	u64 length = PAGE_CACHE_SIZE;
+	return __btrfs_map_block(map_tree, READ, logical, &length,
+				 NULL, 0, page);
+}
+
+
 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_bio_multi_stripe(struct bio *bio, int err)
 #else
@@ -1122,16 +1161,12 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	u64 logical = bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
-	struct bio_vec *bvec;
 	struct btrfs_multi_bio *multi = NULL;
-	int i;
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
 
-	bio_for_each_segment(bvec, bio, i) {
-		length += bvec->bv_len;
-	}
+	length = bio->bi_size;
 
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
-- 
cgit v1.2.3


From 3c12ac7205d4bd679fefa722aa9b61385e4b4749 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 21 Apr 2008 12:01:38 -0400
Subject: Btrfs: Simplify device selection for mirrored reads

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 46024070650..107fc74c3ab 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1029,19 +1029,8 @@ again:
 		else if (mirror_num) {
 			stripe_index = mirror_num - 1;
 		} else {
-			int i;
-			u64 least = (u64)-1;
-			struct btrfs_device *cur;
-
-			for (i = 0; i < map->num_stripes; i++) {
-				cur = map->stripes[i].dev;
-				spin_lock(&cur->io_lock);
-				if (cur->total_ios < least) {
-					least = cur->total_ios;
-					stripe_index = i;
-				}
-				spin_unlock(&cur->io_lock);
-			}
+			u64 orig_stripe_nr = stripe_nr;
+			stripe_index = do_div(orig_stripe_nr, num_stripes);
 		}
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
@@ -1050,7 +1039,6 @@ again:
 			stripe_index = mirror_num - 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
-		int orig_stripe_nr = stripe_nr;
 
 		stripe_index = do_div(stripe_nr, factor);
 		stripe_index *= map->sub_stripes;
@@ -1059,8 +1047,11 @@ again:
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
-		else
-			stripe_index += orig_stripe_nr % map->sub_stripes;
+		else {
+			u64 orig_stripe_nr = stripe_nr;
+			stripe_index += do_div(orig_stripe_nr,
+					       map->sub_stripes);
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
-- 
cgit v1.2.3


From b30757178dad19a0388d958ff9eea66e674d39ed Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Apr 2008 09:22:07 -0400
Subject: Btrfs: Add a special device list for chunk allocations

This allows other code that needs to walk every device in the FS to do so
without locking against allocations.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 107fc74c3ab..5619e50583e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -111,6 +111,7 @@ static int device_list_add(const char *path,
 		if (!fs_devices)
 			return -ENOMEM;
 		INIT_LIST_HEAD(&fs_devices->devices);
+		INIT_LIST_HEAD(&fs_devices->alloc_list);
 		list_add(&fs_devices->list, &fs_uuids);
 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
@@ -139,6 +140,7 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		list_add(&device->dev_list, &fs_devices->devices);
+		list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
 		fs_devices->num_devices++;
 	}
 
@@ -660,7 +662,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct btrfs_device *device = NULL;
 	struct btrfs_chunk *chunk;
 	struct list_head private_devs;
-	struct list_head *dev_list = &extent_root->fs_info->fs_devices->devices;
+	struct list_head *dev_list;
 	struct list_head *cur;
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
@@ -682,6 +684,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int stripe_len = 64 * 1024;
 	struct btrfs_key key;
 
+	dev_list = &extent_root->fs_info->fs_devices->alloc_list;
 	if (list_empty(dev_list))
 		return -ENOSPC;
 
@@ -752,12 +755,12 @@ again:
 
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 
 		avail = device->total_bytes - device->bytes_used;
 		cur = cur->next;
 		if (avail >= min_free) {
-			list_move_tail(&device->dev_list, &private_devs);
+			list_move_tail(&device->dev_alloc_list, &private_devs);
 			index++;
 			if (type & BTRFS_BLOCK_GROUP_DUP)
 				index++;
@@ -812,12 +815,12 @@ printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 		struct btrfs_stripe *stripe;
 		BUG_ON(list_empty(&private_devs));
 		cur = private_devs.next;
-		device = list_entry(cur, struct btrfs_device, dev_list);
+		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 
 		/* loop over this device again if we're doing a dup group */
 		if (!(type & BTRFS_BLOCK_GROUP_DUP) ||
 		    (index == num_stripes - 1))
-			list_move_tail(&device->dev_list, dev_list);
+			list_move_tail(&device->dev_alloc_list, dev_list);
 
 		ret = btrfs_alloc_dev_extent(trans, device,
 			     info->chunk_root->root_key.objectid,
@@ -1329,6 +1332,8 @@ static int read_one_dev(struct btrfs_root *root,
 			return -ENOMEM;
 		list_add(&device->dev_list,
 			 &root->fs_info->fs_devices->devices);
+		list_add(&device->dev_alloc_list,
+			 &root->fs_info->fs_devices->alloc_list);
 		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 	}
-- 
cgit v1.2.3


From e1c4b7451e22f5b0a9fbccfa560ee7b80c35b8cd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Apr 2008 13:26:46 -0400
Subject: Fix btrfs_get_extent and get_block corner cases, and disable O_DIRECT
 reads

The generic O_DIRECT code assumes all the bios have the same bdev,
which isn't true for multi-device btrfs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5619e50583e..bccb5566fd8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1161,7 +1161,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	int total_devs = 1;
 
 	length = bio->bi_size;
-
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 
@@ -1192,6 +1191,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
 		dev = multi->stripes[dev_nr].dev;
+
 		bio->bi_bdev = dev->bdev;
 		spin_lock(&dev->io_lock);
 		dev->total_ios++;
-- 
cgit v1.2.3


From 84eed90fac1b927a2657ff3bb7a0f18b9cb688f7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 25 Apr 2008 09:04:37 -0400
Subject: Btrfs: Add failure handling for read_sys_array

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bccb5566fd8..c63a982e31d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1365,14 +1365,14 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	struct extent_buffer *sb = root->fs_info->sb_buffer;
 	struct btrfs_disk_key *disk_key;
 	struct btrfs_chunk *chunk;
-	struct btrfs_key key;
+	u8 *ptr;
+	unsigned long sb_ptr;
+	int ret = 0;
 	u32 num_stripes;
 	u32 array_size;
 	u32 len = 0;
-	u8 *ptr;
-	unsigned long sb_ptr;
 	u32 cur;
-	int ret;
+	struct btrfs_key key;
 
 	array_size = btrfs_super_sys_array_size(super_copy);
 
@@ -1397,17 +1397,19 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
 			chunk = (struct btrfs_chunk *)sb_ptr;
 			ret = read_one_chunk(root, &key, sb, chunk);
-			BUG_ON(ret);
+			if (ret)
+				break;
 			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
 			len = btrfs_chunk_item_size(num_stripes);
 		} else {
-			BUG();
+			ret = -EIO;
+			break;
 		}
 		ptr += len;
 		sb_ptr += len;
 		cur += len;
 	}
-	return 0;
+	return ret;
 }
 
 int btrfs_read_chunk_tree(struct btrfs_root *root)
-- 
cgit v1.2.3


From 8f18cf13396caae5a3d7ae91201cfb15181a9642 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 25 Apr 2008 16:53:30 -0400
Subject: Btrfs: Make the resizer work based on shrinking and growing devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 324 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 312 insertions(+), 12 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c63a982e31d..a2c56de1548 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -77,7 +77,7 @@ static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
 		if (dev->devid == devid &&
-		    !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE)) {
+		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
 			return dev;
 		}
 	}
@@ -293,6 +293,10 @@ static int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	 * so we make sure to start at an offset of at least 1MB
 	 */
 	search_start = max((u64)1024 * 1024, search_start);
+
+	if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+		search_start = max(root->fs_info->alloc_start, search_start);
+
 	key.objectid = device->devid;
 	key.offset = search_start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
@@ -380,6 +384,33 @@ error:
 	return ret;
 }
 
+int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+			  struct btrfs_device *device,
+			  u64 start)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_key key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = device->devid;
+	key.offset = start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	BUG_ON(ret);
+
+	ret = btrfs_del_item(trans, root, path);
+	BUG_ON(ret);
+
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 chunk_tree, u64 chunk_objectid,
@@ -560,6 +591,7 @@ out:
 	btrfs_free_path(path);
 	return ret;
 }
+
 int btrfs_update_device(struct btrfs_trans_handle *trans,
 			struct btrfs_device *device)
 {
@@ -606,6 +638,254 @@ out:
 	return ret;
 }
 
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+		      struct btrfs_device *device, u64 new_size)
+{
+	struct btrfs_super_block *sb = &device->dev_root->fs_info->super_copy;
+	u64 diff = new_size - device->total_bytes;
+
+	/* like btrfs_shrink_device: set the new size before updating the item */
+	device->total_bytes = new_size;
+	btrfs_set_super_total_bytes(sb, btrfs_super_total_bytes(sb) + diff);
+	return btrfs_update_device(trans, device);
+}
+
+static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    u64 chunk_tree, u64 chunk_objectid,
+			    u64 chunk_offset)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+
+	root = root->fs_info->chunk_root;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = chunk_objectid;
+	key.offset = chunk_offset;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	BUG_ON(ret);
+
+	ret = btrfs_del_item(trans, root, path);
+	BUG_ON(ret);
+
+	btrfs_free_path(path);
+	return 0;
+}
+
+int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
+			chunk_offset)
+{
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_disk_key *disk_key;
+	struct btrfs_chunk *chunk;
+	u8 *ptr;
+	int ret = 0;
+	u32 num_stripes;
+	u32 array_size;
+	u32 len = 0;
+	u32 cur;
+	struct btrfs_key key;
+
+	array_size = btrfs_super_sys_array_size(super_copy);
+
+	ptr = super_copy->sys_chunk_array;
+	cur = 0;
+
+	while (cur < array_size) {
+		disk_key = (struct btrfs_disk_key *)ptr;
+		btrfs_disk_key_to_cpu(&key, disk_key);
+
+		len = sizeof(*disk_key);
+
+		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+			chunk = (struct btrfs_chunk *)(ptr + len);
+			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
+			len += btrfs_chunk_item_size(num_stripes);
+		} else {
+			ret = -EIO;
+			break;
+		}
+		if (key.objectid == chunk_objectid &&
+		    key.offset == chunk_offset) {
+			memmove(ptr, ptr + len, array_size - (cur + len));
+			array_size -= len;
+			btrfs_set_super_sys_array_size(super_copy, array_size);
+		} else {
+			ptr += len;
+			cur += len;
+		}
+	}
+	return ret;
+}
+
+
+int btrfs_relocate_chunk(struct btrfs_root *root,
+			 u64 chunk_tree, u64 chunk_objectid,
+			 u64 chunk_offset)
+{
+	struct extent_map_tree *em_tree;
+	struct btrfs_root *extent_root;
+	struct btrfs_trans_handle *trans;
+	struct extent_map *em;
+	struct map_lookup *map;
+	int ret;
+	int i;
+
+	root = root->fs_info->chunk_root;
+	extent_root = root->fs_info->extent_root;
+	em_tree = &root->fs_info->mapping_tree.map_tree;
+
+	/* step one, relocate all the extents inside this chunk */
+	ret = btrfs_shrink_extent_tree(extent_root, chunk_offset);
+	BUG_ON(ret);
+
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	/*
+	 * step two, delete the device extents and the
+	 * chunk tree entries
+	 */
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+	spin_unlock(&em_tree->lock);
+
+	BUG_ON(em->start > chunk_offset || em->start + em->len < chunk_offset);
+	map = (struct map_lookup *)em->bdev;
+
+	for (i = 0; i < map->num_stripes; i++) {
+		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
+					    map->stripes[i].physical);
+		BUG_ON(ret);
+	}
+	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
+			       chunk_offset);
+
+	BUG_ON(ret);
+
+	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
+		BUG_ON(ret);
+		goto out;
+	}
+
+
+
+	spin_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	kfree(map);
+	em->bdev = NULL;
+
+	/* once for the tree */
+	free_extent_map(em);
+	spin_unlock(&em_tree->lock);
+
+out:
+	/* once for us */
+	free_extent_map(em);
+
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
+/*
+ * shrinking a device means finding all of the device extents past
+ * the new size, and then following the back refs to the chunks.
+ * The chunk relocation code actually frees the device extent
+ */
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_path *path;
+	u64 length;
+	u64 chunk_tree;
+	u64 chunk_objectid;
+	u64 chunk_offset;
+	int ret;
+	int slot;
+	struct extent_buffer *l;
+	struct btrfs_key key;
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	u64 old_total = btrfs_super_total_bytes(super_copy);
+	u64 diff = device->total_bytes - new_size;
+
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	path->reada = 2;
+
+	device->total_bytes = new_size;
+	ret = btrfs_update_device(trans, device);
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		goto done;
+	}
+	WARN_ON(diff > old_total);
+	btrfs_set_super_total_bytes(super_copy, old_total - diff);
+	btrfs_end_transaction(trans, root);
+
+	key.objectid = device->devid;
+	key.offset = (u64)-1;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto done;
+
+		ret = btrfs_previous_item(root, path, 0, key.type);
+		if (ret < 0)
+			goto done;
+		if (ret) {
+			ret = 0;
+			goto done;
+		}
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
+		if (key.objectid != device->devid)
+			goto done;
+
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		length = btrfs_dev_extent_length(l, dev_extent);
+
+		if (key.offset + length <= new_size)
+			goto done;
+
+		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+		btrfs_release_path(root, path);
+
+		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
+					   chunk_offset);
+		if (ret)
+			goto done;
+	}
+
+done:
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_key *key,
@@ -658,6 +938,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 dev_offset;
 	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	struct btrfs_path *path;
 	struct btrfs_stripe *stripes;
 	struct btrfs_device *device = NULL;
 	struct btrfs_chunk *chunk;
@@ -724,6 +1005,10 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		min_stripe_size = 1 * 1024 * 1024;
 	}
 
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
 	/* we don't want a chunk larger than 10% of the FS */
 	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
 	max_chunk_size = min(percent_max, max_chunk_size);
@@ -759,11 +1044,19 @@ again:
 
 		avail = device->total_bytes - device->bytes_used;
 		cur = cur->next;
+
 		if (avail >= min_free) {
-			list_move_tail(&device->dev_alloc_list, &private_devs);
-			index++;
-			if (type & BTRFS_BLOCK_GROUP_DUP)
+			u64 ignored_start = 0;
+			ret = find_free_dev_extent(trans, device, path,
+						   min_free,
+						   &ignored_start);
+			if (ret == 0) {
+				list_move_tail(&device->dev_alloc_list,
+					       &private_devs);
 				index++;
+				if (type & BTRFS_BLOCK_GROUP_DUP)
+					index++;
+			}
 		} else if (avail > max_avail)
 			max_avail = avail;
 		if (cur == dev_list)
@@ -785,30 +1078,37 @@ again:
 			calc_size = max_avail;
 			goto again;
 		}
+		btrfs_free_path(path);
 		return -ENOSPC;
 	}
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 			      &key.offset);
-	if (ret)
+	if (ret) {
+		btrfs_free_path(path);
 		return ret;
+	}
 
 	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
-	if (!chunk)
+	if (!chunk) {
+		btrfs_free_path(path);
 		return -ENOMEM;
+	}
 
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
 		kfree(chunk);
+		btrfs_free_path(path);
 		return -ENOMEM;
 	}
+	btrfs_free_path(path);
+	path = NULL;
 
 	stripes = &chunk->stripe;
 	*num_bytes = chunk_bytes_by_type(type, calc_size,
 					 num_stripes, sub_stripes);
 
-
 	index = 0;
 printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 	while(index < num_stripes) {
@@ -874,6 +1174,11 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, cal
 	em->len = *num_bytes;
 	em->block_start = 0;
 
+	if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		ret = btrfs_add_system_chunk(trans, chunk_root, &key,
+				    chunk, btrfs_chunk_item_size(num_stripes));
+		BUG_ON(ret);
+	}
 	kfree(chunk);
 
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
@@ -1376,11 +1681,6 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 
 	array_size = btrfs_super_sys_array_size(super_copy);
 
-	/*
-	 * we do this loop twice, once for the device items and
-	 * once for all of the chunks.  This way there are device
-	 * structs filled in for every chunk
-	 */
 	ptr = super_copy->sys_chunk_array;
 	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
 	cur = 0;
-- 
cgit v1.2.3


From 788f20eb5affef584e75ea84bb80a4c3352a2c0e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Apr 2008 15:29:42 -0400
Subject: Btrfs: Add new ioctl to add devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a2c56de1548..b93c15aa17d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -19,6 +19,7 @@
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
+#include <linux/random.h>
 #include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
@@ -592,6 +593,80 @@ out:
 	return ret;
 }
 
+int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_device *device;
+	struct block_device *bdev;
+	struct list_head *cur;
+	struct list_head *devices;
+	u64 total_bytes;
+	int ret = 0;
+
+	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	devices = &root->fs_info->fs_devices->devices;
+	list_for_each(cur, devices) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (device->bdev == bdev) {
+			/* already part of this FS; drop our extra open */
+			ret = -EEXIST;
+			goto out_close_bdev;
+		}
+	}
+
+	device = kzalloc(sizeof(*device), GFP_NOFS);
+	if (!device) {
+		ret = -ENOMEM;
+		goto out_close_bdev;
+	}
+	device->barriers = 1;
+	generate_random_uuid(device->uuid);
+	spin_lock_init(&device->io_lock);
+	device->name = kstrdup(device_path, GFP_NOFS);
+	if (!device->name) {
+		ret = -ENOMEM;
+		goto out_free_device;
+	}
+	device->io_width = root->sectorsize;
+	device->io_align = root->sectorsize;
+	device->sector_size = root->sectorsize;
+	device->total_bytes = i_size_read(bdev->bd_inode);
+	device->dev_root = root->fs_info->dev_root;
+	device->bdev = bdev;
+
+	ret = btrfs_add_device(trans, root, device);
+	if (ret)
+		goto out_free_device;
+
+	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+				    total_bytes + device->total_bytes);
+
+	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
+	btrfs_set_super_num_devices(&root->fs_info->super_copy,
+				    total_bytes + 1);
+
+	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+	list_add(&device->dev_alloc_list,
+		 &root->fs_info->fs_devices->alloc_list);
+	root->fs_info->fs_devices->num_devices++;
+out:
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+
+out_free_device:
+	kfree(device->name);
+	kfree(device);
+out_close_bdev:
+	close_bdev_excl(bdev);
+	goto out;
+}
+
 int btrfs_update_device(struct btrfs_trans_handle *trans,
 			struct btrfs_device *device)
 {
-- 
cgit v1.2.3


From ec44a35cbeb26ab2da84cb280d778260f2312feb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Apr 2008 15:29:52 -0400
Subject: Btrfs: Add balance ioctl to restripe the chunks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 106 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b93c15aa17d..6476ecbf132 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -869,6 +869,107 @@ out:
 	return 0;
 }
 
+static u64 div_factor(u64 num, int factor)
+{
+	if (factor == 10)
+		return num;
+	num *= factor;
+	do_div(num, 10);
+	return num;
+}
+
+
+/*
+ * shrink and regrow every device that is short on free space, then
+ * relocate every chunk except chunk zero.  Returns 0 or a negative errno.
+ */
+int btrfs_balance(struct btrfs_root *dev_root)
+{
+	int ret;
+	struct list_head *cur;
+	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+	struct btrfs_device *device;
+	u64 old_size;
+	u64 size_to_free;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key found_key;
+
+	dev_root = dev_root->fs_info->dev_root;
+
+	mutex_lock(&dev_root->fs_info->fs_mutex);
+	/* step one make some room on all the devices */
+	list_for_each(cur, devices) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		old_size = device->total_bytes;
+		size_to_free = div_factor(old_size, 1);
+		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+		if (device->total_bytes - device->bytes_used > size_to_free)
+			continue;
+
+		ret = btrfs_shrink_device(device, old_size - size_to_free);
+		BUG_ON(ret);
+
+		trans = btrfs_start_transaction(dev_root, 1);
+		BUG_ON(!trans);
+
+		ret = btrfs_grow_device(trans, device, old_size);
+		BUG_ON(ret);
+
+		btrfs_end_transaction(trans, dev_root);
+	}
+
+	/* step two, relocate all the chunks */
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.offset = (u64)-1;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	while(1) {
+		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+		if (ret < 0)
+			goto error;
+
+		/*
+		 * this shouldn't happen, it means the last relocate
+		 * failed
+		 */
+		if (ret == 0)
+			break;
+
+		ret = btrfs_previous_item(chunk_root, path, 0,
+					  BTRFS_CHUNK_ITEM_KEY);
+		if (ret < 0)
+			goto error;
+		if (ret)
+			break;
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		if (found_key.objectid != key.objectid)
+			break;
+		key.offset = found_key.offset;
+		/* chunk zero is special */
+		if (key.offset == 0)
+			break;
+
+		ret = btrfs_relocate_chunk(chunk_root,
+					   chunk_root->root_key.objectid,
+					   found_key.objectid,
+					   found_key.offset);
+		BUG_ON(ret);
+		btrfs_release_path(chunk_root, path);
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	mutex_unlock(&dev_root->fs_info->fs_mutex);
+	return ret;
+}
+
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
@@ -985,15 +1086,6 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static u64 div_factor(u64 num, int factor)
-{
-	if (factor == 10)
-		return num;
-	num *= factor;
-	do_div(num, 10);
-	return num;
-}
-
 static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
 			       int sub_stripes)
 {
@@ -1040,6 +1132,11 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int stripe_len = 64 * 1024;
 	struct btrfs_key key;
 
+	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+	    (type & BTRFS_BLOCK_GROUP_DUP)) {
+		WARN_ON(1);
+		type &= ~BTRFS_BLOCK_GROUP_DUP;
+	}
 	dev_list = &extent_root->fs_info->fs_devices->alloc_list;
 	if (list_empty(dev_list))
 		return -ENOSPC;
-- 
cgit v1.2.3


From 4235298e4fc3c1a09f659cfe2fd285024eeb2241 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Apr 2008 16:40:52 -0400
Subject: Btrfs: Drop some verbose printks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6476ecbf132..9a724113456 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1282,7 +1282,6 @@ again:
 					 num_stripes, sub_stripes);
 
 	index = 0;
-printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 	while(index < num_stripes) {
 		struct btrfs_stripe *stripe;
 		BUG_ON(list_empty(&private_devs));
@@ -1299,7 +1298,6 @@ printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 			     BTRFS_FIRST_CHUNK_TREE_OBJECTID, key.offset,
 			     calc_size, &dev_offset);
 		BUG_ON(ret);
-printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, calc_size, device->devid, type);
 		device->bytes_used += calc_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
-- 
cgit v1.2.3


From a236aed14ccb0661611d4416f6b573d892bdc60a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Apr 2008 09:38:00 -0400
Subject: Btrfs: Deal with failed writes in mirrored configurations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9a724113456..57ab755aca7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1425,6 +1425,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int stripe_index;
 	int i;
 	int num_stripes;
+	int max_errors = 0;
 	struct btrfs_multi_bio *multi = NULL;
 
 	if (multi_ret && !(rw & (1 << BIO_RW))) {
@@ -1436,6 +1437,8 @@ again:
 				GFP_NOFS);
 		if (!multi)
 			return -ENOMEM;
+
+		atomic_set(&multi->error, 0);
 	}
 
 	spin_lock(&em_tree->lock);
@@ -1462,8 +1465,10 @@ again:
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
 			stripes_required = map->num_stripes;
+			max_errors = 1;
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 			stripes_required = map->sub_stripes;
+			max_errors = 1;
 		}
 	}
 	if (multi_ret && rw == WRITE &&
@@ -1561,6 +1566,7 @@ again:
 	if (multi_ret) {
 		*multi_ret = multi;
 		multi->num_stripes = num_stripes;
+		multi->max_errors = max_errors;
 	}
 out:
 	free_extent_map(em);
@@ -1598,14 +1604,19 @@ static int end_bio_multi_stripe(struct bio *bio,
 		return 1;
 #endif
 	if (err)
-		multi->error = err;
+		atomic_inc(&multi->error);
 
 	if (atomic_dec_and_test(&multi->stripes_pending)) {
 		bio->bi_private = multi->private;
 		bio->bi_end_io = multi->end_io;
 
-		if (!err && multi->error)
-			err = multi->error;
+		/* only send an error to the higher layers if it is
+		 * beyond the tolerance of the multi-bio
+		 */
+		if (atomic_read(&multi->error) > multi->max_errors)
+			err = -EIO;
+		else
+			err = 0;
 		kfree(multi);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-- 
cgit v1.2.3


From 2fff734fafa742236aecbdcdc8b1ff4d221cbaca Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Apr 2008 14:12:09 -0400
Subject: Btrfs: Tune stripe selection for raid1 and raid10

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 57ab755aca7..6d8dd4438c7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1509,17 +1509,17 @@ again:
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
 		if (unplug_page || (rw & (1 << BIO_RW)))
 			num_stripes = map->num_stripes;
-		else if (mirror_num) {
+		else if (mirror_num)
 			stripe_index = mirror_num - 1;
-		} else {
-			u64 orig_stripe_nr = stripe_nr;
-			stripe_index = do_div(orig_stripe_nr, num_stripes);
-		}
+		else
+			stripe_index = current->pid % map->num_stripes;
+
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
+
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
 
@@ -1530,11 +1530,8 @@ again:
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
-		else {
-			u64 orig_stripe_nr = stripe_nr;
-			stripe_index += do_div(orig_stripe_nr,
-					       map->sub_stripes);
-		}
+		else
+			stripe_index += current->pid % map->sub_stripes;
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
-- 
cgit v1.2.3


From 006a58a25fe8aaa7559be80dcff94d98aee84309 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 2 May 2008 14:43:15 -0400
Subject: Btrfs: Compile warning fixup in volume.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6d8dd4438c7..b3818757310 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -547,7 +547,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 	unsigned long ptr;
-	u64 free_devid;
+	u64 free_devid = 0;
 
 	root = root->fs_info->chunk_root;
 
-- 
cgit v1.2.3


From a061fc8da7b990faa41ca503e66faef3ecdeead0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 7 May 2008 11:43:44 -0400
Subject: Btrfs: Add support for online device removal

This required a few structural changes to the code that manages bdev pointers:

The VFS super block now gets an anon-bdev instead of a pointer to the
lowest bdev.  This allows us to avoid swapping the super block bdev pointer
around at run time.

The code to read in the super block no longer goes through the extent
buffer interface.  Things got ugly keeping the mapping constant.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 221 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 212 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b3818757310..55da5f0c56e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -45,6 +45,16 @@ struct map_lookup {
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 
+void btrfs_lock_volumes(void)
+{
+	mutex_lock(&uuid_mutex);
+}
+
+void btrfs_unlock_volumes(void)
+{
+	mutex_unlock(&uuid_mutex);
+}
+
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
@@ -193,12 +203,14 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			ret = PTR_ERR(bdev);
 			goto fail;
 		}
+		set_blocksize(bdev, 4096);
 		if (device->devid == fs_devices->latest_devid)
 			fs_devices->latest_bdev = bdev;
 		if (device->devid == fs_devices->lowest_devid) {
 			fs_devices->lowest_bdev = bdev;
 		}
 		device->bdev = bdev;
+
 	}
 	mutex_unlock(&uuid_mutex);
 	return 0;
@@ -393,6 +405,9 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_root *root = device->dev_root;
 	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf = NULL;
+	struct btrfs_dev_extent *extent = NULL;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -403,8 +418,25 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0) {
+		ret = btrfs_previous_item(root, path, key.objectid,
+					  BTRFS_DEV_EXTENT_KEY);
+		BUG_ON(ret);
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_dev_extent);
+		BUG_ON(found_key.offset > start || found_key.offset +
+		       btrfs_dev_extent_length(leaf, extent) < start);
+		ret = 0;
+	} else if (ret == 0) {
+		leaf = path->nodes[0];
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_dev_extent);
+	}
 	BUG_ON(ret);
 
+	device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
 
@@ -593,6 +625,170 @@ out:
 	return ret;
 }
 
+static int btrfs_rm_dev_item(struct btrfs_root *root,
+			     struct btrfs_device *device)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct block_device *bdev = device->bdev;
+	struct btrfs_device *next_dev;
+	struct btrfs_key key;
+	u64 total_bytes;
+	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_trans_handle *trans;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 1);
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = device->devid;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, root, path);
+	if (ret)
+		goto out;
+
+	/*
+	 * our caller already shrank the device to zero.  Unhook it from the
+	 * device lists; the caller clears the on-disk super magic afterwards
+	 */
+	list_del_init(&device->dev_list);
+	list_del_init(&device->dev_alloc_list);
+	fs_devices = root->fs_info->fs_devices;
+
+	next_dev = list_entry(fs_devices->devices.next, struct btrfs_device,
+			      dev_list);
+	if (bdev == fs_devices->lowest_bdev)
+		fs_devices->lowest_bdev = next_dev->bdev;
+	if (bdev == root->fs_info->sb->s_bdev)
+		root->fs_info->sb->s_bdev = next_dev->bdev;
+	if (bdev == fs_devices->latest_bdev)
+		fs_devices->latest_bdev = next_dev->bdev;
+
+	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+				    total_bytes - device->total_bytes);
+
+	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
+	btrfs_set_super_num_devices(&root->fs_info->super_copy,
+				    total_bytes - 1);
+out:
+	btrfs_free_path(path);
+	btrfs_commit_transaction(trans, root);
+	return ret;
+}
+
+int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+{
+	struct btrfs_device *device;
+	struct block_device *bdev;
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+	u64 all_avail;
+	u64 devid;
+	int ret = 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_lock(&uuid_mutex);
+
+	all_avail = root->fs_info->avail_data_alloc_bits |
+		root->fs_info->avail_system_alloc_bits |
+		root->fs_info->avail_metadata_alloc_bits;
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
+	    root->fs_info->fs_devices->num_devices <= 4) {
+		printk("btrfs: unable to go below four devices on raid10\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
+	    root->fs_info->fs_devices->num_devices <= 2) {
+		printk("btrfs: unable to go below two devices on raid1\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
+		goto out;
+	}
+
+	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	if (!bh) {
+		ret = -EIO;
+		goto error_close;
+	}
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+	    sizeof(disk_super->magic))) {
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+	if (memcmp(disk_super->fsid, root->fs_info->fsid, BTRFS_FSID_SIZE)) {
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+	devid = le64_to_cpu(disk_super->dev_item.devid);
+	device = btrfs_find_device(root, devid, NULL);
+	if (!device) {
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+	/* count the device out up front; error_undo puts it back */
+	root->fs_info->fs_devices->num_devices--;
+
+	ret = btrfs_shrink_device(device, 0);
+	if (ret)
+		goto error_undo;
+
+	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
+	if (ret)
+		goto error_undo;
+
+	/* make sure this device isn't detected as part of the FS anymore */
+	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+	set_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+
+	brelse(bh);
+
+	/* one close for the device struct or super_block */
+	close_bdev_excl(device->bdev);
+
+	/* one close for us */
+	close_bdev_excl(device->bdev);
+
+	kfree(device->name);
+	kfree(device);
+	goto out;
+
+error_undo:
+	root->fs_info->fs_devices->num_devices++;
+error_brelse:
+	brelse(bh);
+error_close:
+	close_bdev_excl(bdev);
+out:
+	mutex_unlock(&uuid_mutex);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_trans_handle *trans;
@@ -831,13 +1027,17 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
 	spin_unlock(&em_tree->lock);
 
-	BUG_ON(em->start > chunk_offset || em->start + em->len < chunk_offset);
+	BUG_ON(em->start > chunk_offset ||
+	       em->start + em->len < chunk_offset);
 	map = (struct map_lookup *)em->bdev;
 
 	for (i = 0; i < map->num_stripes; i++) {
 		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
 					    map->stripes[i].physical);
 		BUG_ON(ret);
+
+		ret = btrfs_update_device(trans, map->stripes[i].dev);
+		BUG_ON(ret);
 	}
 	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
 			       chunk_offset);
@@ -847,11 +1047,8 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
 		BUG_ON(ret);
-		goto out;
 	}
 
-
-
 	spin_lock(&em_tree->lock);
 	remove_extent_mapping(em_tree, em);
 	kfree(map);
@@ -861,7 +1058,6 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	free_extent_map(em);
 	spin_unlock(&em_tree->lock);
 
-out:
 	/* once for us */
 	free_extent_map(em);
 
@@ -1449,7 +1645,7 @@ again:
 		return 0;
 
 	if (!em) {
-		printk("unable to find logical %Lu\n", logical);
+		printk("unable to find logical %Lu len %Lu\n", logical, *length);
 		BUG();
 	}
 
@@ -1712,6 +1908,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 
 	logical = key->offset;
 	length = btrfs_chunk_length(leaf, chunk);
+
 	spin_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
 	spin_unlock(&map_tree->map_tree.lock);
@@ -1845,7 +2042,7 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
 int btrfs_read_sys_array(struct btrfs_root *root)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
-	struct extent_buffer *sb = root->fs_info->sb_buffer;
+	struct extent_buffer *sb;
 	struct btrfs_disk_key *disk_key;
 	struct btrfs_chunk *chunk;
 	u8 *ptr;
@@ -1857,6 +2054,12 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	u32 cur;
 	struct btrfs_key key;
 
+	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
+					  BTRFS_SUPER_INFO_SIZE);
+	if (!sb)
+		return -ENOMEM;
+	btrfs_set_buffer_uptodate(sb);
+	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
 	array_size = btrfs_super_sys_array_size(super_copy);
 
 	ptr = super_copy->sys_chunk_array;
@@ -1867,8 +2070,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 		disk_key = (struct btrfs_disk_key *)ptr;
 		btrfs_disk_key_to_cpu(&key, disk_key);
 
-		len = sizeof(*disk_key);
-		ptr += len;
+		len = sizeof(*disk_key); ptr += len;
 		sb_ptr += len;
 		cur += len;
 
@@ -1887,6 +2089,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 		sb_ptr += len;
 		cur += len;
 	}
+	free_extent_buffer(sb);
 	return ret;
 }
 
-- 
cgit v1.2.3


From c1c4d91c38e6e5b29543b02d4a4fb7f007e44f70 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 8 May 2008 15:05:58 -0400
Subject: Btrfs: Only open block devices once during mount -o subvol=

btrfs_open_devices needed a check to see if the device was already
open.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 55da5f0c56e..aa001b08ced 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -196,6 +196,9 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	mutex_lock(&uuid_mutex);
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (device->bdev)
+			continue;
+
 		bdev = open_bdev_excl(device->name, flags, holder);
 
 		if (IS_ERR(bdev)) {
-- 
cgit v1.2.3


From 323da79c9f096ed4da04e5ea00f766f75b28aeaa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 9 May 2008 11:46:48 -0400
Subject: Btrfs: Chunk relocation fine tuning, and add a few printks to show
 progress

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index aa001b08ced..b5d7bd1915b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1011,6 +1011,8 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	int ret;
 	int i;
 
+	printk("btrfs relocating chunk %llu\n",
+	       (unsigned long long)chunk_offset);
 	root = root->fs_info->chunk_root;
 	extent_root = root->fs_info->extent_root;
 	em_tree = &root->fs_info->mapping_tree.map_tree;
-- 
cgit v1.2.3


From 1259ab75c62462b8ffad90067b5e1f6312786a18 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 12 May 2008 13:39:03 -0400
Subject: Btrfs: Handle write errors on raid1 and raid10

When duplicate copies exist, writes are allowed to fail to one of those
copies.  This changeset includes a few changes that allow the FS to
continue even when some IOs fail.

It also adds verification of the parent generation number for btree blocks.
This generation is stored in the pointer to a block, and it ensures
that missed writes are detected.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b5d7bd1915b..5fc7fb48147 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1807,14 +1807,19 @@ static int end_bio_multi_stripe(struct bio *bio,
 	if (atomic_dec_and_test(&multi->stripes_pending)) {
 		bio->bi_private = multi->private;
 		bio->bi_end_io = multi->end_io;
-
 		/* only send an error to the higher layers if it is
 		 * beyond the tolerance of the multi-bio
 		 */
-		if (atomic_read(&multi->error) > multi->max_errors)
+		if (atomic_read(&multi->error) > multi->max_errors) {
 			err = -EIO;
-		else
+		} else if (err) {
+			/*
+			 * this bio is actually up to date, we didn't
+			 * go over the max number of errors
+			 */
+			set_bit(BIO_UPTODATE, &bio->bi_flags);
 			err = 0;
+		}
 		kfree(multi);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-- 
cgit v1.2.3


From dfe25020689bb2d318782d2c9c7141203583fc70 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 May 2008 13:46:40 -0400
Subject: Btrfs: Add mount -o degraded to allow mounts to continue with missing
 devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 279 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 201 insertions(+), 78 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5fc7fb48147..43f74d17bce 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -73,6 +73,7 @@ int btrfs_cleanup_fs_uuids(void)
 				close_bdev_excl(dev->bdev);
 			}
 			list_del(&dev->dev_list);
+			kfree(dev->name);
 			kfree(dev);
 		}
 	}
@@ -127,7 +128,6 @@ static int device_list_add(const char *path,
 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
-		fs_devices->lowest_devid = (u64)-1;
 		fs_devices->num_devices = 0;
 		device = NULL;
 	} else {
@@ -159,13 +159,35 @@ static int device_list_add(const char *path,
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
 	}
-	if (fs_devices->lowest_devid > devid) {
-		fs_devices->lowest_devid = devid;
-	}
 	*fs_devices_ret = fs_devices;
 	return 0;
 }
 
+int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct list_head *head = &fs_devices->devices;
+	struct list_head *cur;
+	struct btrfs_device *device;
+
+	mutex_lock(&uuid_mutex);
+again:
+	list_for_each(cur, head) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (!device->in_fs_metadata) {
+printk("getting rid of extra dev %s\n", device->name);
+			if (device->bdev)
+				close_bdev_excl(device->bdev);
+			list_del(&device->dev_list);
+			list_del(&device->dev_alloc_list);
+			fs_devices->num_devices--;
+			kfree(device->name);
+			kfree(device);
+			goto again;
+		}
+	}
+	mutex_unlock(&uuid_mutex);
+	return 0;
+}
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct list_head *head = &fs_devices->devices;
@@ -179,6 +201,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 			close_bdev_excl(device->bdev);
 		}
 		device->bdev = NULL;
+		device->in_fs_metadata = 0;
 	}
 	mutex_unlock(&uuid_mutex);
 	return 0;
@@ -199,6 +222,9 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		if (device->bdev)
 			continue;
 
+		if (!device->name)
+			continue;
+
 		bdev = open_bdev_excl(device->name, flags, holder);
 
 		if (IS_ERR(bdev)) {
@@ -209,10 +235,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		set_blocksize(bdev, 4096);
 		if (device->devid == fs_devices->latest_devid)
 			fs_devices->latest_bdev = bdev;
-		if (device->devid == fs_devices->lowest_devid) {
-			fs_devices->lowest_bdev = bdev;
-		}
 		device->bdev = bdev;
+		device->in_fs_metadata = 0;
 
 	}
 	mutex_unlock(&uuid_mutex);
@@ -439,7 +463,8 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	}
 	BUG_ON(ret);
 
-	device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+	if (device->bytes_used > 0)
+		device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
 
@@ -460,6 +485,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 
+	WARN_ON(!device->in_fs_metadata);
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -674,8 +700,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 
 	next_dev = list_entry(fs_devices->devices.next, struct btrfs_device,
 			      dev_list);
-	if (bdev == fs_devices->lowest_bdev)
-		fs_devices->lowest_bdev = next_dev->bdev;
 	if (bdev == root->fs_info->sb->s_bdev)
 		root->fs_info->sb->s_bdev = next_dev->bdev;
 	if (bdev == fs_devices->latest_bdev)
@@ -698,7 +722,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_device *device;
 	struct block_device *bdev;
-	struct buffer_head *bh;
+	struct buffer_head *bh = NULL;
 	struct btrfs_super_block *disk_super;
 	u64 all_avail;
 	u64 devid;
@@ -712,47 +736,73 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->avail_metadata_alloc_bits;
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-	    root->fs_info->fs_devices->num_devices <= 4) {
+	    btrfs_super_num_devices(&root->fs_info->super_copy) <= 4) {
 		printk("btrfs: unable to go below four devices on raid10\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-	    root->fs_info->fs_devices->num_devices <= 2) {
+	    btrfs_super_num_devices(&root->fs_info->super_copy) <= 2) {
 		printk("btrfs: unable to go below two devices on raid1\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
-	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
-	if (IS_ERR(bdev)) {
-		ret = PTR_ERR(bdev);
-		goto out;
-	}
+	if (strcmp(device_path, "missing") == 0) {
+		struct list_head *cur;
+		struct list_head *devices;
+		struct btrfs_device *tmp;
 
-	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
-	if (!bh) {
-		ret = -EIO;
-		goto error_close;
-	}
-	disk_super = (struct btrfs_super_block *)bh->b_data;
-	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-	    sizeof(disk_super->magic))) {
-		ret = -ENOENT;
-		goto error_brelse;
-	}
-	if (memcmp(disk_super->fsid, root->fs_info->fsid, BTRFS_FSID_SIZE)) {
-		ret = -ENOENT;
-		goto error_brelse;
-	}
-	devid = le64_to_cpu(disk_super->dev_item.devid);
-	device = btrfs_find_device(root, devid, NULL);
-	if (!device) {
-		ret = -ENOENT;
-		goto error_brelse;
-	}
+		device = NULL;
+		devices = &root->fs_info->fs_devices->devices;
+		list_for_each(cur, devices) {
+			tmp = list_entry(cur, struct btrfs_device, dev_list);
+			if (tmp->in_fs_metadata && !tmp->bdev) {
+				device = tmp;
+				break;
+			}
+		}
+		bdev = NULL;
+		bh = NULL;
+		disk_super = NULL;
+		if (!device) {
+			printk("btrfs: no missing devices found to remove\n");
+			goto out;
+		}
+
+	} else {
+		bdev = open_bdev_excl(device_path, 0,
+				      root->fs_info->bdev_holder);
+		if (IS_ERR(bdev)) {
+			ret = PTR_ERR(bdev);
+			goto out;
+		}
+
+		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		if (!bh) {
+			ret = -EIO;
+			goto error_close;
+		}
+		disk_super = (struct btrfs_super_block *)bh->b_data;
+		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+		    sizeof(disk_super->magic))) {
+			ret = -ENOENT;
+			goto error_brelse;
+		}
+		if (memcmp(disk_super->fsid, root->fs_info->fsid,
+			   BTRFS_FSID_SIZE)) {
+			ret = -ENOENT;
+			goto error_brelse;
+		}
+		devid = le64_to_cpu(disk_super->dev_item.devid);
+		device = btrfs_find_device(root, devid, NULL);
+		if (!device) {
+			ret = -ENOENT;
+			goto error_brelse;
+		}
 
+	}
 	root->fs_info->fs_devices->num_devices--;
 
 	ret = btrfs_shrink_device(device, 0);
@@ -764,19 +814,25 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (ret)
 		goto error_brelse;
 
-	/* make sure this device isn't detected as part of the FS anymore */
-	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
-	set_buffer_dirty(bh);
-	sync_dirty_buffer(bh);
-
-	brelse(bh);
-
-	/* one close for the device struct or super_block */
-	close_bdev_excl(device->bdev);
+	if (bh) {
+		/* make sure this device isn't detected as part of
+		 * the FS anymore
+		 */
+		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+		set_buffer_dirty(bh);
+		sync_dirty_buffer(bh);
 
-	/* one close for us */
-	close_bdev_excl(device->bdev);
+		brelse(bh);
+	}
 
+	if (device->bdev) {
+		/* one close for the device struct or super_block */
+		close_bdev_excl(device->bdev);
+	}
+	if (bdev) {
+		/* one close for us */
+		close_bdev_excl(bdev);
+	}
 	kfree(device->name);
 	kfree(device);
 	ret = 0;
@@ -785,7 +841,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 error_brelse:
 	brelse(bh);
 error_close:
-	close_bdev_excl(bdev);
+	if (bdev)
+		close_bdev_excl(bdev);
 out:
 	mutex_unlock(&uuid_mutex);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -839,6 +896,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->total_bytes = i_size_read(bdev->bd_inode);
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
+	device->in_fs_metadata = 1;
 
 	ret = btrfs_add_device(trans, root, device);
 	if (ret)
@@ -1041,8 +1099,10 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 					    map->stripes[i].physical);
 		BUG_ON(ret);
 
-		ret = btrfs_update_device(trans, map->stripes[i].dev);
-		BUG_ON(ret);
+		if (map->stripes[i].dev) {
+			ret = btrfs_update_device(trans, map->stripes[i].dev);
+			BUG_ON(ret);
+		}
 	}
 	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
 			       chunk_offset);
@@ -1415,10 +1475,13 @@ again:
 	while(index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 
-		avail = device->total_bytes - device->bytes_used;
+		if (device->total_bytes > device->bytes_used)
+			avail = device->total_bytes - device->bytes_used;
+		else
+			avail = 0;
 		cur = cur->next;
 
-		if (avail >= min_free) {
+		if (device->in_fs_metadata && avail >= min_free) {
 			u64 ignored_start = 0;
 			ret = find_free_dev_extent(trans, device, path,
 						   min_free,
@@ -1430,7 +1493,7 @@ again:
 				if (type & BTRFS_BLOCK_GROUP_DUP)
 					index++;
 			}
-		} else if (avail > max_avail)
+		} else if (device->in_fs_metadata && avail > max_avail)
 			max_avail = avail;
 		if (cur == dev_list)
 			break;
@@ -1610,6 +1673,22 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	return ret;
 }
 
+static int find_live_mirror(struct map_lookup *map, int first, int num,
+			    int optimal)
+{
+	int i;
+	if (map->stripes[optimal].dev->bdev)
+		return optimal;
+	for (i = first; i < first + num; i++) {
+		if (map->stripes[i].dev->bdev)
+			return i;
+	}
+	/* we couldn't find one that doesn't fail.  Just return something
+	 * and the io error handling code will clean up eventually
+	 */
+	return optimal;
+}
+
 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_multi_bio **multi_ret,
@@ -1712,8 +1791,11 @@ again:
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
-		else
-			stripe_index = current->pid % map->num_stripes;
+		else {
+			stripe_index = find_live_mirror(map, 0,
+					    map->num_stripes,
+					    current->pid % map->num_stripes);
+		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
@@ -1731,8 +1813,11 @@ again:
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
-		else
-			stripe_index += current->pid % map->sub_stripes;
+		else {
+			stripe_index = find_live_mirror(map, stripe_index,
+					      map->sub_stripes, stripe_index +
+					      current->pid % map->sub_stripes);
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -1749,9 +1834,11 @@ again:
 			struct backing_dev_info *bdi;
 
 			device = map->stripes[stripe_index].dev;
-			bdi = blk_get_backing_dev_info(device->bdev);
-			if (bdi->unplug_io_fn) {
-				bdi->unplug_io_fn(bdi, unplug_page);
+			if (device->bdev) {
+				bdi = blk_get_backing_dev_info(device->bdev);
+				if (bdi->unplug_io_fn) {
+					bdi->unplug_io_fn(bdi, unplug_page);
+				}
 			}
 		} else {
 			multi->stripes[i].physical =
@@ -1880,12 +1967,21 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
 		dev = multi->stripes[dev_nr].dev;
-
-		bio->bi_bdev = dev->bdev;
-		spin_lock(&dev->io_lock);
-		dev->total_ios++;
-		spin_unlock(&dev->io_lock);
-		submit_bio(rw, bio);
+		if (dev && dev->bdev) {
+			bio->bi_bdev = dev->bdev;
+			spin_lock(&dev->io_lock);
+			dev->total_ios++;
+			spin_unlock(&dev->io_lock);
+			submit_bio(rw, bio);
+		} else {
+			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
+			bio->bi_sector = logical >> 9;
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+			bio_endio(bio, bio->bi_size, -EIO);
+#else
+			bio_endio(bio, -EIO);
+#endif
+		}
 		dev_nr++;
 	}
 	if (total_devs == 1)
@@ -1901,6 +1997,27 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
 	return __find_device(head, devid, uuid);
 }
 
+static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
+					    u64 devid, u8 *dev_uuid)
+{
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+
+	device = kzalloc(sizeof(*device), GFP_NOFS);
+	list_add(&device->dev_list,
+		 &fs_devices->devices);
+	list_add(&device->dev_alloc_list,
+		 &fs_devices->alloc_list);
+	device->barriers = 1;
+	device->dev_root = root->fs_info->dev_root;
+	device->devid = devid;
+	fs_devices->num_devices++;
+	spin_lock_init(&device->io_lock);
+	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
+	return device;
+}
+
+
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 			  struct extent_buffer *leaf,
 			  struct btrfs_chunk *chunk)
@@ -1965,11 +2082,22 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 				   btrfs_stripe_dev_uuid_nr(chunk, i),
 				   BTRFS_UUID_SIZE);
 		map->stripes[i].dev = btrfs_find_device(root, devid, uuid);
-		if (!map->stripes[i].dev) {
+
+		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
 			kfree(map);
 			free_extent_map(em);
 			return -EIO;
 		}
+		if (!map->stripes[i].dev) {
+			map->stripes[i].dev =
+				add_missing_dev(root, devid, uuid);
+			if (!map->stripes[i].dev) {
+				kfree(map);
+				free_extent_map(em);
+				return -EIO;
+			}
+		}
+		map->stripes[i].dev->in_fs_metadata = 1;
 	}
 
 	spin_lock(&map_tree->map_tree.lock);
@@ -2016,20 +2144,15 @@ static int read_one_dev(struct btrfs_root *root,
 			   BTRFS_UUID_SIZE);
 	device = btrfs_find_device(root, devid, dev_uuid);
 	if (!device) {
-		printk("warning devid %Lu not found already\n", devid);
-		device = kzalloc(sizeof(*device), GFP_NOFS);
+		printk("warning devid %Lu missing\n", devid);
+		device = add_missing_dev(root, devid, dev_uuid);
 		if (!device)
 			return -ENOMEM;
-		list_add(&device->dev_list,
-			 &root->fs_info->fs_devices->devices);
-		list_add(&device->dev_alloc_list,
-			 &root->fs_info->fs_devices->alloc_list);
-		device->barriers = 1;
-		spin_lock_init(&device->io_lock);
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
+	device->in_fs_metadata = 1;
 	ret = 0;
 #if 0
 	ret = btrfs_open_device(device);
-- 
cgit v1.2.3


From a0af469b58944f6e8c5c8ecbebb42997baf0cb9e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 May 2008 16:03:06 -0400
Subject: Fix btrfs_open_devices to deal with changes since the scan ioctls

Devices can change after the scan ioctls are done, and btrfs_open_devices
needs to be able to verify them as they are opened and used by the FS.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 70 +++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 59 insertions(+), 11 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 43f74d17bce..501d23d3ebf 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -71,6 +71,7 @@ int btrfs_cleanup_fs_uuids(void)
 					 dev_list);
 			if (dev->bdev) {
 				close_bdev_excl(dev->bdev);
+				fs_devices->open_devices--;
 			}
 			list_del(&dev->dev_list);
 			kfree(dev->name);
@@ -174,9 +175,10 @@ again:
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (!device->in_fs_metadata) {
-printk("getting rid of extra dev %s\n", device->name);
-			if (device->bdev)
+			if (device->bdev) {
 				close_bdev_excl(device->bdev);
+				fs_devices->open_devices--;
+			}
 			list_del(&device->dev_list);
 			list_del(&device->dev_alloc_list);
 			fs_devices->num_devices--;
@@ -188,6 +190,7 @@ printk("getting rid of extra dev %s\n", device->name);
 	mutex_unlock(&uuid_mutex);
 	return 0;
 }
+
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct list_head *head = &fs_devices->devices;
@@ -199,10 +202,12 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev) {
 			close_bdev_excl(device->bdev);
+			fs_devices->open_devices--;
 		}
 		device->bdev = NULL;
 		device->in_fs_metadata = 0;
 	}
+	fs_devices->mounted = 0;
 	mutex_unlock(&uuid_mutex);
 	return 0;
 }
@@ -214,9 +219,19 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	struct list_head *head = &fs_devices->devices;
 	struct list_head *cur;
 	struct btrfs_device *device;
-	int ret;
+	struct block_device *latest_bdev = NULL;
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+	u64 latest_devid = 0;
+	u64 latest_transid = 0;
+	u64 transid;
+	u64 devid;
+	int ret = 0;
 
 	mutex_lock(&uuid_mutex);
+	if (fs_devices->mounted)
+		goto out;
+
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev)
@@ -229,21 +244,52 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 		if (IS_ERR(bdev)) {
 			printk("open %s failed\n", device->name);
-			ret = PTR_ERR(bdev);
-			goto fail;
+			goto error;
 		}
 		set_blocksize(bdev, 4096);
-		if (device->devid == fs_devices->latest_devid)
-			fs_devices->latest_bdev = bdev;
+
+		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		if (!bh)
+			goto error_close;
+
+		disk_super = (struct btrfs_super_block *)bh->b_data;
+		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+		    sizeof(disk_super->magic)))
+			goto error_brelse;
+
+		devid = le64_to_cpu(disk_super->dev_item.devid);
+		if (devid != device->devid)
+			goto error_brelse;
+
+		transid = btrfs_super_generation(disk_super);
+		if (transid > latest_transid) {
+			latest_devid = devid;
+			latest_transid = transid;
+			latest_bdev = bdev;
+		}
+
 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
+		fs_devices->open_devices++;
+		continue;
 
+error_brelse:
+		brelse(bh);
+error_close:
+		close_bdev_excl(bdev);
+error:
+		continue;
 	}
+	if (fs_devices->open_devices == 0) {
+		ret = -EIO;
+		goto out;
+	}
+	fs_devices->mounted = 1;
+	fs_devices->latest_bdev = latest_bdev;
+	fs_devices->latest_devid = latest_devid;
+	fs_devices->latest_trans = latest_transid;
+out:
 	mutex_unlock(&uuid_mutex);
-	return 0;
-fail:
-	mutex_unlock(&uuid_mutex);
-	btrfs_close_devices(fs_devices);
 	return ret;
 }
 
@@ -828,6 +874,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
 		close_bdev_excl(device->bdev);
+		root->fs_info->fs_devices->open_devices--;
 	}
 	if (bdev) {
 		/* one close for us */
@@ -914,6 +961,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	list_add(&device->dev_alloc_list,
 		 &root->fs_info->fs_devices->alloc_list);
 	root->fs_info->fs_devices->num_devices++;
+	root->fs_info->fs_devices->open_devices++;
 out:
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
-- 
cgit v1.2.3


From 6af5ac3c1aede574acde5fc477f1a1c876fb568e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 May 2008 13:14:57 -0400
Subject: Btrfs: Handle transid == 0 while opening devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 501d23d3ebf..3fbf0834e88 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -262,7 +262,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			goto error_brelse;
 
 		transid = btrfs_super_generation(disk_super);
-		if (transid > latest_transid) {
+		if (!latest_transid || transid > latest_transid) {
 			latest_devid = devid;
 			latest_transid = transid;
 			latest_bdev = bdev;
-- 
cgit v1.2.3


From 515dc3228c568f9df8b634f82b3471e20205ac16 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 May 2008 13:30:15 -0400
Subject: Btrfs: Use kzalloc on the fs_devices allocation

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3fbf0834e88..333e5269577 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -120,7 +120,7 @@ static int device_list_add(const char *path,
 
 	fs_devices = find_fsid(disk_super->fsid);
 	if (!fs_devices) {
-		fs_devices = kmalloc(sizeof(*fs_devices), GFP_NOFS);
+		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
 		if (!fs_devices)
 			return -ENOMEM;
 		INIT_LIST_HEAD(&fs_devices->devices);
@@ -129,7 +129,6 @@ static int device_list_add(const char *path,
 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
-		fs_devices->num_devices = 0;
 		device = NULL;
 	} else {
 		device = __find_device(&fs_devices->devices, devid,
-- 
cgit v1.2.3


From 0ef3e66b6700eb8f052daa8b89443ff872fbbdfc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 24 May 2008 14:04:53 -0400
Subject: Btrfs: Allocator fix variety pack

* Force chunk allocation when find_free_extent has to do a full scan
* Record the max key at the start of defrag so it doesn't run forever
* Block groups might not be contiguous, make a forward search for the
  next block group in extent-tree.c
* Get rid of extra checks for total fs size
* Fix relocate_one_reference to avoid relocating the same file data block
  twice when referenced by an older transaction
* Use the open device count when allocating chunks so that we don't
  try to allocate from devices that don't exist

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 333e5269577..722eb455015 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -750,10 +750,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	if (bdev == fs_devices->latest_bdev)
 		fs_devices->latest_bdev = next_dev->bdev;
 
-	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
-				    total_bytes - device->total_bytes);
-
 	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
 	btrfs_set_super_num_devices(&root->fs_info->super_copy,
 				    total_bytes - 1);
@@ -849,6 +845,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	}
 	root->fs_info->fs_devices->num_devices--;
+	root->fs_info->fs_devices->open_devices--;
 
 	ret = btrfs_shrink_device(device, 0);
 	if (ret)
@@ -873,7 +870,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
 		close_bdev_excl(device->bdev);
-		root->fs_info->fs_devices->open_devices--;
 	}
 	if (bdev) {
 		/* one close for us */
@@ -1450,7 +1446,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		return -ENOSPC;
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		num_stripes = extent_root->fs_info->fs_devices->open_devices;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
@@ -1459,13 +1455,13 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
-				  btrfs_super_num_devices(&info->super_copy));
+			    extent_root->fs_info->fs_devices->open_devices);
 		if (num_stripes < 2)
 			return -ENOSPC;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		num_stripes = extent_root->fs_info->fs_devices->open_devices;
 		if (num_stripes < 4)
 			return -ENOSPC;
 		num_stripes &= ~(u32)1;
-- 
cgit v1.2.3


From 8b7128429235d9bd72cfd5ed20c77c4f3118f744 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Jun 2008 16:50:36 -0400
Subject: Btrfs: Add async worker threads for pre and post IO checksumming

Btrfs has been using workqueues to spread the checksumming load across
other CPUs in the system.  But, workqueues only schedule work on the
same CPU that queued the work, giving them a limited benefit for systems with
higher CPU counts.

This code adds a generic facility to schedule work with pools of kthreads,
and changes the bio submission code to queue bios up.  The queueing is
important to make sure large numbers of procs on the system don't
turn streaming workloads into random workloads by sending IO down
concurrently.

The end result of all of this is much higher performance (and CPU usage) when
doing checksumming on large machines.  Two worker pools are created,
one for writes and one for endio processing.  The two could deadlock if
we tried to service both from a single pool.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 157 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 722eb455015..c57458ce633 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -27,6 +27,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "async-thread.h"
 
 struct map_lookup {
 	u64 type;
@@ -110,6 +111,101 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid)
 	return NULL;
 }
 
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device.  This greatly
+ * improves the schedulers ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block.  The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested.  This way, multiple devices
+ * can make progress from a single worker thread.
+ */
+int run_scheduled_bios(struct btrfs_device *device)
+{
+	struct bio *pending;
+	struct backing_dev_info *bdi;
+	struct bio *tail;
+	struct bio *cur;
+	int again = 0;
+	unsigned long num_run = 0;
+
+	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+loop:
+	spin_lock(&device->io_lock);
+
+	/* take all the bios off the list at once and process them
+	 * later on (without the lock held).  But, remember the
+	 * tail and other pointers so the bios can be properly reinserted
+	 * into the list if we hit congestion
+	 */
+	pending = device->pending_bios;
+	tail = device->pending_bio_tail;
+	WARN_ON(pending && !tail);
+	device->pending_bios = NULL;
+	device->pending_bio_tail = NULL;
+
+	/*
+	 * if pending was null this time around, no bios need processing
+	 * at all and we can stop.  Otherwise it'll loop back up again
+	 * and do an additional check so no bios are missed.
+	 *
+	 * device->running_pending is used to synchronize with the
+	 * schedule_bio code.
+	 */
+	if (pending) {
+		again = 1;
+		device->running_pending = 1;
+	} else {
+		again = 0;
+		device->running_pending = 0;
+	}
+	spin_unlock(&device->io_lock);
+
+	while(pending) {
+		cur = pending;
+		pending = pending->bi_next;
+		cur->bi_next = NULL;
+		atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+		submit_bio(cur->bi_rw, cur);
+		num_run++;
+
+		/*
+		 * we made progress, there is more work to do and the bdi
+		 * is now congested.  Back off and let other work structs
+		 * run instead
+		 */
+		if (pending && num_run && bdi_write_congested(bdi)) {
+			struct bio *old_head;
+
+			spin_lock(&device->io_lock);
+			old_head = device->pending_bios;
+			device->pending_bios = pending;
+			if (device->pending_bio_tail)
+				tail->bi_next = old_head;
+			else
+				device->pending_bio_tail = tail;
+
+			spin_unlock(&device->io_lock);
+			btrfs_requeue_work(&device->work);
+			goto done;
+		}
+	}
+	if (again)
+		goto loop;
+done:
+	return 0;
+}
+
+void pending_bios_fn(struct btrfs_work *work)
+{
+	struct btrfs_device *device;
+
+	device = container_of(work, struct btrfs_device, work);
+	run_scheduled_bios(device);
+}
+
 static int device_list_add(const char *path,
 			   struct btrfs_super_block *disk_super,
 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
@@ -141,6 +237,7 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		device->devid = devid;
+		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, disk_super->dev_item.uuid,
 		       BTRFS_UUID_SIZE);
 		device->barriers = 1;
@@ -925,6 +1022,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 
 	device->barriers = 1;
+	device->work.func = pending_bios_fn;
 	generate_random_uuid(device->uuid);
 	spin_lock_init(&device->io_lock);
 	device->name = kstrdup(device_path, GFP_NOFS);
@@ -1965,8 +2063,61 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 }
 
+struct async_sched {
+	struct bio *bio;
+	int rw;
+	struct btrfs_fs_info *info;
+	struct btrfs_work work;
+};
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one bio to the pending list for a device and make sure
+ * the work struct is scheduled.
+ */
+int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
+		 int rw, struct bio *bio)
+{
+	int should_queue = 1;
+
+	/* don't bother with additional async steps for reads, right now */
+	if (!(rw & (1 << BIO_RW))) {
+		submit_bio(rw, bio);
+		return 0;
+	}
+
+	/*
+	 * nr_async_sumbits allows us to reliably return congestion to the
+	 * higher layers.  Otherwise, the async bio makes it appear we have
+	 * made progress against dirty pages when we've really just put it
+	 * on a queue for later
+	 */
+	atomic_inc(&root->fs_info->nr_async_submits);
+	bio->bi_next = NULL;
+	bio->bi_rw |= rw;
+
+	spin_lock(&device->io_lock);
+
+	if (device->pending_bio_tail)
+		device->pending_bio_tail->bi_next = bio;
+
+	device->pending_bio_tail = bio;
+	if (!device->pending_bios)
+		device->pending_bios = bio;
+	if (device->running_pending)
+		should_queue = 0;
+
+	spin_unlock(&device->io_lock);
+
+	if (should_queue)
+		btrfs_queue_worker(&root->fs_info->workers, &device->work);
+	return 0;
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
-		  int mirror_num)
+		  int mirror_num, int async_submit)
 {
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
@@ -2012,10 +2163,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		dev = multi->stripes[dev_nr].dev;
 		if (dev && dev->bdev) {
 			bio->bi_bdev = dev->bdev;
-			spin_lock(&dev->io_lock);
-			dev->total_ios++;
-			spin_unlock(&dev->io_lock);
-			submit_bio(rw, bio);
+			if (async_submit)
+				schedule_bio(root, dev, rw, bio);
+			else
+				submit_bio(rw, bio);
 		} else {
 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
 			bio->bi_sector = logical >> 9;
@@ -2054,6 +2205,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->barriers = 1;
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
+	device->work.func = pending_bios_fn;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
-- 
cgit v1.2.3


From 1cc127b5d1b71453091859301de4a7dd6ee96fa8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Jun 2008 14:46:17 -0400
Subject: Btrfs: Add a thread pool just for submit_bio

If a bio submission is after a lock holder waiting for the bio
on the work queue, it is possible to deadlock.  Move the bios
into their own pool.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c57458ce633..ba396857102 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2112,7 +2112,8 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	spin_unlock(&device->io_lock);
 
 	if (should_queue)
-		btrfs_queue_worker(&root->fs_info->workers, &device->work);
+		btrfs_queue_worker(&root->fs_info->submit_workers,
+				   &device->work);
 	return 0;
 }
 
-- 
cgit v1.2.3


From a213501153fd66e2359e091b1612841305ba6551 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:30 -0400
Subject: Btrfs: Replace the big fs_mutex with a collection of other locks

Extent allocations are still protected by a large alloc_mutex.
Objectid allocations are covered by an objectid mutex
Other btree operations are protected by a lock on individual btree nodes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ba396857102..869864ddcc2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -866,7 +866,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	u64 devid;
 	int ret = 0;
 
-	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
 	mutex_lock(&uuid_mutex);
 
 	all_avail = root->fs_info->avail_data_alloc_bits |
@@ -984,7 +985,8 @@ error_close:
 		close_bdev_excl(bdev);
 out:
 	mutex_unlock(&uuid_mutex);
-	mutex_unlock(&root->fs_info->fs_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -1003,7 +1005,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (!bdev) {
 		return -EIO;
 	}
-	mutex_lock(&root->fs_info->fs_mutex);
+
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
+
 	trans = btrfs_start_transaction(root, 1);
 	devices = &root->fs_info->fs_devices->devices;
 	list_for_each(cur, devices) {
@@ -1057,7 +1062,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	root->fs_info->fs_devices->open_devices++;
 out:
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->alloc_mutex);
+
 	return ret;
 
 out_close_bdev:
@@ -1297,9 +1304,10 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	struct btrfs_key found_key;
 
 
+	BUG(); /* FIXME, needs locking */
+
 	dev_root = dev_root->fs_info->dev_root;
 
-	mutex_lock(&dev_root->fs_info->fs_mutex);
 	/* step one make some room on all the devices */
 	list_for_each(cur, devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
@@ -1368,7 +1376,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	ret = 0;
 error:
 	btrfs_free_path(path);
-	mutex_unlock(&dev_root->fs_info->fs_mutex);
 	return ret;
 }
 
-- 
cgit v1.2.3


From a74a4b97b61beede185b4b3ad359d7d378b0d312 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:31 -0400
Subject: Btrfs: Replace the transaction work queue with kthreads

This creates one kthread for commits and one kthread for
deleting old snapshots.  All the work queues are removed.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 869864ddcc2..4e7cee27aab 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -271,13 +271,17 @@ again:
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (!device->in_fs_metadata) {
-			if (device->bdev) {
-				close_bdev_excl(device->bdev);
-				fs_devices->open_devices--;
-			}
+			struct block_device *bdev;
 			list_del(&device->dev_list);
 			list_del(&device->dev_alloc_list);
 			fs_devices->num_devices--;
+			if (device->bdev) {
+				bdev = device->bdev;
+				fs_devices->open_devices--;
+				mutex_unlock(&uuid_mutex);
+				close_bdev_excl(bdev);
+				mutex_lock(&uuid_mutex);
+			}
 			kfree(device->name);
 			kfree(device);
 			goto again;
-- 
cgit v1.2.3


From 7d9eb12c8739e7dc80c78c6b3596f912ecd8f941 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Jul 2008 14:19:17 -0400
Subject: Btrfs: Add locking around volume management (device
 add/remove/balance)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 58 +++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 44 insertions(+), 14 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4e7cee27aab..5e6ee7a6f73 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -56,6 +56,18 @@ void btrfs_unlock_volumes(void)
 	mutex_unlock(&uuid_mutex);
 }
 
+static void lock_chunks(struct btrfs_root *root)
+{
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static void unlock_chunks(struct btrfs_root *root)
+{
+	mutex_unlock(&root->fs_info->alloc_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
+}
+
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
@@ -822,6 +834,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
 	key.offset = device->devid;
+	lock_chunks(root);
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
@@ -856,6 +869,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 				    total_bytes - 1);
 out:
 	btrfs_free_path(path);
+	unlock_chunks(root);
 	btrfs_commit_transaction(trans, root);
 	return ret;
 }
@@ -870,9 +884,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	u64 devid;
 	int ret = 0;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-	mutex_lock(&root->fs_info->chunk_mutex);
 	mutex_lock(&uuid_mutex);
+	mutex_lock(&root->fs_info->volume_mutex);
 
 	all_avail = root->fs_info->avail_data_alloc_bits |
 		root->fs_info->avail_system_alloc_bits |
@@ -988,9 +1001,8 @@ error_close:
 	if (bdev)
 		close_bdev_excl(bdev);
 out:
+	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
-	mutex_unlock(&root->fs_info->chunk_mutex);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -1010,10 +1022,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		return -EIO;
 	}
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-	mutex_lock(&root->fs_info->chunk_mutex);
+	mutex_lock(&root->fs_info->volume_mutex);
 
 	trans = btrfs_start_transaction(root, 1);
+	lock_chunks(root);
 	devices = &root->fs_info->fs_devices->devices;
 	list_for_each(cur, devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
@@ -1065,9 +1077,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	root->fs_info->fs_devices->num_devices++;
 	root->fs_info->fs_devices->open_devices++;
 out:
+	unlock_chunks(root);
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->chunk_mutex);
-	mutex_unlock(&root->fs_info->alloc_mutex);
+	mutex_unlock(&root->fs_info->volume_mutex);
 
 	return ret;
 
@@ -1122,7 +1134,7 @@ out:
 	return ret;
 }
 
-int btrfs_grow_device(struct btrfs_trans_handle *trans,
+static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 		      struct btrfs_device *device, u64 new_size)
 {
 	struct btrfs_super_block *super_copy =
@@ -1134,6 +1146,16 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 	return btrfs_update_device(trans, device);
 }
 
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+		      struct btrfs_device *device, u64 new_size)
+{
+	int ret;
+	lock_chunks(device->dev_root);
+	ret = __btrfs_grow_device(trans, device, new_size);
+	unlock_chunks(device->dev_root);
+	return ret;
+}
+
 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root,
 			    u64 chunk_tree, u64 chunk_objectid,
@@ -1234,6 +1256,8 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
+	lock_chunks(root);
+
 	/*
 	 * step two, delete the device extents and the
 	 * chunk tree entries
@@ -1278,6 +1302,7 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	/* once for us */
 	free_extent_map(em);
 
+	unlock_chunks(root);
 	btrfs_end_transaction(trans, root);
 	return 0;
 }
@@ -1308,8 +1333,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	struct btrfs_key found_key;
 
 
-	BUG(); /* FIXME, needs locking */
-
+	mutex_lock(&dev_root->fs_info->volume_mutex);
 	dev_root = dev_root->fs_info->dev_root;
 
 	/* step one make some room on all the devices */
@@ -1355,13 +1379,14 @@ int btrfs_balance(struct btrfs_root *dev_root)
 
 		ret = btrfs_previous_item(chunk_root, path, 0,
 					  BTRFS_CHUNK_ITEM_KEY);
-		if (ret) {
+		if (ret)
 			break;
-		}
+
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 				      path->slots[0]);
 		if (found_key.objectid != key.objectid)
 			break;
+
 		chunk = btrfs_item_ptr(path->nodes[0],
 				       path->slots[0],
 				       struct btrfs_chunk);
@@ -1370,16 +1395,17 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		if (key.offset == 0)
 			break;
 
+		btrfs_release_path(chunk_root, path);
 		ret = btrfs_relocate_chunk(chunk_root,
 					   chunk_root->root_key.objectid,
 					   found_key.objectid,
 					   found_key.offset);
 		BUG_ON(ret);
-		btrfs_release_path(chunk_root, path);
 	}
 	ret = 0;
 error:
 	btrfs_free_path(path);
+	mutex_unlock(&dev_root->fs_info->volume_mutex);
 	return ret;
 }
 
@@ -1419,14 +1445,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 
 	path->reada = 2;
 
+	lock_chunks(root);
+
 	device->total_bytes = new_size;
 	ret = btrfs_update_device(trans, device);
 	if (ret) {
+		unlock_chunks(root);
 		btrfs_end_transaction(trans, root);
 		goto done;
 	}
 	WARN_ON(diff > old_total);
 	btrfs_set_super_total_bytes(super_copy, old_total - diff);
+	unlock_chunks(root);
 	btrfs_end_transaction(trans, root);
 
 	key.objectid = device->devid;
-- 
cgit v1.2.3


From bcc63abbf3e9bf948a1b0129b3e6120ec7d7f698 Mon Sep 17 00:00:00 2001
From: Yan <zheng.yan@oracle.com>
Date: Wed, 30 Jul 2008 16:29:20 -0400
Subject: Btrfs: implement memory reclaim for leaf reference cache

The memory reclaiming issue happens when snapshot exists. In that
case, some cache entries may not be used during old snapshot dropping,
so they will remain in the cache until umount.

The patch adds a field to struct btrfs_leaf_ref to record create time. Besides,
the patch makes all dead roots of a given snapshot linked together in order of
create time. After an old snapshot was completely dropped, we check the dead
root list and remove all cache entries created before the oldest dead root in
the list.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5e6ee7a6f73..18db4cbe279 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2527,4 +2527,3 @@ again:
 error:
 	return ret;
 }
-
-- 
cgit v1.2.3


From 492bb6deee3416ad792dcd8584ebd95c463af1a6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 31 Jul 2008 16:29:02 -0400
Subject: Btrfs: Hold a reference on bios during submit_bio, add some extra bio
 checks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 18db4cbe279..0de51e37e78 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -180,7 +180,11 @@ loop:
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
 		atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+
+		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+		bio_get(cur);
 		submit_bio(cur->bi_rw, cur);
+		bio_put(cur);
 		num_run++;
 
 		/*
@@ -188,10 +192,11 @@ loop:
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && num_run && bdi_write_congested(bdi)) {
+		if (pending && bdi_write_congested(bdi)) {
 			struct bio *old_head;
 
 			spin_lock(&device->io_lock);
+
 			old_head = device->pending_bios;
 			device->pending_bios = pending;
 			if (device->pending_bio_tail)
@@ -2125,7 +2130,9 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
+		bio_get(bio);
 		submit_bio(rw, bio);
+		bio_put(bio);
 		return 0;
 	}
 
@@ -2136,6 +2143,7 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	 * on a queue for later
 	 */
 	atomic_inc(&root->fs_info->nr_async_submits);
+	WARN_ON(bio->bi_next);
 	bio->bi_next = NULL;
 	bio->bi_rw |= rw;
 
-- 
cgit v1.2.3


From 7d2b4daa67379960477568abda62b8ba9ee3a8aa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 5 Aug 2008 10:13:57 -0400
Subject: Btrfs: Fix the multi-bio code to save the original bio for completion

The multi-bio code is responsible for duplicating blocks in raid1 and
single spindle duplication.  It has counters to make sure all of
the locations for a given extent are properly written before io completion
is returned to the higher layers.

But, it didn't always complete the same bio it was given, sometimes a
clone was completed instead.  This led to problems with the async
work queues because they saved a pointer to the bio in a struct off
bi_private.

The fix is to remember the original bio and only complete that one.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0de51e37e78..09311b3066d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2070,6 +2070,7 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 {
 	struct btrfs_multi_bio *multi = bio->bi_private;
+	int is_orig_bio = 0;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
@@ -2078,7 +2079,14 @@ static int end_bio_multi_stripe(struct bio *bio,
 	if (err)
 		atomic_inc(&multi->error);
 
+	if (bio == multi->orig_bio)
+		is_orig_bio = 1;
+
 	if (atomic_dec_and_test(&multi->stripes_pending)) {
+		if (!is_orig_bio) {
+			bio_put(bio);
+			bio = multi->orig_bio;
+		}
 		bio->bi_private = multi->private;
 		bio->bi_end_io = multi->end_io;
 		/* only send an error to the higher layers if it is
@@ -2101,7 +2109,7 @@ static int end_bio_multi_stripe(struct bio *bio,
 #else
 		bio_endio(bio, err);
 #endif
-	} else {
+	} else if (!is_orig_bio) {
 		bio_put(bio);
 	}
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
@@ -2196,6 +2204,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	}
 	multi->end_io = first_bio->bi_end_io;
 	multi->private = first_bio->bi_private;
+	multi->orig_bio = first_bio;
 	atomic_set(&multi->stripes_pending, multi->num_stripes);
 
 	while(dev_nr < total_devs) {
-- 
cgit v1.2.3


From 0986fe9eac24fd186927c3b87af51d62f8ab92cd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Aug 2008 15:34:15 -0400
Subject: Btrfs: Count async bios separately from async checksum work items

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 09311b3066d..23a5b0aba00 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -179,7 +179,7 @@ loop:
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
-		atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+		atomic_dec(&device->dev_root->fs_info->nr_async_bios);
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 		bio_get(cur);
@@ -2145,12 +2145,12 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	}
 
 	/*
-	 * nr_async_sumbits allows us to reliably return congestion to the
+	 * nr_async_bios allows us to reliably return congestion to the
 	 * higher layers.  Otherwise, the async bio makes it appear we have
 	 * made progress against dirty pages when we've really just put it
 	 * on a queue for later
 	 */
-	atomic_inc(&root->fs_info->nr_async_submits);
+	atomic_inc(&root->fs_info->nr_async_bios);
 	WARN_ON(bio->bi_next);
 	bio->bi_next = NULL;
 	bio->bi_rw |= rw;
-- 
cgit v1.2.3


From b64a2851ba25b3410a87d3d1b751155612105c8e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 20 Aug 2008 13:39:41 -0400
Subject: Btrfs: Wait for async bio submissions to make some progress at queue
 time

Before, the btrfs bdi congestion function was used to test for too many
async bios.  This keeps that check to throttle pdflush, but also
adds a check while queuing bios.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 23a5b0aba00..2652660e607 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -138,12 +138,18 @@ int run_scheduled_bios(struct btrfs_device *device)
 {
 	struct bio *pending;
 	struct backing_dev_info *bdi;
+	struct btrfs_fs_info *fs_info;
 	struct bio *tail;
 	struct bio *cur;
 	int again = 0;
 	unsigned long num_run = 0;
+	unsigned long limit;
 
 	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+	fs_info = device->dev_root->fs_info;
+	limit = btrfs_async_submit_limit(fs_info);
+	limit = limit * 2 / 3;
+
 loop:
 	spin_lock(&device->io_lock);
 
@@ -179,7 +185,11 @@ loop:
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
-		atomic_dec(&device->dev_root->fs_info->nr_async_bios);
+		atomic_dec(&fs_info->nr_async_bios);
+
+		if (atomic_read(&fs_info->nr_async_bios) < limit &&
+		    waitqueue_active(&fs_info->async_submit_wait))
+			wake_up(&fs_info->async_submit_wait);
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 		bio_get(cur);
@@ -2135,6 +2145,7 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 		 int rw, struct bio *bio)
 {
 	int should_queue = 1;
+	unsigned long limit;
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
@@ -2171,6 +2182,11 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	if (should_queue)
 		btrfs_queue_worker(&root->fs_info->submit_workers,
 				   &device->work);
+
+	limit = btrfs_async_submit_limit(root->fs_info);
+	wait_event_timeout(root->fs_info->async_submit_wait,
+			   (atomic_read(&root->fs_info->nr_async_bios) < limit),
+			   HZ/10);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 9473f16c75606fe6b2e5000525fe9766114505f3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 28 Aug 2008 06:15:24 -0400
Subject: Btrfs: Throttle for async bio submits higher up the chain

The current code waits for the count of async bio submits to get below
a given threshold if it is too high right after adding the latest bio
to the work queue.  This isn't optimal because the caller may have
sequential adjacent bios pending that it is waiting to send down the pipe.

This changeset requires the caller to wait on the async bio count,
and changes the async checksumming submits to wait for async bios any
time they self throttle.

The end result is much higher sequential throughput.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2652660e607..5b1b60839d2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2145,7 +2145,6 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 		 int rw, struct bio *bio)
 {
 	int should_queue = 1;
-	unsigned long limit;
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
@@ -2182,11 +2181,6 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	if (should_queue)
 		btrfs_queue_worker(&root->fs_info->submit_workers,
 				   &device->work);
-
-	limit = btrfs_async_submit_limit(root->fs_info);
-	wait_event_timeout(root->fs_info->async_submit_wait,
-			   (atomic_read(&root->fs_info->nr_async_bios) < limit),
-			   HZ/10);
 	return 0;
 }
 
-- 
cgit v1.2.3


From a1b32a5932cfac7c38b442582285f3da2a09dfd8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 5 Sep 2008 16:09:51 -0400
Subject: Btrfs: Add debugging checks to track down corrupted metadata

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5b1b60839d2..37a8ea23e81 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -94,8 +94,8 @@ int btrfs_cleanup_fs_uuids(void)
 	return 0;
 }
 
-static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
-					  u8 *uuid)
+static noinline struct btrfs_device *__find_device(struct list_head *head,
+						   u64 devid, u8 *uuid)
 {
 	struct btrfs_device *dev;
 	struct list_head *cur;
@@ -110,7 +110,7 @@ static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
 	return NULL;
 }
 
-static struct btrfs_fs_devices *find_fsid(u8 *fsid)
+static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 {
 	struct list_head *cur;
 	struct btrfs_fs_devices *fs_devices;
@@ -134,7 +134,7 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid)
  * the list if the block device is congested.  This way, multiple devices
  * can make progress from a single worker thread.
  */
-int run_scheduled_bios(struct btrfs_device *device)
+static int noinline run_scheduled_bios(struct btrfs_device *device)
 {
 	struct bio *pending;
 	struct backing_dev_info *bdi;
@@ -233,7 +233,7 @@ void pending_bios_fn(struct btrfs_work *work)
 	run_scheduled_bios(device);
 }
 
-static int device_list_add(const char *path,
+static noinline int device_list_add(const char *path,
 			   struct btrfs_super_block *disk_super,
 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
 {
@@ -480,10 +480,10 @@ error:
  * called very infrequently and that a given device has a small number
  * of extents
  */
-static int find_free_dev_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_device *device,
-				struct btrfs_path *path,
-				u64 num_bytes, u64 *start)
+static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
+					 struct btrfs_device *device,
+					 struct btrfs_path *path,
+					 u64 num_bytes, u64 *start)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = device->dev_root;
@@ -645,7 +645,7 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 chunk_tree, u64 chunk_objectid,
 			   u64 chunk_offset,
@@ -693,7 +693,8 @@ err:
 	return ret;
 }
 
-static int find_next_chunk(struct btrfs_root *root, u64 objectid, u64 *offset)
+static noinline int find_next_chunk(struct btrfs_root *root,
+				    u64 objectid, u64 *offset)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -735,8 +736,8 @@ error:
 	return ret;
 }
 
-static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path,
-			   u64 *objectid)
+static noinline int find_next_devid(struct btrfs_root *root,
+				    struct btrfs_path *path, u64 *objectid)
 {
 	int ret;
 	struct btrfs_key key;
@@ -1103,8 +1104,8 @@ out_close_bdev:
 	goto out;
 }
 
-int btrfs_update_device(struct btrfs_trans_handle *trans,
-			struct btrfs_device *device)
+int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
+				 struct btrfs_device *device)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1544,8 +1545,8 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
-			       int sub_stripes)
+static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
+					int num_stripes, int sub_stripes)
 {
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
 		return calc_size;
@@ -2141,8 +2142,9 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
-		 int rw, struct bio *bio)
+static int noinline schedule_bio(struct btrfs_root *root,
+				 struct btrfs_device *device,
+				 int rw, struct bio *bio)
 {
 	int should_queue = 1;
 
-- 
cgit v1.2.3


From 325cd4bafeb6cfb44addd6e807a9b74170d1be31 Mon Sep 17 00:00:00 2001
From: Zheng Yan <zheng.yan@oracle.com>
Date: Fri, 5 Sep 2008 16:43:54 -0400
Subject: Btrfs: properly set blocksize when adding new device.

---

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 37a8ea23e81..1546fa6f4f7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1079,6 +1079,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (ret)
 		goto out_close_bdev;
 
+	set_blocksize(device->bdev, 4096);
+
 	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
 				    total_bytes + device->total_bytes);
-- 
cgit v1.2.3


From 0f9dd46cda36b8de3b9f48bc42bd09d20b9c3b52 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Tue, 23 Sep 2008 13:14:11 -0400
Subject: Btrfs: free space accounting redo

1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size.  The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing.  If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible.  When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.

2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree that tracks block group cache via offset.  Also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.

3) cleaned up the allocation code to make it a little easier to read and a
little less complicated.  Basically there are 3 steps, first look from our
provided hint.  If we couldn't find from that given hint, start back at our
original search start and look for space from there.  If that fails try to
allocate space if we can and start looking again.  If not we're screwed and need
to start over again.

4) small fixes.  there were some issues in volumes.c where we wouldn't allocate
the rest of the disk.  fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space.  Generally with data allocations we don't track
where we last allocated from, so every time we did a data allocation we'd search
through every block group that we have looking for free space.  Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups.  The alloc_hint has fixed
this slight degradation and made things semi-normal.

There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space.  This only happens with metadata
allocations, and only when we are almost full.  So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall.  I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1546fa6f4f7..b9e5c2d82dd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -64,8 +64,8 @@ static void lock_chunks(struct btrfs_root *root)
 
 static void unlock_chunks(struct btrfs_root *root)
 {
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 }
 
 int btrfs_cleanup_fs_uuids(void)
@@ -1668,8 +1668,13 @@ again:
 	else
 		min_free = calc_size;
 
-	/* we add 1MB because we never use the first 1MB of the device */
-	min_free += 1024 * 1024;
+	/*
+	 * we add 1MB because we never use the first 1MB of the device, unless
+	 * we've looped, then we are likely allocating the maximum amount of
+	 * space left already
+	 */
+	if (!looped)
+		min_free += 1024 * 1024;
 
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
-- 
cgit v1.2.3


From 2b1f55b0f0d0d1a66470ef4ea2696cd5dd741a12 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 24 Sep 2008 11:48:04 -0400
Subject: Remove Btrfs compat code for older kernels

Btrfs had compatibility code for kernels back to 2.6.18.  These have
been removed, and will be maintained in a separate backport
git tree from now on.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b9e5c2d82dd..ddf89626498 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2080,20 +2080,11 @@ int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 }
 
 
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_bio_multi_stripe(struct bio *bio, int err)
-#else
-static int end_bio_multi_stripe(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
 {
 	struct btrfs_multi_bio *multi = bio->bi_private;
 	int is_orig_bio = 0;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
 	if (err)
 		atomic_inc(&multi->error);
 
@@ -2122,17 +2113,10 @@ static int end_bio_multi_stripe(struct bio *bio,
 		}
 		kfree(multi);
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-		bio_endio(bio, bio->bi_size, err);
-#else
 		bio_endio(bio, err);
-#endif
 	} else if (!is_orig_bio) {
 		bio_put(bio);
 	}
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
 }
 
 struct async_sched {
@@ -2248,11 +2232,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		} else {
 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
 			bio->bi_sector = logical >> 9;
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-			bio_endio(bio, bio->bi_size, -EIO);
-#else
 			bio_endio(bio, -EIO);
-#endif
 		}
 		dev_nr++;
 	}
-- 
cgit v1.2.3


From 1a40e23b95da45051ee4d74374c58ae87a14051c Mon Sep 17 00:00:00 2001
From: Zheng Yan <zheng.yan@oracle.com>
Date: Fri, 26 Sep 2008 10:09:34 -0400
Subject: Btrfs: update space balancing code

This patch updates the space balancing code to utilize the new
backref format.  Before, btrfs-vol -b would break any COW links
on data blocks or metadata.  This was slow and caused the amount
of space used to explode if a large number of snapshots were present.

The new code can keep the sharing of all data extents and
most of the tree blocks.

To maintain the sharing of data extents, the space balance code uses
a separate inode to hold data extent pointers, then updates the references
to point to the new location.

To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).

To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handling for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ddf89626498..51f113119b2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1268,7 +1268,7 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	em_tree = &root->fs_info->mapping_tree.map_tree;
 
 	/* step one, relocate all the extents inside this chunk */
-	ret = btrfs_shrink_extent_tree(extent_root, chunk_offset);
+	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
 	BUG_ON(ret);
 
 	trans = btrfs_start_transaction(root, 1);
@@ -1308,15 +1308,18 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 		BUG_ON(ret);
 	}
 
+	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+	BUG_ON(ret);
+
 	spin_lock(&em_tree->lock);
 	remove_extent_mapping(em_tree, em);
+	spin_unlock(&em_tree->lock);
+
 	kfree(map);
 	em->bdev = NULL;
 
 	/* once for the tree */
 	free_extent_map(em);
-	spin_unlock(&em_tree->lock);
-
 	/* once for us */
 	free_extent_map(em);
 
-- 
cgit v1.2.3


From 8c8bee1d7ca47fc75b6bd24a8085c525a2394c02 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 29 Sep 2008 11:19:10 -0400
Subject: Btrfs: Wait for IO on the block device inodes of newly added devices

btrfs-vol -a /dev/xxx will zero the first and last two MB of the device.
The kernel code needs to wait for this IO to finish before it adds
the device.

btrfs metadata IO does not happen through the block device inode.  A
separate address space is used, allowing the zero filled buffer heads in
the block device inode to be written to disk after FS metadata starts
going down to the disk via the btrfs metadata inode.

The end result is zero filled metadata blocks after adding new devices
into the filesystem.

The fix is a simple filemap_write_and_wait on the block device inode
before actually inserting it into the pool of available devices.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 51f113119b2..f63cf7621a0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1038,6 +1038,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		return -EIO;
 	}
 
+	filemap_write_and_wait(bdev->bd_inode->i_mapping);
 	mutex_lock(&root->fs_info->volume_mutex);
 
 	trans = btrfs_start_transaction(root, 1);
-- 
cgit v1.2.3


From a62b940160d8125016e85046e68ae621c99e751f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 3 Oct 2008 16:31:08 -0400
Subject: Btrfs: cast bio->bi_sector to a u64 before shifting

On 32 bit machines without CONFIG_LBD, the bi_sector field is only 32 bits.
Btrfs needs to cast it before shifting up, or we end up doing IO into
the wrong place.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f63cf7621a0..2eed7f91f51 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2187,7 +2187,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
 	struct bio *first_bio = bio;
-	u64 logical = bio->bi_sector << 9;
+	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
 	struct btrfs_multi_bio *multi = NULL;
-- 
cgit v1.2.3


From c8b978188c9a0fd3d535c13debd19d522b726f1f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 29 Oct 2008 14:49:59 -0400
Subject: Btrfs: Add zlib compression support

This is a large change for adding compression on reading and writing,
both for inline and regular extents.  It does some fairly large
surgery to the writeback paths.

Compression is off by default and enabled by mount -o compress.  Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.

If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.

* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler.  This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.

* Inline extents are inserted at delalloc time now.  This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.

* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.

From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field.  Neither the encryption nor the
'other' field is currently used.

In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k.  This is a
software only limit, the disk format supports u64 sized compressed extents.

In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k.  This is a software only limit
and will be subject to tuning later.

Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data.  This way additional encodings can be
layered on without having to figure out which encoding to checksum.

Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread.  This makes it tricky to
spread the compression load across all the cpus on the box.  We'll have to
look at parallel pdflush walks of dirty inodes at a later time.

Decompression is hooked into readpages and it does spread across CPUs nicely.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2eed7f91f51..7db4cfd03a9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1816,6 +1816,7 @@ again:
 	em->start = key.offset;
 	em->len = *num_bytes;
 	em->block_start = 0;
+	em->block_len = em->len;
 
 	if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		ret = btrfs_add_system_chunk(trans, chunk_root, &key,
@@ -2323,6 +2324,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	em->start = logical;
 	em->len = length;
 	em->block_start = 0;
+	em->block_len = em->len;
 
 	map->num_stripes = num_stripes;
 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
-- 
cgit v1.2.3


From 2517920135b0d29e70453e5b03d70d7b94207df3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 29 Oct 2008 14:49:05 -0400
Subject: Btrfs: nuke fs wide allocation mutex V2

This patch removes the giant fs_info->alloc_mutex and replaces it with a bunch
of little locks.

There is now a pinned_mutex, which is used when messing with the pinned_extents
extent io tree, and the extent_ins_mutex which is used with the pending_del and
extent_ins extent io trees.

The locking for the extent tree stuff was inspired by a patch that Yan Zheng
wrote to fix a race condition, I cleaned it up some and changed the locking
around a little bit, but the idea remains the same.  Basically instead of
holding the extent_ins_mutex throughout the processing of an extent on the
extent_ins or pending_del trees, we just hold it while we're searching and when
we clear the bits on those trees, and lock the extent for the duration of the
operations on the extent.

Also to keep from getting hung up waiting to lock an extent, I've added a
try_lock_extent so if we cannot lock the extent, move on to the next one in the
tree and we'll come back to that one.  I have tested this heavily and it does
not appear to break anything.  This has to be applied on top of my
find_free_extent redo patch.

I tested this patch on top of Yan's space rebalancing code and it worked fine.
The only thing that has changed since the last version is I pulled out all my
debugging stuff, apparently I forgot to run guilt refresh before I sent the
last patch out.  Thank you,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/volumes.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7db4cfd03a9..cbb9bb31431 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -58,14 +58,12 @@ void btrfs_unlock_volumes(void)
 
 static void lock_chunks(struct btrfs_root *root)
 {
-	mutex_lock(&root->fs_info->alloc_mutex);
 	mutex_lock(&root->fs_info->chunk_mutex);
 }
 
 static void unlock_chunks(struct btrfs_root *root)
 {
 	mutex_unlock(&root->fs_info->chunk_mutex);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 }
 
 int btrfs_cleanup_fs_uuids(void)
-- 
cgit v1.2.3


From 5f2cc086ccab27ac5252b3883ac004347860b4c7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 7 Nov 2008 18:22:45 -0500
Subject: Btrfs: Avoid unplug storms during commit

While doing a commit, btrfs makes sure all the metadata blocks
were properly written to disk, calling wait_on_page_writeback for
each page.  This writeback happens after allowing another transaction
to start, so it competes for the disk with other processes in the FS.

If the page writeback bit is still set, each wait_on_page_writeback might
trigger an unplug, even though the page might be waiting for checksumming
to finish or might be waiting for the async work queue to submit the
bio.

This trades wait_on_page_writeback for waiting on the extent writeback
bits.  It won't trigger any unplugs and substantially improves performance
in a number of workloads.

This also changes the async bio submission to avoid requeueing if there
is only one device.  The requeue just wastes CPU time because there are
no other devices to service.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cbb9bb31431..80a27284dbf 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -200,7 +200,8 @@ loop:
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && bdi_write_congested(bdi)) {
+		if (pending && bdi_write_congested(bdi) &&
+		    fs_info->fs_devices->open_devices > 1) {
 			struct bio *old_head;
 
 			spin_lock(&device->io_lock);
-- 
cgit v1.2.3


From 2b82032c34ec40515d3c45c36cd1961f37977de8 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 17 Nov 2008 21:11:30 -0500
Subject: Btrfs: Seed device support

Seed device is a special btrfs with SEEDING super flag
set and can only be mounted in read-only mode. Seed
devices allow people to create new btrfs on top of it.

The new FS contains the same contents as the seed device,
but it can be mounted in read-write mode.

This patch does the following:

1) split code in btrfs_alloc_chunk into two parts. The first part makes
the newly allocated chunk usable, but does not do any operation that modifies
the chunk tree. The second part does the chunk tree modifications. This
division is for the bootstrap step of adding storage to the seed device.

2) Update device management code to handle seed device.
The basic idea is: For an FS grown from seed devices, its
seed devices are put into a list. Seed devices are
opened on demand at mounting time. If any seed device is
missing or has been changed, btrfs kernel module will
refuse to mount the FS.

3) make btrfs_find_block_group not return NULL when all
block groups are read-only.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/volumes.c | 1131 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 848 insertions(+), 283 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 80a27284dbf..d6f1996de62 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -40,6 +40,12 @@ struct map_lookup {
 	struct btrfs_bio_stripe stripes[];
 };
 
+static int init_first_rw_device(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_device *device);
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+
+
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
@@ -69,25 +75,31 @@ static void unlock_chunks(struct btrfs_root *root)
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
-	struct list_head *uuid_cur;
-	struct list_head *devices_cur;
 	struct btrfs_device *dev;
 
-	list_for_each(uuid_cur, &fs_uuids) {
-		fs_devices = list_entry(uuid_cur, struct btrfs_fs_devices,
-					list);
+	while (!list_empty(&fs_uuids)) {
+		fs_devices = list_entry(fs_uuids.next,
+					struct btrfs_fs_devices, list);
+		list_del(&fs_devices->list);
 		while(!list_empty(&fs_devices->devices)) {
-			devices_cur = fs_devices->devices.next;
-			dev = list_entry(devices_cur, struct btrfs_device,
-					 dev_list);
+			dev = list_entry(fs_devices->devices.next,
+					 struct btrfs_device, dev_list);
 			if (dev->bdev) {
 				close_bdev_excl(dev->bdev);
 				fs_devices->open_devices--;
 			}
+			fs_devices->num_devices--;
+			if (dev->writeable)
+				fs_devices->rw_devices--;
 			list_del(&dev->dev_list);
+			list_del(&dev->dev_alloc_list);
 			kfree(dev->name);
 			kfree(dev);
 		}
+		WARN_ON(fs_devices->num_devices);
+		WARN_ON(fs_devices->open_devices);
+		WARN_ON(fs_devices->rw_devices);
+		kfree(fs_devices);
 	}
 	return 0;
 }
@@ -257,6 +269,9 @@ static noinline int device_list_add(const char *path,
 				       disk_super->dev_item.uuid);
 	}
 	if (!device) {
+		if (fs_devices->opened)
+			return -EBUSY;
+
 		device = kzalloc(sizeof(*device), GFP_NOFS);
 		if (!device) {
 			/* we can safely leave the fs_devices entry around */
@@ -273,8 +288,9 @@ static noinline int device_list_add(const char *path,
 			kfree(device);
 			return -ENOMEM;
 		}
+		INIT_LIST_HEAD(&device->dev_alloc_list);
 		list_add(&device->dev_list, &fs_devices->devices);
-		list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
+		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
 	}
 
@@ -288,58 +304,94 @@ static noinline int device_list_add(const char *path,
 
 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *head = &fs_devices->devices;
+	struct list_head *tmp;
 	struct list_head *cur;
 	struct btrfs_device *device;
+	int seed_devices = 0;
 
 	mutex_lock(&uuid_mutex);
 again:
-	list_for_each(cur, head) {
+	list_for_each_safe(cur, tmp, &fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
-		if (!device->in_fs_metadata) {
-			struct block_device *bdev;
-			list_del(&device->dev_list);
-			list_del(&device->dev_alloc_list);
+		if (device->in_fs_metadata)
+			continue;
+
+		if (device->bdev) {
+			close_bdev_excl(device->bdev);
+			device->bdev = NULL;
+			fs_devices->open_devices--;
+		}
+		if (device->writeable) {
+			list_del_init(&device->dev_alloc_list);
+			device->writeable = 0;
+			fs_devices->rw_devices--;
+		}
+		if (!seed_devices) {
+			list_del_init(&device->dev_list);
 			fs_devices->num_devices--;
-			if (device->bdev) {
-				bdev = device->bdev;
-				fs_devices->open_devices--;
-				mutex_unlock(&uuid_mutex);
-				close_bdev_excl(bdev);
-				mutex_lock(&uuid_mutex);
-			}
 			kfree(device->name);
 			kfree(device);
-			goto again;
 		}
 	}
+
+	if (fs_devices->seed) {
+		fs_devices = fs_devices->seed;
+		seed_devices = 1;
+		goto again;
+	}
+
 	mutex_unlock(&uuid_mutex);
 	return 0;
 }
 
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *head = &fs_devices->devices;
+	struct btrfs_fs_devices *seed_devices;
 	struct list_head *cur;
 	struct btrfs_device *device;
+again:
+	if (--fs_devices->opened > 0)
+		return 0;
 
-	mutex_lock(&uuid_mutex);
-	list_for_each(cur, head) {
+	list_for_each(cur, &fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev) {
 			close_bdev_excl(device->bdev);
 			fs_devices->open_devices--;
 		}
+		if (device->writeable) {
+			list_del_init(&device->dev_alloc_list);
+			fs_devices->rw_devices--;
+		}
+
 		device->bdev = NULL;
+		device->writeable = 0;
 		device->in_fs_metadata = 0;
 	}
-	fs_devices->mounted = 0;
-	mutex_unlock(&uuid_mutex);
+	fs_devices->opened = 0;
+	fs_devices->seeding = 0;
+	fs_devices->sprouted = 0;
+
+	seed_devices = fs_devices->seed;
+	fs_devices->seed = NULL;
+	if (seed_devices) {
+		fs_devices = seed_devices;
+		goto again;
+	}
 	return 0;
 }
 
-int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-		       int flags, void *holder)
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	ret = __btrfs_close_devices(fs_devices);
+	mutex_unlock(&uuid_mutex);
+	return ret;
+}
+
+int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
@@ -350,24 +402,18 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	struct btrfs_super_block *disk_super;
 	u64 latest_devid = 0;
 	u64 latest_transid = 0;
-	u64 transid;
 	u64 devid;
+	int seeding = 1;
 	int ret = 0;
 
-	mutex_lock(&uuid_mutex);
-	if (fs_devices->mounted)
-		goto out;
-
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev)
 			continue;
-
 		if (!device->name)
 			continue;
 
-		bdev = open_bdev_excl(device->name, flags, holder);
-
+		bdev = open_bdev_excl(device->name, MS_RDONLY, holder);
 		if (IS_ERR(bdev)) {
 			printk("open %s failed\n", device->name);
 			goto error;
@@ -387,16 +433,32 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		if (devid != device->devid)
 			goto error_brelse;
 
-		transid = btrfs_super_generation(disk_super);
-		if (!latest_transid || transid > latest_transid) {
+		if (memcmp(device->uuid, disk_super->dev_item.uuid,
+			   BTRFS_UUID_SIZE))
+			goto error_brelse;
+
+		device->generation = btrfs_super_generation(disk_super);
+		if (!latest_transid || device->generation > latest_transid) {
 			latest_devid = devid;
-			latest_transid = transid;
+			latest_transid = device->generation;
 			latest_bdev = bdev;
 		}
 
+		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+			device->writeable = 0;
+		} else {
+			device->writeable = !bdev_read_only(bdev);
+			seeding = 0;
+		}
+
 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
 		fs_devices->open_devices++;
+		if (device->writeable) {
+			fs_devices->rw_devices++;
+			list_add(&device->dev_alloc_list,
+				 &fs_devices->alloc_list);
+		}
 		continue;
 
 error_brelse:
@@ -410,11 +472,32 @@ error:
 		ret = -EIO;
 		goto out;
 	}
-	fs_devices->mounted = 1;
+	fs_devices->seeding = seeding;
+	fs_devices->opened = 1;
 	fs_devices->latest_bdev = latest_bdev;
 	fs_devices->latest_devid = latest_devid;
 	fs_devices->latest_trans = latest_transid;
+	fs_devices->total_rw_bytes = 0;
 out:
+	return ret;
+}
+
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+		       int flags, void *holder)
+{
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	if (fs_devices->opened) {
+		if (fs_devices->sprouted) {
+			ret = -EBUSY;
+		} else {
+			fs_devices->opened++;
+			ret = 0;
+		}
+	} else {
+		ret = __btrfs_open_devices(fs_devices, holder);
+	}
 	mutex_unlock(&uuid_mutex);
 	return ret;
 }
@@ -481,12 +564,12 @@ error:
  */
 static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
 					 struct btrfs_device *device,
-					 struct btrfs_path *path,
 					 u64 num_bytes, u64 *start)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = device->dev_root;
 	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_path *path;
 	u64 hole_size = 0;
 	u64 last_byte = 0;
 	u64 search_start = 0;
@@ -496,8 +579,11 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	int start_found;
 	struct extent_buffer *l;
 
-	start_found = 0;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 	path->reada = 2;
+	start_found = 0;
 
 	/* FIXME use last free of some kind */
 
@@ -581,7 +667,6 @@ check_pending:
 	/* we have to make sure we didn't find an extent that has already
 	 * been allocated by the map tree or the original allocation
 	 */
-	btrfs_release_path(root, path);
 	BUG_ON(*start < search_start);
 
 	if (*start + num_bytes > search_end) {
@@ -589,10 +674,10 @@ check_pending:
 		goto error;
 	}
 	/* check for pending inserts here */
-	return 0;
+	ret = 0;
 
 error:
-	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -644,11 +729,10 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 chunk_tree, u64 chunk_objectid,
-			   u64 chunk_offset,
-			   u64 num_bytes, u64 *start)
+			   u64 chunk_offset, u64 start, u64 num_bytes)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -662,13 +746,8 @@ int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	ret = find_free_dev_extent(trans, device, path, num_bytes, start);
-	if (ret) {
-		goto err;
-	}
-
 	key.objectid = device->devid;
-	key.offset = *start;
+	key.offset = start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      sizeof(*extent));
@@ -687,7 +766,6 @@ int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 
 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
 	btrfs_mark_buffer_dirty(leaf);
-err:
 	btrfs_free_path(path);
 	return ret;
 }
@@ -735,12 +813,18 @@ error:
 	return ret;
 }
 
-static noinline int find_next_devid(struct btrfs_root *root,
-				    struct btrfs_path *path, u64 *objectid)
+static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
 {
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
+	struct btrfs_path *path;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
@@ -763,7 +847,7 @@ static noinline int find_next_devid(struct btrfs_root *root,
 	}
 	ret = 0;
 error:
-	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -781,7 +865,6 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 	unsigned long ptr;
-	u64 free_devid = 0;
 
 	root = root->fs_info->chunk_root;
 
@@ -789,13 +872,9 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	ret = find_next_devid(root, path, &free_devid);
-	if (ret)
-		goto out;
-
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
-	key.offset = free_devid;
+	key.offset = device->devid;
 
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      sizeof(*dev_item));
@@ -805,8 +884,8 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
 
-	device->devid = free_devid;
 	btrfs_set_device_id(leaf, dev_item, device->devid);
+	btrfs_set_device_generation(leaf, dev_item, 0);
 	btrfs_set_device_type(leaf, dev_item, device->type);
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
@@ -819,9 +898,11 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+	ptr = (unsigned long)btrfs_device_fsid(dev_item);
+	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
-	ret = 0;
 
+	ret = 0;
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -832,11 +913,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 {
 	int ret;
 	struct btrfs_path *path;
-	struct block_device *bdev = device->bdev;
-	struct btrfs_device *next_dev;
 	struct btrfs_key key;
-	u64 total_bytes;
-	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_trans_handle *trans;
 
 	root = root->fs_info->chunk_root;
@@ -863,25 +940,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	ret = btrfs_del_item(trans, root, path);
 	if (ret)
 		goto out;
-
-	/*
-	 * at this point, the device is zero sized.  We want to
-	 * remove it from the devices list and zero out the old super
-	 */
-	list_del_init(&device->dev_list);
-	list_del_init(&device->dev_alloc_list);
-	fs_devices = root->fs_info->fs_devices;
-
-	next_dev = list_entry(fs_devices->devices.next, struct btrfs_device,
-			      dev_list);
-	if (bdev == root->fs_info->sb->s_bdev)
-		root->fs_info->sb->s_bdev = next_dev->bdev;
-	if (bdev == fs_devices->latest_bdev)
-		fs_devices->latest_bdev = next_dev->bdev;
-
-	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
-	btrfs_set_super_num_devices(&root->fs_info->super_copy,
-				    total_bytes - 1);
 out:
 	btrfs_free_path(path);
 	unlock_chunks(root);
@@ -892,11 +950,14 @@ out:
 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_device *device;
+	struct btrfs_device *next_device;
 	struct block_device *bdev;
 	struct buffer_head *bh = NULL;
 	struct btrfs_super_block *disk_super;
 	u64 all_avail;
 	u64 devid;
+	u64 num_devices;
+	u8 *dev_uuid;
 	int ret = 0;
 
 	mutex_lock(&uuid_mutex);
@@ -907,14 +968,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->avail_metadata_alloc_bits;
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-	    btrfs_super_num_devices(&root->fs_info->super_copy) <= 4) {
+	    root->fs_info->fs_devices->rw_devices <= 4) {
 		printk("btrfs: unable to go below four devices on raid10\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-	    btrfs_super_num_devices(&root->fs_info->super_copy) <= 2) {
+	    root->fs_info->fs_devices->rw_devices <= 2) {
 		printk("btrfs: unable to go below two devices on raid1\n");
 		ret = -EINVAL;
 		goto out;
@@ -941,15 +1002,15 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			printk("btrfs: no missing devices found to remove\n");
 			goto out;
 		}
-
 	} else {
-		bdev = open_bdev_excl(device_path, 0,
+		bdev = open_bdev_excl(device_path, MS_RDONLY,
 				      root->fs_info->bdev_holder);
 		if (IS_ERR(bdev)) {
 			ret = PTR_ERR(bdev);
 			goto out;
 		}
 
+		set_blocksize(bdev, 4096);
 		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
 		if (!bh) {
 			ret = -EIO;
@@ -957,45 +1018,97 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		}
 		disk_super = (struct btrfs_super_block *)bh->b_data;
 		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-		    sizeof(disk_super->magic))) {
-			ret = -ENOENT;
-			goto error_brelse;
-		}
-		if (memcmp(disk_super->fsid, root->fs_info->fsid,
-			   BTRFS_FSID_SIZE)) {
+			    sizeof(disk_super->magic))) {
 			ret = -ENOENT;
 			goto error_brelse;
 		}
 		devid = le64_to_cpu(disk_super->dev_item.devid);
-		device = btrfs_find_device(root, devid, NULL);
+		dev_uuid = disk_super->dev_item.uuid;
+		device = btrfs_find_device(root, devid, dev_uuid,
+					   disk_super->fsid);
 		if (!device) {
 			ret = -ENOENT;
 			goto error_brelse;
 		}
+	}
 
+	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
+		printk("btrfs: unable to remove the only writeable device\n");
+		ret = -EINVAL;
+		goto error_brelse;
+	}
+
+	if (device->writeable) {
+		list_del_init(&device->dev_alloc_list);
+		root->fs_info->fs_devices->rw_devices--;
 	}
-	root->fs_info->fs_devices->num_devices--;
-	root->fs_info->fs_devices->open_devices--;
 
 	ret = btrfs_shrink_device(device, 0);
 	if (ret)
 		goto error_brelse;
 
-
 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
 	if (ret)
 		goto error_brelse;
 
-	if (bh) {
+	device->in_fs_metadata = 0;
+	if (device->fs_devices == root->fs_info->fs_devices) {
+		list_del_init(&device->dev_list);
+		root->fs_info->fs_devices->num_devices--;
+		if (device->bdev)
+			device->fs_devices->open_devices--;
+	}
+
+	next_device = list_entry(root->fs_info->fs_devices->devices.next,
+				 struct btrfs_device, dev_list);
+	if (device->bdev == root->fs_info->sb->s_bdev)
+		root->fs_info->sb->s_bdev = next_device->bdev;
+	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
+		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+
+	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
+
+	if (device->fs_devices != root->fs_info->fs_devices) {
+		BUG_ON(device->writeable);
+		brelse(bh);
+		if (bdev)
+			close_bdev_excl(bdev);
+
+		if (device->bdev) {
+			close_bdev_excl(device->bdev);
+			device->bdev = NULL;
+			device->fs_devices->open_devices--;
+		}
+		if (device->fs_devices->open_devices == 0) {
+			struct btrfs_fs_devices *fs_devices;
+			fs_devices = root->fs_info->fs_devices;
+			while (fs_devices) {
+				if (fs_devices->seed == device->fs_devices)
+					break;
+				fs_devices = fs_devices->seed;
+			}
+			fs_devices->seed = device->fs_devices->seed;
+			device->fs_devices->seed = NULL;
+			__btrfs_close_devices(device->fs_devices);
+		}
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * at this point, the device is zero sized.  We want to
+	 * remove it from the devices list and zero out the old super
+	 */
+	if (device->writeable) {
 		/* make sure this device isn't detected as part of
 		 * the FS anymore
 		 */
 		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
 		set_buffer_dirty(bh);
 		sync_dirty_buffer(bh);
-
-		brelse(bh);
 	}
+	brelse(bh);
 
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
@@ -1021,6 +1134,129 @@ out:
 	return ret;
 }
 
+/*
+ * does all the dirty work required for changing the file system's UUID.
+ */
+static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root)
+{
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_fs_devices *old_devices;
+	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+	struct btrfs_device *device;
+	u64 super_flags;
+
+	BUG_ON(!mutex_is_locked(&uuid_mutex));
+	if (!fs_devices->seeding || fs_devices->opened != 1)
+		return -EINVAL;
+
+	old_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+	if (!old_devices)
+		return -ENOMEM;
+
+	memcpy(old_devices, fs_devices, sizeof(*old_devices));
+	old_devices->opened = 1;
+	old_devices->sprouted = 1;
+	INIT_LIST_HEAD(&old_devices->devices);
+	INIT_LIST_HEAD(&old_devices->alloc_list);
+	list_splice_init(&fs_devices->devices, &old_devices->devices);
+	list_splice_init(&fs_devices->alloc_list, &old_devices->alloc_list);
+	list_for_each_entry(device, &old_devices->devices, dev_list) {
+		device->fs_devices = old_devices;
+	}
+	list_add(&old_devices->list, &fs_uuids);
+
+	fs_devices->seeding = 0;
+	fs_devices->num_devices = 0;
+	fs_devices->open_devices = 0;
+	fs_devices->seed = old_devices;
+
+	generate_random_uuid(fs_devices->fsid);
+	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	super_flags = btrfs_super_flags(disk_super) &
+		      ~BTRFS_SUPER_FLAG_SEEDING;
+	btrfs_set_super_flags(disk_super, super_flags);
+
+	return 0;
+}
+
+/*
+ * store the expected generation for seed devices in device items.
+ */
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_dev_item *dev_item;
+	struct btrfs_device *device;
+	struct btrfs_key key;
+	u8 fs_uuid[BTRFS_UUID_SIZE];
+	u8 dev_uuid[BTRFS_UUID_SIZE];
+	u64 devid;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	root = root->fs_info->chunk_root;
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.offset = 0;
+	key.type = BTRFS_DEV_ITEM_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+		if (ret < 0)
+			goto error;
+
+		leaf = path->nodes[0];
+next_slot:
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret > 0)
+				break;
+			if (ret < 0)
+				goto error;
+			leaf = path->nodes[0];
+			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+			btrfs_release_path(root, path);
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
+		    key.type != BTRFS_DEV_ITEM_KEY)
+			break;
+
+		dev_item = btrfs_item_ptr(leaf, path->slots[0],
+					  struct btrfs_dev_item);
+		devid = btrfs_device_id(leaf, dev_item);
+		read_extent_buffer(leaf, dev_uuid,
+				   (unsigned long)btrfs_device_uuid(dev_item),
+				   BTRFS_UUID_SIZE);
+		read_extent_buffer(leaf, fs_uuid,
+				   (unsigned long)btrfs_device_fsid(dev_item),
+				   BTRFS_UUID_SIZE);
+		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+		BUG_ON(!device);
+
+		if (device->fs_devices->seeding) {
+			btrfs_set_device_generation(leaf, dev_item,
+						    device->generation);
+			btrfs_mark_buffer_dirty(leaf);
+		}
+
+		path->slots[0]++;
+		goto next_slot;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_trans_handle *trans;
@@ -1028,26 +1264,34 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	struct block_device *bdev;
 	struct list_head *cur;
 	struct list_head *devices;
+	struct super_block *sb = root->fs_info->sb;
 	u64 total_bytes;
+	int seeding_dev = 0;
 	int ret = 0;
 
+	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
+		return -EINVAL;
 
 	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
 	if (!bdev) {
 		return -EIO;
 	}
 
+	if (root->fs_info->fs_devices->seeding) {
+		seeding_dev = 1;
+		down_write(&sb->s_umount);
+		mutex_lock(&uuid_mutex);
+	}
+
 	filemap_write_and_wait(bdev->bd_inode->i_mapping);
 	mutex_lock(&root->fs_info->volume_mutex);
 
-	trans = btrfs_start_transaction(root, 1);
-	lock_chunks(root);
 	devices = &root->fs_info->fs_devices->devices;
 	list_for_each(cur, devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev == bdev) {
 			ret = -EEXIST;
-			goto out;
+			goto error;
 		}
 	}
 
@@ -1055,18 +1299,31 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (!device) {
 		/* we can safely leave the fs_devices entry around */
 		ret = -ENOMEM;
-		goto out_close_bdev;
+		goto error;
 	}
 
-	device->barriers = 1;
-	device->work.func = pending_bios_fn;
-	generate_random_uuid(device->uuid);
-	spin_lock_init(&device->io_lock);
 	device->name = kstrdup(device_path, GFP_NOFS);
 	if (!device->name) {
 		kfree(device);
-		goto out_close_bdev;
+		ret = -ENOMEM;
+		goto error;
 	}
+
+	ret = find_next_devid(root, &device->devid);
+	if (ret) {
+		kfree(device);
+		goto error;
+	}
+
+	trans = btrfs_start_transaction(root, 1);
+	lock_chunks(root);
+
+	device->barriers = 1;
+	device->writeable = 1;
+	device->work.func = pending_bios_fn;
+	generate_random_uuid(device->uuid);
+	spin_lock_init(&device->io_lock);
+	device->generation = trans->transid;
 	device->io_width = root->sectorsize;
 	device->io_align = root->sectorsize;
 	device->sector_size = root->sectorsize;
@@ -1074,12 +1331,22 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
 	device->in_fs_metadata = 1;
+	set_blocksize(device->bdev, 4096);
 
-	ret = btrfs_add_device(trans, root, device);
-	if (ret)
-		goto out_close_bdev;
+	if (seeding_dev) {
+		sb->s_flags &= ~MS_RDONLY;
+		ret = btrfs_prepare_sprout(trans, root);
+		BUG_ON(ret);
+	}
 
-	set_blocksize(device->bdev, 4096);
+	device->fs_devices = root->fs_info->fs_devices;
+	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+	list_add(&device->dev_alloc_list,
+		 &root->fs_info->fs_devices->alloc_list);
+	root->fs_info->fs_devices->num_devices++;
+	root->fs_info->fs_devices->open_devices++;
+	root->fs_info->fs_devices->rw_devices++;
+	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
 	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
@@ -1089,20 +1356,34 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	btrfs_set_super_num_devices(&root->fs_info->super_copy,
 				    total_bytes + 1);
 
-	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
-	list_add(&device->dev_alloc_list,
-		 &root->fs_info->fs_devices->alloc_list);
-	root->fs_info->fs_devices->num_devices++;
-	root->fs_info->fs_devices->open_devices++;
-out:
+	if (seeding_dev) {
+		ret = init_first_rw_device(trans, root, device);
+		BUG_ON(ret);
+		ret = btrfs_finish_sprout(trans, root);
+		BUG_ON(ret);
+	} else {
+		ret = btrfs_add_device(trans, root, device);
+	}
+
 	unlock_chunks(root);
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->volume_mutex);
+	btrfs_commit_transaction(trans, root);
 
-	return ret;
+	if (seeding_dev) {
+		mutex_unlock(&uuid_mutex);
+		up_write(&sb->s_umount);
 
-out_close_bdev:
+		ret = btrfs_relocate_sys_chunks(root);
+		BUG_ON(ret);
+	}
+out:
+	mutex_unlock(&root->fs_info->volume_mutex);
+	return ret;
+error:
 	close_bdev_excl(bdev);
+	if (seeding_dev) {
+		mutex_unlock(&uuid_mutex);
+		up_write(&sb->s_umount);
+	}
 	goto out;
 }
 
@@ -1160,7 +1441,15 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 diff = new_size - device->total_bytes;
 
+	if (!device->writeable)
+		return -EACCES;
+	if (new_size <= device->total_bytes)
+		return -EINVAL;
+
 	btrfs_set_super_total_bytes(super_copy, old_total + diff);
+	device->fs_devices->total_rw_bytes += diff;
+
+	device->total_bytes = new_size;
 	return btrfs_update_device(trans, device);
 }
 
@@ -1248,7 +1537,6 @@ int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 	return ret;
 }
 
-
 int btrfs_relocate_chunk(struct btrfs_root *root,
 			 u64 chunk_tree, u64 chunk_objectid,
 			 u64 chunk_offset)
@@ -1308,24 +1596,82 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 		BUG_ON(ret);
 	}
 
-	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
-	BUG_ON(ret);
+	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+	BUG_ON(ret);
+
+	spin_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	spin_unlock(&em_tree->lock);
+
+	kfree(map);
+	em->bdev = NULL;
+
+	/* once for the tree */
+	free_extent_map(em);
+	/* once for us */
+	free_extent_map(em);
+
+	unlock_chunks(root);
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+{
+	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_chunk *chunk;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	u64 chunk_tree = chunk_root->root_key.objectid;
+	u64 chunk_type;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.offset = (u64)-1;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+		if (ret < 0)
+			goto error;
+		BUG_ON(ret == 0);
+
+		ret = btrfs_previous_item(chunk_root, path, key.objectid,
+					  key.type);
+		if (ret < 0)
+			goto error;
+		if (ret > 0)
+			break;
 
-	spin_lock(&em_tree->lock);
-	remove_extent_mapping(em_tree, em);
-	spin_unlock(&em_tree->lock);
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
-	kfree(map);
-	em->bdev = NULL;
+		chunk = btrfs_item_ptr(leaf, path->slots[0],
+				       struct btrfs_chunk);
+		chunk_type = btrfs_chunk_type(leaf, chunk);
+		btrfs_release_path(chunk_root, path);
 
-	/* once for the tree */
-	free_extent_map(em);
-	/* once for us */
-	free_extent_map(em);
+		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
+			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
+						   found_key.objectid,
+						   found_key.offset);
+			BUG_ON(ret);
+		}
 
-	unlock_chunks(root);
-	btrfs_end_transaction(trans, root);
-	return 0;
+		if (found_key.offset == 0)
+			break;
+		key.offset = found_key.offset - 1;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
 }
 
 static u64 div_factor(u64 num, int factor)
@@ -1337,7 +1683,6 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
-
 int btrfs_balance(struct btrfs_root *dev_root)
 {
 	int ret;
@@ -1353,6 +1698,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key found_key;
 
+	if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
 
 	mutex_lock(&dev_root->fs_info->volume_mutex);
 	dev_root = dev_root->fs_info->dev_root;
@@ -1363,7 +1710,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		old_size = device->total_bytes;
 		size_to_free = div_factor(old_size, 1);
 		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
-		if (device->total_bytes - device->bytes_used > size_to_free)
+		if (!device->writeable ||
+		    device->total_bytes - device->bytes_used > size_to_free)
 			continue;
 
 		ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -1453,6 +1801,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 diff = device->total_bytes - new_size;
 
+	if (new_size >= device->total_bytes)
+		return -EINVAL;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1469,6 +1819,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	lock_chunks(root);
 
 	device->total_bytes = new_size;
+	if (device->writeable)
+		device->fs_devices->total_rw_bytes -= diff;
 	ret = btrfs_update_device(trans, device);
 	if (ret) {
 		unlock_chunks(root);
@@ -1561,32 +1913,27 @@ static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
 		return calc_size * num_stripes;
 }
 
-
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *extent_root, u64 *start,
-		      u64 *num_bytes, u64 type)
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root,
+			       struct map_lookup **map_ret,
+			       u64 *num_bytes, u64 *stripe_size,
+			       u64 start, u64 type)
 {
-	u64 dev_offset;
 	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
-	struct btrfs_path *path;
-	struct btrfs_stripe *stripes;
 	struct btrfs_device *device = NULL;
-	struct btrfs_chunk *chunk;
-	struct list_head private_devs;
-	struct list_head *dev_list;
+	struct btrfs_fs_devices *fs_devices = info->fs_devices;
 	struct list_head *cur;
+	struct map_lookup *map = NULL;
 	struct extent_map_tree *em_tree;
-	struct map_lookup *map;
 	struct extent_map *em;
+	struct list_head private_devs;
 	int min_stripe_size = 1 * 1024 * 1024;
-	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
 	u64 max_chunk_size = calc_size;
 	u64 min_free;
 	u64 avail;
 	u64 max_avail = 0;
-	u64 percent_max;
+	u64 dev_offset;
 	int num_stripes = 1;
 	int min_stripes = 1;
 	int sub_stripes = 0;
@@ -1594,19 +1941,17 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int ret;
 	int index;
 	int stripe_len = 64 * 1024;
-	struct btrfs_key key;
 
 	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
 	    (type & BTRFS_BLOCK_GROUP_DUP)) {
 		WARN_ON(1);
 		type &= ~BTRFS_BLOCK_GROUP_DUP;
 	}
-	dev_list = &extent_root->fs_info->fs_devices->alloc_list;
-	if (list_empty(dev_list))
+	if (list_empty(&fs_devices->alloc_list))
 		return -ENOSPC;
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-		num_stripes = extent_root->fs_info->fs_devices->open_devices;
+		num_stripes = fs_devices->rw_devices;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
@@ -1614,14 +1959,13 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
-		num_stripes = min_t(u64, 2,
-			    extent_root->fs_info->fs_devices->open_devices);
+		num_stripes = min_t(u64, 2, fs_devices->rw_devices);
 		if (num_stripes < 2)
 			return -ENOSPC;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-		num_stripes = extent_root->fs_info->fs_devices->open_devices;
+		num_stripes = fs_devices->rw_devices;
 		if (num_stripes < 4)
 			return -ENOSPC;
 		num_stripes &= ~(u32)1;
@@ -1641,15 +1985,19 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		min_stripe_size = 1 * 1024 * 1024;
 	}
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	/* we don't want a chunk larger than 10% of the FS */
-	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
-	max_chunk_size = min(percent_max, max_chunk_size);
+	/* we don't want a chunk larger than 10% of writeable space */
+	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+			     max_chunk_size);
 
 again:
+	if (!map || map->num_stripes != num_stripes) {
+		kfree(map);
+		map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+		if (!map)
+			return -ENOMEM;
+		map->num_stripes = num_stripes;
+	}
+
 	if (calc_size * num_stripes > max_chunk_size) {
 		calc_size = max_chunk_size;
 		do_div(calc_size, num_stripes);
@@ -1662,8 +2010,7 @@ again:
 	do_div(calc_size, stripe_len);
 	calc_size *= stripe_len;
 
-	INIT_LIST_HEAD(&private_devs);
-	cur = dev_list->next;
+	cur = fs_devices->alloc_list.next;
 	index = 0;
 
 	if (type & BTRFS_BLOCK_GROUP_DUP)
@@ -1679,10 +2026,10 @@ again:
 	if (!looped)
 		min_free += 1024 * 1024;
 
-	/* build a private list of devices we will allocate from */
+	INIT_LIST_HEAD(&private_devs);
 	while(index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
-
+		BUG_ON(!device->writeable);
 		if (device->total_bytes > device->bytes_used)
 			avail = device->total_bytes - device->bytes_used;
 		else
@@ -1690,24 +2037,28 @@ again:
 		cur = cur->next;
 
 		if (device->in_fs_metadata && avail >= min_free) {
-			u64 ignored_start = 0;
-			ret = find_free_dev_extent(trans, device, path,
-						   min_free,
-						   &ignored_start);
+			ret = find_free_dev_extent(trans, device,
+						   min_free, &dev_offset);
 			if (ret == 0) {
 				list_move_tail(&device->dev_alloc_list,
 					       &private_devs);
+				map->stripes[index].dev = device;
+				map->stripes[index].physical = dev_offset;
 				index++;
-				if (type & BTRFS_BLOCK_GROUP_DUP)
+				if (type & BTRFS_BLOCK_GROUP_DUP) {
+					map->stripes[index].dev = device;
+					map->stripes[index].physical =
+						dev_offset + calc_size;
 					index++;
+				}
 			}
 		} else if (device->in_fs_metadata && avail > max_avail)
 			max_avail = avail;
-		if (cur == dev_list)
+		if (cur == &fs_devices->alloc_list)
 			break;
 	}
+	list_splice(&private_devs, &fs_devices->alloc_list);
 	if (index < num_stripes) {
-		list_splice(&private_devs, dev_list);
 		if (index >= min_stripes) {
 			num_stripes = index;
 			if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -1722,115 +2073,246 @@ again:
 			calc_size = max_avail;
 			goto again;
 		}
-		btrfs_free_path(path);
+		kfree(map);
 		return -ENOSPC;
 	}
-	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
-	key.type = BTRFS_CHUNK_ITEM_KEY;
-	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-			      &key.offset);
-	if (ret) {
-		btrfs_free_path(path);
-		return ret;
-	}
+	map->sector_size = extent_root->sectorsize;
+	map->stripe_len = stripe_len;
+	map->io_align = stripe_len;
+	map->io_width = stripe_len;
+	map->type = type;
+	map->num_stripes = num_stripes;
+	map->sub_stripes = sub_stripes;
 
-	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
-	if (!chunk) {
-		btrfs_free_path(path);
-		return -ENOMEM;
-	}
+	*map_ret = map;
+	*stripe_size = calc_size;
+	*num_bytes = chunk_bytes_by_type(type, calc_size,
+					 num_stripes, sub_stripes);
 
-	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
-	if (!map) {
-		kfree(chunk);
-		btrfs_free_path(path);
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em) {
+		kfree(map);
 		return -ENOMEM;
 	}
-	btrfs_free_path(path);
-	path = NULL;
+	em->bdev = (struct block_device *)map;
+	em->start = start;
+	em->len = *num_bytes;
+	em->block_start = 0;
+	em->block_len = em->len;
 
-	stripes = &chunk->stripe;
-	*num_bytes = chunk_bytes_by_type(type, calc_size,
-					 num_stripes, sub_stripes);
+	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	spin_lock(&em_tree->lock);
+	ret = add_extent_mapping(em_tree, em);
+	spin_unlock(&em_tree->lock);
+	BUG_ON(ret);
+	free_extent_map(em);
 
-	index = 0;
-	while(index < num_stripes) {
-		struct btrfs_stripe *stripe;
-		BUG_ON(list_empty(&private_devs));
-		cur = private_devs.next;
-		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
+	ret = btrfs_make_block_group(trans, extent_root, 0, type,
+				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+				     start, *num_bytes);
+	BUG_ON(ret);
 
-		/* loop over this device again if we're doing a dup group */
-		if (!(type & BTRFS_BLOCK_GROUP_DUP) ||
-		    (index == num_stripes - 1))
-			list_move_tail(&device->dev_alloc_list, dev_list);
+	index = 0;
+	while (index < map->num_stripes) {
+		device = map->stripes[index].dev;
+		dev_offset = map->stripes[index].physical;
 
 		ret = btrfs_alloc_dev_extent(trans, device,
-			     info->chunk_root->root_key.objectid,
-			     BTRFS_FIRST_CHUNK_TREE_OBJECTID, key.offset,
-			     calc_size, &dev_offset);
+				info->chunk_root->root_key.objectid,
+				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+				start, dev_offset, calc_size);
 		BUG_ON(ret);
-		device->bytes_used += calc_size;
+		index++;
+	}
+
+	return 0;
+}
+
+static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+				struct btrfs_root *extent_root,
+				struct map_lookup *map, u64 chunk_offset,
+				u64 chunk_size, u64 stripe_size)
+{
+	u64 dev_offset;
+	struct btrfs_key key;
+	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	struct btrfs_device *device;
+	struct btrfs_chunk *chunk;
+	struct btrfs_stripe *stripe;
+	size_t item_size = btrfs_chunk_item_size(map->num_stripes);
+	int index = 0;
+	int ret;
+
+	chunk = kzalloc(item_size, GFP_NOFS);
+	if (!chunk)
+		return -ENOMEM;
+
+	index = 0;
+	while (index < map->num_stripes) {
+		device = map->stripes[index].dev;
+		device->bytes_used += stripe_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
+		index++;
+	}
+
+	index = 0;
+	stripe = &chunk->stripe;
+	while (index < map->num_stripes) {
+		device = map->stripes[index].dev;
+		dev_offset = map->stripes[index].physical;
 
-		map->stripes[index].dev = device;
-		map->stripes[index].physical = dev_offset;
-		stripe = stripes + index;
 		btrfs_set_stack_stripe_devid(stripe, device->devid);
 		btrfs_set_stack_stripe_offset(stripe, dev_offset);
 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
-		physical = dev_offset;
+		stripe++;
 		index++;
 	}
-	BUG_ON(!list_empty(&private_devs));
 
-	/* key was set above */
-	btrfs_set_stack_chunk_length(chunk, *num_bytes);
+	btrfs_set_stack_chunk_length(chunk, chunk_size);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
-	btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
-	btrfs_set_stack_chunk_type(chunk, type);
-	btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
-	btrfs_set_stack_chunk_io_align(chunk, stripe_len);
-	btrfs_set_stack_chunk_io_width(chunk, stripe_len);
+	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_type(chunk, map->type);
+	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
+	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
-	btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
-	map->sector_size = extent_root->sectorsize;
-	map->stripe_len = stripe_len;
-	map->io_align = stripe_len;
-	map->io_width = stripe_len;
-	map->type = type;
-	map->num_stripes = num_stripes;
-	map->sub_stripes = sub_stripes;
+	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
 
-	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
-				btrfs_chunk_item_size(num_stripes));
-	BUG_ON(ret);
-	*start = key.offset;;
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	key.offset = chunk_offset;
 
-	em = alloc_extent_map(GFP_NOFS);
-	if (!em)
-		return -ENOMEM;
-	em->bdev = (struct block_device *)map;
-	em->start = key.offset;
-	em->len = *num_bytes;
-	em->block_start = 0;
-	em->block_len = em->len;
+	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
+	BUG_ON(ret);
 
-	if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		ret = btrfs_add_system_chunk(trans, chunk_root, &key,
-				    chunk, btrfs_chunk_item_size(num_stripes));
+	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+					     item_size);
 		BUG_ON(ret);
 	}
 	kfree(chunk);
+	return 0;
+}
 
-	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
-	spin_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em);
-	spin_unlock(&em_tree->lock);
+/*
+ * Chunk allocation falls into two parts. The first part does the work
+ * that makes the newly allocated chunk usable, but performs no operation
+ * that modifies the chunk tree. The second part does the work that
+ * requires modifying the chunk tree. This division is important for the
+ * bootstrap process of adding storage to a seed btrfs.
+ */
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *extent_root, u64 type)
+{
+	u64 chunk_offset;
+	u64 chunk_size;
+	u64 stripe_size;
+	struct map_lookup *map;
+	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	int ret;
+
+	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+			      &chunk_offset);
+	if (ret)
+		return ret;
+
+	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
+				  &stripe_size, chunk_offset, type);
+	if (ret)
+		return ret;
+
+	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
+				   chunk_size, stripe_size);
+	BUG_ON(ret);
+	return 0;
+}
+
+static int noinline init_first_rw_device(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 struct btrfs_device *device)
+{
+	u64 chunk_offset;
+	u64 sys_chunk_offset;
+	u64 chunk_size;
+	u64 sys_chunk_size;
+	u64 stripe_size;
+	u64 sys_stripe_size;
+	u64 alloc_profile;
+	struct map_lookup *map;
+	struct map_lookup *sys_map;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	int ret;
+
+	ret = find_next_chunk(fs_info->chunk_root,
+			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
+	BUG_ON(ret);
+
+	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
+			(fs_info->metadata_alloc_profile &
+			 fs_info->avail_metadata_alloc_bits);
+	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+
+	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
+				  &stripe_size, chunk_offset, alloc_profile);
+	BUG_ON(ret);
+
+	sys_chunk_offset = chunk_offset + chunk_size;
+
+	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
+			(fs_info->system_alloc_profile &
+			 fs_info->avail_system_alloc_bits);
+	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+
+	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
+				  &sys_chunk_size, &sys_stripe_size,
+				  sys_chunk_offset, alloc_profile);
+	BUG_ON(ret);
+
+	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
+	BUG_ON(ret);
+
+	/*
+	 * Modifying the chunk tree requires allocating new blocks from both
+	 * the system block group and the metadata block group, so operations
+	 * that modify the chunk tree can only be done after both block
+	 * groups have been created.
+	 */
+	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
+				   chunk_size, stripe_size);
+	BUG_ON(ret);
+
+	ret = __finish_chunk_alloc(trans, extent_root, sys_map,
+				   sys_chunk_offset, sys_chunk_size,
+				   sys_stripe_size);
 	BUG_ON(ret);
+	return 0;
+}
+
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	int readonly = 0;
+	int i;
+
+	spin_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+	spin_unlock(&map_tree->map_tree.lock);
+	if (!em)
+		return 1;
+
+	map = (struct map_lookup *)em->bdev;
+	for (i = 0; i < map->num_stripes; i++) {
+		if (!map->stripes[i].dev->writeable) {
+			readonly = 1;
+			break;
+		}
+	}
 	free_extent_map(em);
-	return ret;
+	return readonly;
 }
 
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
@@ -2227,6 +2709,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
 		dev = multi->stripes[dev_nr].dev;
+		BUG_ON(rw == WRITE && !dev->writeable);
 		if (dev && dev->bdev) {
 			bio->bi_bdev = dev->bdev;
 			if (async_submit)
@@ -2246,11 +2729,23 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 }
 
 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
-				       u8 *uuid)
+				       u8 *uuid, u8 *fsid)
 {
-	struct list_head *head = &root->fs_info->fs_devices->devices;
-
-	return __find_device(head, devid, uuid);
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *cur_devices;
+
+	cur_devices = root->fs_info->fs_devices;
+	while (cur_devices) {
+		if (!fsid ||
+		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+			device = __find_device(&cur_devices->devices,
+					       devid, uuid);
+			if (device)
+				return device;
+		}
+		cur_devices = cur_devices->seed;
+	}
+	return NULL;
 }
 
 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
@@ -2262,8 +2757,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device = kzalloc(sizeof(*device), GFP_NOFS);
 	list_add(&device->dev_list,
 		 &fs_devices->devices);
-	list_add(&device->dev_alloc_list,
-		 &fs_devices->alloc_list);
 	device->barriers = 1;
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
@@ -2274,7 +2767,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	return device;
 }
 
-
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 			  struct extent_buffer *leaf,
 			  struct btrfs_chunk *chunk)
@@ -2339,8 +2831,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		read_extent_buffer(leaf, uuid, (unsigned long)
 				   btrfs_stripe_dev_uuid_nr(chunk, i),
 				   BTRFS_UUID_SIZE);
-		map->stripes[i].dev = btrfs_find_device(root, devid, uuid);
-
+		map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
+							NULL);
 		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
 			kfree(map);
 			free_extent_map(em);
@@ -2387,6 +2879,50 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 	return 0;
 }
 
+static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
+{
+	struct btrfs_fs_devices *fs_devices;
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+
+	fs_devices = root->fs_info->fs_devices->seed;
+	while (fs_devices) {
+		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+			ret = 0;
+			goto out;
+		}
+		fs_devices = fs_devices->seed;
+	}
+
+	fs_devices = find_fsid(fsid);
+	if (!fs_devices) {
+		ret = -ENOENT;
+		goto out;
+	}
+	if (fs_devices->opened) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = __btrfs_open_devices(fs_devices, root->fs_info->bdev_holder);
+	if (ret)
+		goto out;
+
+	if (!fs_devices->seeding) {
+		__btrfs_close_devices(fs_devices);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	fs_devices->seed = root->fs_info->fs_devices->seed;
+	root->fs_info->fs_devices->seed = fs_devices;
+	fs_devices->sprouted = 1;
+out:
+	mutex_unlock(&uuid_mutex);
+	return ret;
+}
+
 static int read_one_dev(struct btrfs_root *root,
 			struct extent_buffer *leaf,
 			struct btrfs_dev_item *dev_item)
@@ -2394,23 +2930,50 @@ static int read_one_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
+	int seed_devices = 0;
+	u8 fs_uuid[BTRFS_UUID_SIZE];
 	u8 dev_uuid[BTRFS_UUID_SIZE];
 
 	devid = btrfs_device_id(leaf, dev_item);
 	read_extent_buffer(leaf, dev_uuid,
 			   (unsigned long)btrfs_device_uuid(dev_item),
 			   BTRFS_UUID_SIZE);
-	device = btrfs_find_device(root, devid, dev_uuid);
-	if (!device) {
-		printk("warning devid %Lu missing\n", devid);
-		device = add_missing_dev(root, devid, dev_uuid);
-		if (!device)
-			return -ENOMEM;
+	read_extent_buffer(leaf, fs_uuid,
+			   (unsigned long)btrfs_device_fsid(dev_item),
+			   BTRFS_UUID_SIZE);
+
+	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
+		ret = open_seed_devices(root, fs_uuid);
+		if (ret)
+			return ret;
+		seed_devices = 1;
+	}
+
+	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+	if (!device || !device->bdev) {
+		if (!btrfs_test_opt(root, DEGRADED) || seed_devices)
+			return -EIO;
+
+		if (!device) {
+			printk("warning devid %Lu missing\n", devid);
+			device = add_missing_dev(root, devid, dev_uuid);
+			if (!device)
+				return -ENOMEM;
+		}
+	}
+
+	if (device->fs_devices != root->fs_info->fs_devices) {
+		BUG_ON(device->writeable);
+		if (device->generation !=
+		    btrfs_device_generation(leaf, dev_item))
+			return -EINVAL;
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
 	device->in_fs_metadata = 1;
+	if (device->writeable)
+		device->fs_devices->total_rw_bytes += device->total_bytes;
 	ret = 0;
 #if 0
 	ret = btrfs_open_device(device);
@@ -2528,12 +3091,15 @@ again:
 				dev_item = btrfs_item_ptr(leaf, slot,
 						  struct btrfs_dev_item);
 				ret = read_one_dev(root, leaf, dev_item);
-				BUG_ON(ret);
+				if (ret)
+					goto error;
 			}
 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
 			struct btrfs_chunk *chunk;
 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
 			ret = read_one_chunk(root, &found_key, leaf, chunk);
+			if (ret)
+				goto error;
 		}
 		path->slots[0]++;
 	}
@@ -2542,9 +3108,8 @@ again:
 		btrfs_release_path(root, path);
 		goto again;
 	}
-
-	btrfs_free_path(path);
 	ret = 0;
 error:
+	btrfs_free_path(path);
 	return ret;
 }
-- 
cgit v1.2.3


From 7cbd8a839118eede2393f0926b8e15027162bcd6 Mon Sep 17 00:00:00 2001
From: yanhai zhu <zhu.yanhai@gmail.com>
Date: Wed, 12 Nov 2008 14:38:54 -0500
Subject: Btrfs: Add a missing return pointer check

Add a missing kzalloc() return pointer check in add_missing_dev().

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d6f1996de62..ecf0633ab8c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2755,6 +2755,8 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 
 	device = kzalloc(sizeof(*device), GFP_NOFS);
+	if (!device)
+		return NULL;
 	list_add(&device->dev_list,
 		 &fs_devices->devices);
 	device->barriers = 1;
-- 
cgit v1.2.3


From 15916de835a683bd8133a0d1ac0c982b795ab4ff Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 19 Nov 2008 21:17:22 -0500
Subject: Btrfs: Fixes for 2.6.28-rc API changes

* open/close_bdev_excl -> open/close_bdev_exclusive
* blkdev_issue_discard takes a GFP mask now
* Fix blkdev_issue_discard usage now that it is enabled

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ecf0633ab8c..c3ee63f92a5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -85,7 +85,7 @@ int btrfs_cleanup_fs_uuids(void)
 			dev = list_entry(fs_devices->devices.next,
 					 struct btrfs_device, dev_list);
 			if (dev->bdev) {
-				close_bdev_excl(dev->bdev);
+				close_bdev_exclusive(dev->bdev, dev->mode);
 				fs_devices->open_devices--;
 			}
 			fs_devices->num_devices--;
@@ -317,7 +317,7 @@ again:
 			continue;
 
 		if (device->bdev) {
-			close_bdev_excl(device->bdev);
+			close_bdev_exclusive(device->bdev, device->mode);
 			device->bdev = NULL;
 			fs_devices->open_devices--;
 		}
@@ -356,7 +356,7 @@ again:
 	list_for_each(cur, &fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev) {
-			close_bdev_excl(device->bdev);
+			close_bdev_exclusive(device->bdev, device->mode);
 			fs_devices->open_devices--;
 		}
 		if (device->writeable) {
@@ -391,7 +391,8 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 	return ret;
 }
 
-int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
+int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+			 int flags, void *holder)
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
@@ -413,7 +414,7 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
 		if (!device->name)
 			continue;
 
-		bdev = open_bdev_excl(device->name, MS_RDONLY, holder);
+		bdev = open_bdev_exclusive(device->name, flags, holder);
 		if (IS_ERR(bdev)) {
 			printk("open %s failed\n", device->name);
 			goto error;
@@ -453,6 +454,8 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
 
 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
+		device->mode = flags;
+
 		fs_devices->open_devices++;
 		if (device->writeable) {
 			fs_devices->rw_devices++;
@@ -464,7 +467,7 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
 error_brelse:
 		brelse(bh);
 error_close:
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, MS_RDONLY);
 error:
 		continue;
 	}
@@ -496,7 +499,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			ret = 0;
 		}
 	} else {
-		ret = __btrfs_open_devices(fs_devices, holder);
+		ret = __btrfs_open_devices(fs_devices, flags, holder);
 	}
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -514,7 +517,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 
 	mutex_lock(&uuid_mutex);
 
-	bdev = open_bdev_excl(path, flags, holder);
+	bdev = open_bdev_exclusive(path, flags, holder);
 
 	if (IS_ERR(bdev)) {
 		ret = PTR_ERR(bdev);
@@ -551,7 +554,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 error_brelse:
 	brelse(bh);
 error_close:
-	close_bdev_excl(bdev);
+	close_bdev_exclusive(bdev, flags);
 error:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -1003,7 +1006,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			goto out;
 		}
 	} else {
-		bdev = open_bdev_excl(device_path, MS_RDONLY,
+		bdev = open_bdev_exclusive(device_path, MS_RDONLY,
 				      root->fs_info->bdev_holder);
 		if (IS_ERR(bdev)) {
 			ret = PTR_ERR(bdev);
@@ -1073,10 +1076,10 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		BUG_ON(device->writeable);
 		brelse(bh);
 		if (bdev)
-			close_bdev_excl(bdev);
+			close_bdev_exclusive(bdev, MS_RDONLY);
 
 		if (device->bdev) {
-			close_bdev_excl(device->bdev);
+			close_bdev_exclusive(device->bdev, device->mode);
 			device->bdev = NULL;
 			device->fs_devices->open_devices--;
 		}
@@ -1112,11 +1115,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
-		close_bdev_excl(device->bdev);
+		close_bdev_exclusive(device->bdev, device->mode);
 	}
 	if (bdev) {
 		/* one close for us */
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, MS_RDONLY);
 	}
 	kfree(device->name);
 	kfree(device);
@@ -1127,7 +1130,7 @@ error_brelse:
 	brelse(bh);
 error_close:
 	if (bdev)
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, MS_RDONLY);
 out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
@@ -1272,7 +1275,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
 		return -EINVAL;
 
-	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
+	bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
 	if (!bdev) {
 		return -EIO;
 	}
@@ -1331,6 +1334,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
 	device->in_fs_metadata = 1;
+	device->mode = 0;
 	set_blocksize(device->bdev, 4096);
 
 	if (seeding_dev) {
@@ -1379,7 +1383,7 @@ out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	return ret;
 error:
-	close_bdev_excl(bdev);
+	close_bdev_exclusive(bdev, 0);
 	if (seeding_dev) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
@@ -2907,7 +2911,8 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 		goto out;
 	}
 
-	ret = __btrfs_open_devices(fs_devices, root->fs_info->bdev_holder);
+	ret = __btrfs_open_devices(fs_devices, MS_RDONLY,
+				   root->fs_info->bdev_holder);
 	if (ret)
 		goto out;
 
-- 
cgit v1.2.3


From 4b4e25f2a6ddb070bab7f7dd2bd2926fb8db9e04 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 20 Nov 2008 10:22:27 -0500
Subject: Btrfs: compat code fixes

The btrfs git kernel tree is used to build a standalone tree for
compiling against older kernels.  This commit makes the standalone tree
work with 2.6.27.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c3ee63f92a5..724ead54529 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,9 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/random.h>
+#include <linux/version.h>
 #include <asm/div64.h>
+#include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
-- 
cgit v1.2.3


From b2950863c61bc24cf0f63bc05947d9d50663c4c0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Dec 2008 09:54:17 -0500
Subject: Btrfs: make things static and include the right headers

Shut up various sparse warnings about symbols that should be either
static or have their declarations in scope.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/volumes.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 724ead54529..769f2c5d9e9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -238,7 +238,7 @@ done:
 	return 0;
 }
 
-void pending_bios_fn(struct btrfs_work *work)
+static void pending_bios_fn(struct btrfs_work *work)
 {
 	struct btrfs_device *device;
 
@@ -686,7 +686,7 @@ error:
 	return ret;
 }
 
-int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 			  struct btrfs_device *device,
 			  u64 start)
 {
@@ -1393,7 +1393,7 @@ error:
 	goto out;
 }
 
-int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
+static int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
 				 struct btrfs_device *device)
 {
 	int ret;
@@ -1497,7 +1497,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
+static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 			chunk_offset)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
@@ -1543,7 +1543,7 @@ int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 	return ret;
 }
 
-int btrfs_relocate_chunk(struct btrfs_root *root,
+static int btrfs_relocate_chunk(struct btrfs_root *root,
 			 u64 chunk_tree, u64 chunk_objectid,
 			 u64 chunk_offset)
 {
@@ -1884,7 +1884,7 @@ done:
 	return ret;
 }
 
-int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
+static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_key *key,
 			   struct btrfs_chunk *chunk, int item_size)
-- 
cgit v1.2.3


From 97288f2c71e46965a2010baea10548fcb5f3af73 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Dec 2008 06:36:09 -0500
Subject: Btrfs: corret fmode_t annotations

Make sure to propagate fmode_t properly and use the right constants for
it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/volumes.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 769f2c5d9e9..6c523b3360f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -394,7 +394,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 }
 
 int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-			 int flags, void *holder)
+			 fmode_t flags, void *holder)
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
@@ -469,7 +469,7 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 error_brelse:
 		brelse(bh);
 error_close:
-		close_bdev_exclusive(bdev, MS_RDONLY);
+		close_bdev_exclusive(bdev, FMODE_READ);
 error:
 		continue;
 	}
@@ -488,7 +488,7 @@ out:
 }
 
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-		       int flags, void *holder)
+		       fmode_t flags, void *holder)
 {
 	int ret;
 
@@ -507,7 +507,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	return ret;
 }
 
-int btrfs_scan_one_device(const char *path, int flags, void *holder,
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret)
 {
 	struct btrfs_super_block *disk_super;
@@ -1008,7 +1008,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			goto out;
 		}
 	} else {
-		bdev = open_bdev_exclusive(device_path, MS_RDONLY,
+		bdev = open_bdev_exclusive(device_path, FMODE_READ,
 				      root->fs_info->bdev_holder);
 		if (IS_ERR(bdev)) {
 			ret = PTR_ERR(bdev);
@@ -1078,7 +1078,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		BUG_ON(device->writeable);
 		brelse(bh);
 		if (bdev)
-			close_bdev_exclusive(bdev, MS_RDONLY);
+			close_bdev_exclusive(bdev, FMODE_READ);
 
 		if (device->bdev) {
 			close_bdev_exclusive(device->bdev, device->mode);
@@ -1121,7 +1121,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 	if (bdev) {
 		/* one close for us */
-		close_bdev_exclusive(bdev, MS_RDONLY);
+		close_bdev_exclusive(bdev, FMODE_READ);
 	}
 	kfree(device->name);
 	kfree(device);
@@ -1132,7 +1132,7 @@ error_brelse:
 	brelse(bh);
 error_close:
 	if (bdev)
-		close_bdev_exclusive(bdev, MS_RDONLY);
+		close_bdev_exclusive(bdev, FMODE_READ);
 out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
@@ -2913,7 +2913,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 		goto out;
 	}
 
-	ret = __btrfs_open_devices(fs_devices, MS_RDONLY,
+	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
 				   root->fs_info->bdev_holder);
 	if (ret)
 		goto out;
-- 
cgit v1.2.3


From d20f7043fa65659136c1a7c3c456eeeb5c6f431f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 16:58:54 -0500
Subject: Btrfs: move data checksumming into a dedicated tree

Btrfs stores checksums for each data block.  Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block.  This means that when we read the inode,
we've probably read in at least some checksums as well.

But, this has a few problems:

* The checksums are indexed by logical offset in the file.  When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data.  It would be faster if we could checksum
the compressed data instead.

* If we implement encryption, we'll be checksumming the plain text and
storing that on disk.  This is significantly less secure.

* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct.  This makes the raid
layer balancing and extent moving much more expensive.

* It makes the front end caching code more complex, as we have to touch
the subvolume and inodes as we cache extents.

* There is potentially one copy of the checksum in each subvolume
referencing an extent.

The solution used here is to store the extent checksums in a dedicated
tree.  This allows us to index the checksums by physical extent
start and length.  It means:

* The checksum is against the data stored on disk, after any compression
or encryption is done.

* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.

This makes compression significantly faster by reducing the amount of
data that needs to be checksummed.  It will also allow much faster
raid management code in general.

The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent.  This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6c523b3360f..2049d179ccd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2771,6 +2771,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->work.func = pending_bios_fn;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
+	INIT_LIST_HEAD(&device->dev_alloc_list);
 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
 	return device;
 }
-- 
cgit v1.2.3


From a512bbf855ff0af474257475f2e6da7acd854f52 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 8 Dec 2008 16:46:26 -0500
Subject: Btrfs: superblock duplication

This patch implements superblock duplication. Superblocks
are stored at offsets 16K, 64M and 256G on every device.
The space used by superblocks is preserved by the allocator,
which uses a reverse mapping function to find the logical
addresses that correspond to superblocks. Thank you,

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/volumes.c | 107 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 87 insertions(+), 20 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2049d179ccd..a79b3cc09e9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -423,15 +423,11 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		}
 		set_blocksize(bdev, 4096);
 
-		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		bh = btrfs_read_dev_super(bdev);
 		if (!bh)
 			goto error_close;
 
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-		    sizeof(disk_super->magic)))
-			goto error_brelse;
-
 		devid = le64_to_cpu(disk_super->dev_item.devid);
 		if (devid != device->devid)
 			goto error_brelse;
@@ -529,17 +525,12 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	ret = set_blocksize(bdev, 4096);
 	if (ret)
 		goto error_close;
-	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	bh = btrfs_read_dev_super(bdev);
 	if (!bh) {
 		ret = -EIO;
 		goto error_close;
 	}
 	disk_super = (struct btrfs_super_block *)bh->b_data;
-	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-	    sizeof(disk_super->magic))) {
-		ret = -EINVAL;
-		goto error_brelse;
-	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
 	transid = btrfs_super_generation(disk_super);
 	if (disk_super->label[0])
@@ -553,7 +544,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	printk("devid %Lu transid %Lu %s\n", devid, transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
-error_brelse:
 	brelse(bh);
 error_close:
 	close_bdev_exclusive(bdev, flags);
@@ -1016,17 +1006,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		}
 
 		set_blocksize(bdev, 4096);
-		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		bh = btrfs_read_dev_super(bdev);
 		if (!bh) {
 			ret = -EIO;
 			goto error_close;
 		}
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-			    sizeof(disk_super->magic))) {
-			ret = -ENOENT;
-			goto error_brelse;
-		}
 		devid = le64_to_cpu(disk_super->dev_item.devid);
 		dev_uuid = disk_super->dev_item.uuid;
 		device = btrfs_find_device(root, devid, dev_uuid,
@@ -2563,6 +2548,88 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 				 mirror_num, NULL);
 }
 
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+		     u64 chunk_start, u64 physical, u64 devid,
+		     u64 **logical, int *naddrs, int *stripe_len)
+{
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	u64 *buf;
+	u64 bytenr;
+	u64 length;
+	u64 stripe_nr;
+	int i, j, nr = 0;
+
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_start, 1);
+	spin_unlock(&em_tree->lock);
+
+	BUG_ON(!em || em->start != chunk_start);
+	map = (struct map_lookup *)em->bdev;
+
+	length = em->len;
+	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+		do_div(length, map->num_stripes / map->sub_stripes);
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+		do_div(length, map->num_stripes);
+
+	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+	BUG_ON(!buf);
+
+	for (i = 0; i < map->num_stripes; i++) {
+		if (devid && map->stripes[i].dev->devid != devid)
+			continue;
+		if (map->stripes[i].physical > physical ||
+		    map->stripes[i].physical + length <= physical)
+			continue;
+
+		stripe_nr = physical - map->stripes[i].physical;
+		do_div(stripe_nr, map->stripe_len);
+
+		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+			stripe_nr = stripe_nr * map->num_stripes + i;
+			do_div(stripe_nr, map->sub_stripes);
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+			stripe_nr = stripe_nr * map->num_stripes + i;
+		}
+		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		for (j = 0; j < nr; j++) {
+			if (buf[j] == bytenr)
+				break;
+		}
+		if (j == nr)
+			buf[nr++] = bytenr;
+	}
+
+	for (i = 0; i > nr; i++) {
+		struct btrfs_multi_bio *multi;
+		struct btrfs_bio_stripe *stripe;
+		int ret;
+
+		length = 1;
+		ret = btrfs_map_block(map_tree, WRITE, buf[i],
+				      &length, &multi, 0);
+		BUG_ON(ret);
+
+		stripe = multi->stripes;
+		for (j = 0; j < multi->num_stripes; j++) {
+			if (stripe->physical >= physical &&
+			    physical < stripe->physical + length)
+				break;
+		}
+		BUG_ON(j >= multi->num_stripes);
+		kfree(multi);
+	}
+
+	*logical = buf;
+	*naddrs = nr;
+	*stripe_len = map->stripe_len;
+
+	free_extent_map(em);
+	return 0;
+}
+
 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 		      u64 logical, struct page *page)
 {
@@ -3003,7 +3070,7 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
 	return read_one_dev(root, buf, dev_item);
 }
 
-int btrfs_read_sys_array(struct btrfs_root *root)
+int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 	struct extent_buffer *sb;
@@ -3018,7 +3085,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	u32 cur;
 	struct btrfs_key key;
 
-	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
+	sb = btrfs_find_create_tree_block(root, sb_bytenr,
 					  BTRFS_SUPER_INFO_SIZE);
 	if (!sb)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 934d375bacf9ea8a37fbfff5f3cf1c093f324095 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 16:43:10 -0500
Subject: Btrfs: Use map_private_extent_buffer during generic_bin_search

It is possible that generic_bin_search will be called on a tree block
that has not been locked.  This happens because cache_block_block skips
locking on the tree blocks.

Since the tree block isn't locked, we aren't allowed to change
the extent_buffer->map_token field.  Using map_private_extent_buffer
avoids any changes to the internal extent buffer fields.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a79b3cc09e9..825364fae69 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2594,12 +2594,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 			stripe_nr = stripe_nr * map->num_stripes + i;
 		}
 		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		WARN_ON(nr >= map->num_stripes);
 		for (j = 0; j < nr; j++) {
 			if (buf[j] == bytenr)
 				break;
 		}
-		if (j == nr)
+		if (j == nr) {
+			WARN_ON(nr >= map->num_stripes);
 			buf[nr++] = bytenr;
+		}
 	}
 
 	for (i = 0; i > nr; i++) {
-- 
cgit v1.2.3


From c3027eb5523d6983f12628f3fe13d8a7576db701 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 16:40:21 -0500
Subject: Btrfs: Add inode sequence number for NFS and reserved space in a few
 structs

This adds a sequence number to the btrfs inode that is increased on
every update.  NFS will be able to use that to detect when an inode has
changed, without relying on inaccurate time fields.

While we're here, this also:

Puts reserved space into the super block and inode

Adds a log root transid to the super so we can pick the newest super
based on the fsync log as well as the main transaction ID.  For now
the log root transid is always zero, but that'll get fixed.

Adds a starting offset to the dev_item.  This will let us do better
alignment calculations if we know the start of a partition on the disk.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 825364fae69..4d210a731d4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -890,6 +890,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_group(leaf, dev_item, 0);
 	btrfs_set_device_seek_speed(leaf, dev_item, 0);
 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
+	btrfs_set_device_start_offset(leaf, dev_item, 0);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
-- 
cgit v1.2.3


From e4404d6e8da678d852b7f767f665f8edf76c9e9f Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Fri, 12 Dec 2008 10:03:26 -0500
Subject: Btrfs: shared seed device

This patch makes it possible for a seed device to be shared
by multiple mounted file systems. The sharing is achieved
by cloning the seed device's btrfs_fs_devices structure.
Thank you,

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/volumes.c | 240 +++++++++++++++++++++++++++++------------------------
 1 file changed, 131 insertions(+), 109 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4d210a731d4..6672adcec9f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -47,7 +47,6 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
 
-
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
@@ -74,34 +73,29 @@ static void unlock_chunks(struct btrfs_root *root)
 	mutex_unlock(&root->fs_info->chunk_mutex);
 }
 
+static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct btrfs_device *device;
+	WARN_ON(fs_devices->opened);
+	while (!list_empty(&fs_devices->devices)) {
+		device = list_entry(fs_devices->devices.next,
+				    struct btrfs_device, dev_list);
+		list_del(&device->dev_list);
+		kfree(device->name);
+		kfree(device);
+	}
+	kfree(fs_devices);
+}
+
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
-	struct btrfs_device *dev;
 
 	while (!list_empty(&fs_uuids)) {
 		fs_devices = list_entry(fs_uuids.next,
 					struct btrfs_fs_devices, list);
 		list_del(&fs_devices->list);
-		while(!list_empty(&fs_devices->devices)) {
-			dev = list_entry(fs_devices->devices.next,
-					 struct btrfs_device, dev_list);
-			if (dev->bdev) {
-				close_bdev_exclusive(dev->bdev, dev->mode);
-				fs_devices->open_devices--;
-			}
-			fs_devices->num_devices--;
-			if (dev->writeable)
-				fs_devices->rw_devices--;
-			list_del(&dev->dev_list);
-			list_del(&dev->dev_alloc_list);
-			kfree(dev->name);
-			kfree(dev);
-		}
-		WARN_ON(fs_devices->num_devices);
-		WARN_ON(fs_devices->open_devices);
-		WARN_ON(fs_devices->rw_devices);
-		kfree(fs_devices);
+		free_fs_devices(fs_devices);
 	}
 	return 0;
 }
@@ -304,12 +298,55 @@ static noinline int device_list_add(const char *path,
 	return 0;
 }
 
+static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
+{
+	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_device *device;
+	struct btrfs_device *orig_dev;
+
+	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+	if (!fs_devices)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&fs_devices->devices);
+	INIT_LIST_HEAD(&fs_devices->alloc_list);
+	INIT_LIST_HEAD(&fs_devices->list);
+	fs_devices->latest_devid = orig->latest_devid;
+	fs_devices->latest_trans = orig->latest_trans;
+	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
+
+	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+		device = kzalloc(sizeof(*device), GFP_NOFS);
+		if (!device)
+			goto error;
+
+		device->name = kstrdup(orig_dev->name, GFP_NOFS);
+		if (!device->name)
+			goto error;
+
+		device->devid = orig_dev->devid;
+		device->work.func = pending_bios_fn;
+		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
+		device->barriers = 1;
+		spin_lock_init(&device->io_lock);
+		INIT_LIST_HEAD(&device->dev_list);
+		INIT_LIST_HEAD(&device->dev_alloc_list);
+
+		list_add(&device->dev_list, &fs_devices->devices);
+		device->fs_devices = fs_devices;
+		fs_devices->num_devices++;
+	}
+	return fs_devices;
+error:
+	free_fs_devices(fs_devices);
+	return ERR_PTR(-ENOMEM);
+}
+
 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct list_head *tmp;
 	struct list_head *cur;
 	struct btrfs_device *device;
-	int seed_devices = 0;
 
 	mutex_lock(&uuid_mutex);
 again:
@@ -328,17 +365,14 @@ again:
 			device->writeable = 0;
 			fs_devices->rw_devices--;
 		}
-		if (!seed_devices) {
-			list_del_init(&device->dev_list);
-			fs_devices->num_devices--;
-			kfree(device->name);
-			kfree(device);
-		}
+		list_del_init(&device->dev_list);
+		fs_devices->num_devices--;
+		kfree(device->name);
+		kfree(device);
 	}
 
 	if (fs_devices->seed) {
 		fs_devices = fs_devices->seed;
-		seed_devices = 1;
 		goto again;
 	}
 
@@ -348,10 +382,9 @@ again:
 
 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct btrfs_fs_devices *seed_devices;
 	struct list_head *cur;
 	struct btrfs_device *device;
-again:
+
 	if (--fs_devices->opened > 0)
 		return 0;
 
@@ -370,31 +403,38 @@ again:
 		device->writeable = 0;
 		device->in_fs_metadata = 0;
 	}
+	WARN_ON(fs_devices->open_devices);
+	WARN_ON(fs_devices->rw_devices);
 	fs_devices->opened = 0;
 	fs_devices->seeding = 0;
-	fs_devices->sprouted = 0;
 
-	seed_devices = fs_devices->seed;
-	fs_devices->seed = NULL;
-	if (seed_devices) {
-		fs_devices = seed_devices;
-		goto again;
-	}
 	return 0;
 }
 
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
+	struct btrfs_fs_devices *seed_devices = NULL;
 	int ret;
 
 	mutex_lock(&uuid_mutex);
 	ret = __btrfs_close_devices(fs_devices);
+	if (!fs_devices->opened) {
+		seed_devices = fs_devices->seed;
+		fs_devices->seed = NULL;
+	}
 	mutex_unlock(&uuid_mutex);
+
+	while (seed_devices) {
+		fs_devices = seed_devices;
+		seed_devices = fs_devices->seed;
+		__btrfs_close_devices(fs_devices);
+		free_fs_devices(fs_devices);
+	}
 	return ret;
 }
 
-int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-			 fmode_t flags, void *holder)
+static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+				fmode_t flags, void *holder)
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
@@ -490,12 +530,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 	mutex_lock(&uuid_mutex);
 	if (fs_devices->opened) {
-		if (fs_devices->sprouted) {
-			ret = -EBUSY;
-		} else {
-			fs_devices->opened++;
-			ret = 0;
-		}
+		fs_devices->opened++;
+		ret = 0;
 	} else {
 		ret = __btrfs_open_devices(fs_devices, flags, holder);
 	}
@@ -1043,12 +1079,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		goto error_brelse;
 
 	device->in_fs_metadata = 0;
-	if (device->fs_devices == root->fs_info->fs_devices) {
-		list_del_init(&device->dev_list);
-		root->fs_info->fs_devices->num_devices--;
-		if (device->bdev)
-			device->fs_devices->open_devices--;
-	}
+	list_del_init(&device->dev_list);
+	device->fs_devices->num_devices--;
 
 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
 				 struct btrfs_device, dev_list);
@@ -1057,34 +1089,27 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
 
+	if (device->bdev) {
+		close_bdev_exclusive(device->bdev, device->mode);
+		device->bdev = NULL;
+		device->fs_devices->open_devices--;
+	}
+
 	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
 
-	if (device->fs_devices != root->fs_info->fs_devices) {
-		BUG_ON(device->writeable);
-		brelse(bh);
-		if (bdev)
-			close_bdev_exclusive(bdev, FMODE_READ);
-
-		if (device->bdev) {
-			close_bdev_exclusive(device->bdev, device->mode);
-			device->bdev = NULL;
-			device->fs_devices->open_devices--;
-		}
-		if (device->fs_devices->open_devices == 0) {
-			struct btrfs_fs_devices *fs_devices;
-			fs_devices = root->fs_info->fs_devices;
-			while (fs_devices) {
-				if (fs_devices->seed == device->fs_devices)
-					break;
-				fs_devices = fs_devices->seed;
-			}
-			fs_devices->seed = device->fs_devices->seed;
-			device->fs_devices->seed = NULL;
-			__btrfs_close_devices(device->fs_devices);
+	if (device->fs_devices->open_devices == 0) {
+		struct btrfs_fs_devices *fs_devices;
+		fs_devices = root->fs_info->fs_devices;
+		while (fs_devices) {
+			if (fs_devices->seed == device->fs_devices)
+				break;
+			fs_devices = fs_devices->seed;
 		}
-		ret = 0;
-		goto out;
+		fs_devices->seed = device->fs_devices->seed;
+		device->fs_devices->seed = NULL;
+		__btrfs_close_devices(device->fs_devices);
+		free_fs_devices(device->fs_devices);
 	}
 
 	/*
@@ -1099,20 +1124,10 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		set_buffer_dirty(bh);
 		sync_dirty_buffer(bh);
 	}
-	brelse(bh);
 
-	if (device->bdev) {
-		/* one close for the device struct or super_block */
-		close_bdev_exclusive(device->bdev, device->mode);
-	}
-	if (bdev) {
-		/* one close for us */
-		close_bdev_exclusive(bdev, FMODE_READ);
-	}
 	kfree(device->name);
 	kfree(device);
 	ret = 0;
-	goto out;
 
 error_brelse:
 	brelse(bh);
@@ -1133,34 +1148,41 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 	struct btrfs_fs_devices *old_devices;
+	struct btrfs_fs_devices *seed_devices;
 	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
 	struct btrfs_device *device;
 	u64 super_flags;
 
 	BUG_ON(!mutex_is_locked(&uuid_mutex));
-	if (!fs_devices->seeding || fs_devices->opened != 1)
+	if (!fs_devices->seeding)
 		return -EINVAL;
 
-	old_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
-	if (!old_devices)
+	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+	if (!seed_devices)
 		return -ENOMEM;
 
-	memcpy(old_devices, fs_devices, sizeof(*old_devices));
-	old_devices->opened = 1;
-	old_devices->sprouted = 1;
-	INIT_LIST_HEAD(&old_devices->devices);
-	INIT_LIST_HEAD(&old_devices->alloc_list);
-	list_splice_init(&fs_devices->devices, &old_devices->devices);
-	list_splice_init(&fs_devices->alloc_list, &old_devices->alloc_list);
-	list_for_each_entry(device, &old_devices->devices, dev_list) {
-		device->fs_devices = old_devices;
+	old_devices = clone_fs_devices(fs_devices);
+	if (IS_ERR(old_devices)) {
+		kfree(seed_devices);
+		return PTR_ERR(old_devices);
 	}
+
 	list_add(&old_devices->list, &fs_uuids);
 
+	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
+	seed_devices->opened = 1;
+	INIT_LIST_HEAD(&seed_devices->devices);
+	INIT_LIST_HEAD(&seed_devices->alloc_list);
+	list_splice_init(&fs_devices->devices, &seed_devices->devices);
+	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
+	list_for_each_entry(device, &seed_devices->devices, dev_list) {
+		device->fs_devices = seed_devices;
+	}
+
 	fs_devices->seeding = 0;
 	fs_devices->num_devices = 0;
 	fs_devices->open_devices = 0;
-	fs_devices->seed = old_devices;
+	fs_devices->seed = seed_devices;
 
 	generate_random_uuid(fs_devices->fsid);
 	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -2642,7 +2664,6 @@ int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 				 NULL, 0, page);
 }
 
-
 static void end_bio_multi_stripe(struct bio *bio, int err)
 {
 	struct btrfs_multi_bio *multi = bio->bi_private;
@@ -2840,6 +2861,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
 	device->work.func = pending_bios_fn;
+	device->fs_devices = fs_devices;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
 	INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -2980,8 +3002,10 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 		ret = -ENOENT;
 		goto out;
 	}
-	if (fs_devices->opened) {
-		ret = -EBUSY;
+
+	fs_devices = clone_fs_devices(fs_devices);
+	if (IS_ERR(fs_devices)) {
+		ret = PTR_ERR(fs_devices);
 		goto out;
 	}
 
@@ -2992,13 +3016,13 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 
 	if (!fs_devices->seeding) {
 		__btrfs_close_devices(fs_devices);
+		free_fs_devices(fs_devices);
 		ret = -EINVAL;
 		goto out;
 	}
 
 	fs_devices->seed = root->fs_info->fs_devices->seed;
 	root->fs_info->fs_devices->seed = fs_devices;
-	fs_devices->sprouted = 1;
 out:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -3011,7 +3035,6 @@ static int read_one_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
-	int seed_devices = 0;
 	u8 fs_uuid[BTRFS_UUID_SIZE];
 	u8 dev_uuid[BTRFS_UUID_SIZE];
 
@@ -3025,14 +3048,13 @@ static int read_one_dev(struct btrfs_root *root,
 
 	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
 		ret = open_seed_devices(root, fs_uuid);
-		if (ret)
+		if (ret && !btrfs_test_opt(root, DEGRADED))
 			return ret;
-		seed_devices = 1;
 	}
 
 	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
 	if (!device || !device->bdev) {
-		if (!btrfs_test_opt(root, DEGRADED) || seed_devices)
+		if (!btrfs_test_opt(root, DEGRADED))
 			return -EIO;
 
 		if (!device) {
@@ -3074,7 +3096,7 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
 	return read_one_dev(root, buf, dev_item);
 }
 
-int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr)
+int btrfs_read_sys_array(struct btrfs_root *root)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 	struct extent_buffer *sb;
@@ -3089,7 +3111,7 @@ int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr)
 	u32 cur;
 	struct btrfs_key key;
 
-	sb = btrfs_find_create_tree_block(root, sb_bytenr,
+	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
 					  BTRFS_SUPER_INFO_SIZE);
 	if (!sb)
 		return -ENOMEM;
-- 
cgit v1.2.3


From d397712bcc6a759a560fd247e6053ecae091f958 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 5 Jan 2009 21:25:51 -0500
Subject: Btrfs: Fix checkpatch.pl warnings

There were many; most are fixed now.  struct-funcs.c still generates some
warnings, but these are bogus.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 78 +++++++++++++++++++++++++++---------------------------
 1 file changed, 39 insertions(+), 39 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6672adcec9f..b187b537888 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -140,7 +140,7 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
  * the list if the block device is congested.  This way, multiple devices
  * can make progress from a single worker thread.
  */
-static int noinline run_scheduled_bios(struct btrfs_device *device)
+static noinline int run_scheduled_bios(struct btrfs_device *device)
 {
 	struct bio *pending;
 	struct backing_dev_info *bdi;
@@ -187,7 +187,7 @@ loop:
 	}
 	spin_unlock(&device->io_lock);
 
-	while(pending) {
+	while (pending) {
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
@@ -458,7 +458,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 		bdev = open_bdev_exclusive(device->name, flags, holder);
 		if (IS_ERR(bdev)) {
-			printk("open %s failed\n", device->name);
+			printk(KERN_INFO "open %s failed\n", device->name);
 			goto error;
 		}
 		set_blocksize(bdev, 4096);
@@ -570,14 +570,15 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	devid = le64_to_cpu(disk_super->dev_item.devid);
 	transid = btrfs_super_generation(disk_super);
 	if (disk_super->label[0])
-		printk("device label %s ", disk_super->label);
+		printk(KERN_INFO "device label %s ", disk_super->label);
 	else {
 		/* FIXME, make a readl uuid parser */
-		printk("device fsid %llx-%llx ",
+		printk(KERN_INFO "device fsid %llx-%llx ",
 		       *(unsigned long long *)disk_super->fsid,
 		       *(unsigned long long *)(disk_super->fsid + 8));
 	}
-	printk("devid %Lu transid %Lu %s\n", devid, transid, path);
+	printk(KERN_INFO "devid %llu transid %llu %s\n",
+	       (unsigned long long)devid, (unsigned long long)transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
 	brelse(bh);
@@ -683,9 +684,8 @@ no_more_items:
 				goto check_pending;
 			}
 		}
-		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
 			goto next;
-		}
 
 		start_found = 1;
 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
@@ -1001,14 +1001,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
 	    root->fs_info->fs_devices->rw_devices <= 4) {
-		printk("btrfs: unable to go below four devices on raid10\n");
+		printk(KERN_ERR "btrfs: unable to go below four devices "
+		       "on raid10\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
 	    root->fs_info->fs_devices->rw_devices <= 2) {
-		printk("btrfs: unable to go below two devices on raid1\n");
+		printk(KERN_ERR "btrfs: unable to go below two "
+		       "devices on raid1\n");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1031,7 +1033,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		bh = NULL;
 		disk_super = NULL;
 		if (!device) {
-			printk("btrfs: no missing devices found to remove\n");
+			printk(KERN_ERR "btrfs: no missing devices found to "
+			       "remove\n");
 			goto out;
 		}
 	} else {
@@ -1060,7 +1063,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 
 	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
-		printk("btrfs: unable to remove the only writeable device\n");
+		printk(KERN_ERR "btrfs: unable to remove the only writeable "
+		       "device\n");
 		ret = -EINVAL;
 		goto error_brelse;
 	}
@@ -1286,9 +1290,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		return -EINVAL;
 
 	bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
-	if (!bdev) {
+	if (!bdev)
 		return -EIO;
-	}
 
 	if (root->fs_info->fs_devices->seeding) {
 		seeding_dev = 1;
@@ -1401,8 +1404,8 @@ error:
 	goto out;
 }
 
-static int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
-				 struct btrfs_device *device)
+static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+					struct btrfs_device *device)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1563,7 +1566,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 	int ret;
 	int i;
 
-	printk("btrfs relocating chunk %llu\n",
+	printk(KERN_INFO "btrfs relocating chunk %llu\n",
 	       (unsigned long long)chunk_offset);
 	root = root->fs_info->chunk_root;
 	extent_root = root->fs_info->extent_root;
@@ -1748,7 +1751,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
 		if (ret < 0)
 			goto error;
@@ -1916,7 +1919,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
+static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
 					int num_stripes, int sub_stripes)
 {
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
@@ -2041,7 +2044,7 @@ again:
 		min_free += 1024 * 1024;
 
 	INIT_LIST_HEAD(&private_devs);
-	while(index < num_stripes) {
+	while (index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 		BUG_ON(!device->writeable);
 		if (device->total_bytes > device->bytes_used)
@@ -2242,7 +2245,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline init_first_rw_device(struct btrfs_trans_handle *trans,
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 					 struct btrfs_root *root,
 					 struct btrfs_device *device)
 {
@@ -2338,7 +2341,7 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 {
 	struct extent_map *em;
 
-	while(1) {
+	while (1) {
 		spin_lock(&tree->map_tree.lock);
 		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
 		if (em)
@@ -2413,9 +2416,8 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int max_errors = 0;
 	struct btrfs_multi_bio *multi = NULL;
 
-	if (multi_ret && !(rw & (1 << BIO_RW))) {
+	if (multi_ret && !(rw & (1 << BIO_RW)))
 		stripes_allocated = 1;
-	}
 again:
 	if (multi_ret) {
 		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
@@ -2434,7 +2436,9 @@ again:
 		return 0;
 
 	if (!em) {
-		printk("unable to find logical %Lu len %Lu\n", logical, *length);
+		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
+		       (unsigned long long)logical,
+		       (unsigned long long)*length);
 		BUG();
 	}
 
@@ -2541,9 +2545,8 @@ again:
 			device = map->stripes[stripe_index].dev;
 			if (device->bdev) {
 				bdi = blk_get_backing_dev_info(device->bdev);
-				if (bdi->unplug_io_fn) {
+				if (bdi->unplug_io_fn)
 					bdi->unplug_io_fn(bdi, unplug_page);
-				}
 			}
 		} else {
 			multi->stripes[i].physical =
@@ -2717,7 +2720,7 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-static int noinline schedule_bio(struct btrfs_root *root,
+static noinline int schedule_bio(struct btrfs_root *root,
 				 struct btrfs_device *device,
 				 int rw, struct bio *bio)
 {
@@ -2785,8 +2788,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
 	total_devs = multi->num_stripes;
 	if (map_length < length) {
-		printk("mapping failed logical %Lu bio len %Lu "
-		       "len %Lu\n", logical, length, map_length);
+		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+		       "len %llu\n", (unsigned long long)logical,
+		       (unsigned long long)length,
+		       (unsigned long long)map_length);
 		BUG();
 	}
 	multi->end_io = first_bio->bi_end_io;
@@ -2794,7 +2799,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	multi->orig_bio = first_bio;
 	atomic_set(&multi->stripes_pending, multi->num_stripes);
 
-	while(dev_nr < total_devs) {
+	while (dev_nr < total_devs) {
 		if (total_devs > 1) {
 			if (dev_nr < total_devs - 1) {
 				bio = bio_clone(first_bio, GFP_NOFS);
@@ -3058,7 +3063,8 @@ static int read_one_dev(struct btrfs_root *root,
 			return -EIO;
 
 		if (!device) {
-			printk("warning devid %Lu missing\n", devid);
+			printk(KERN_WARNING "warning devid %llu missing\n",
+			       (unsigned long long)devid);
 			device = add_missing_dev(root, devid, dev_uuid);
 			if (!device)
 				return -ENOMEM;
@@ -3078,12 +3084,6 @@ static int read_one_dev(struct btrfs_root *root,
 	if (device->writeable)
 		device->fs_devices->total_rw_bytes += device->total_bytes;
 	ret = 0;
-#if 0
-	ret = btrfs_open_device(device);
-	if (ret) {
-		kfree(device);
-	}
-#endif
 	return ret;
 }
 
@@ -3174,7 +3174,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
 	key.type = 0;
 again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	while(1) {
+	while (1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
-- 
cgit v1.2.3


From 1d9e2ae949411c2f329f30e01ea0355cd02c4296 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 Jan 2009 11:58:19 -0500
Subject: Btrfs: Clear the device->running_pending flag before bailing on
 congestion

Btrfs maintains a queue of async bio submissions so the checksumming
threads don't have to wait on get_request_wait.  In order to avoid
extra wakeups, this code has a running_pending flag that is used
to tell new submissions they don't need to wake the thread.

When the threads notice congestion on a single device, they
may decide to requeue the job and move on to other devices.  This
makes sure the running_pending flag is cleared before the
job is requeued.

It should help avoid IO stalls by making sure the task is woken up
when new submissions come in.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b187b537888..3451e1cca2b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -220,6 +220,7 @@ loop:
 				tail->bi_next = old_head;
 			else
 				device->pending_bio_tail = tail;
+			device->running_pending = 0;
 
 			spin_unlock(&device->io_lock);
 			btrfs_requeue_work(&device->work);
-- 
cgit v1.2.3