aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-07-15 08:36:38 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2008-07-15 08:36:38 -0700
commit8d2567a620ae8c24968a2bdc1c906c724fac1f6a (patch)
tree8e228abbadbe760e3f015d30c2e1180a67eeb8f9
parentbcf559e385ba099996c90469c817d2eb38aba418 (diff)
parent49f1487b2e41bd8439ea39a4f15b4064e823cc54 (diff)
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (61 commits) ext4: Documention update for new ordered mode and delayed allocation ext4: do not set extents feature from the kernel ext4: Don't allow nonextenst mount option for large filesystem ext4: Enable delalloc by default. ext4: delayed allocation i_blocks fix for stat ext4: fix delalloc i_disksize early update issue ext4: Handle page without buffers in ext4_*_writepage() ext4: Add ordered mode support for delalloc ext4: Invert lock ordering of page_lock and transaction start in delalloc mm: Add range_cont mode for writeback ext4: delayed allocation ENOSPC handling percpu_counter: new function percpu_counter_sum_and_set ext4: Add delayed allocation support in data=writeback mode vfs: add hooks for ext4's delayed allocation support jbd2: Remove data=ordered mode support using jbd buffer heads ext4: Use new framework for data=ordered mode in JBD2 jbd2: Implement data=ordered mode handling via inodes vfs: export filemap_fdatawrite_range() ext4: Fix lock inversion in ext4_ext_truncate() ext4: Invert the locking order of page_lock and transaction start ...
-rw-r--r--Documentation/filesystems/ext4.txt125
-rw-r--r--fs/buffer.c19
-rw-r--r--fs/ext4/balloc.c209
-rw-r--r--fs/ext4/dir.c17
-rw-r--r--fs/ext4/ext4.h61
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_i.h10
-rw-r--r--fs/ext4/ext4_jbd2.h21
-rw-r--r--fs/ext4/ext4_sb.h5
-rw-r--r--fs/ext4/extents.c111
-rw-r--r--fs/ext4/file.c20
-rw-r--r--fs/ext4/fsync.c4
-rw-r--r--fs/ext4/group.h2
-rw-r--r--fs/ext4/ialloc.c113
-rw-r--r--fs/ext4/inode.c1591
-rw-r--r--fs/ext4/mballoc.c451
-rw-r--r--fs/ext4/namei.c45
-rw-r--r--fs/ext4/resize.c52
-rw-r--r--fs/ext4/super.c142
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/ext4/xattr_trusted.c4
-rw-r--r--fs/ext4/xattr_user.c4
-rw-r--r--fs/jbd2/checkpoint.c1
-rw-r--r--fs/jbd2/commit.c294
-rw-r--r--fs/jbd2/journal.c53
-rw-r--r--fs/jbd2/transaction.c365
-rw-r--r--fs/mpage.c14
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/jbd2.h73
-rw-r--r--include/linux/mpage.h10
-rw-r--r--include/linux/percpu_counter.h12
-rw-r--r--include/linux/writeback.h1
-rw-r--r--lib/percpu_counter.c7
-rw-r--r--mm/filemap.c3
-rw-r--r--mm/page-writeback.c3
35 files changed, 2805 insertions, 1042 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 0c5086db835..80e193d82e2 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -13,72 +13,93 @@ Mailing list: linux-ext4@vger.kernel.org
1. Quick usage instructions:
===========================
- - Grab updated e2fsprogs from
- ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/
- This is a patchset on top of e2fsprogs-1.39, which can be found at
+ - Compile and install the latest version of e2fsprogs (as of this
+ writing version 1.41) from:
+
+ http://sourceforge.net/project/showfiles.php?group_id=2406
+
+ or
+
ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
- - It's still mke2fs -j /dev/hda1
+ or grab the latest git repository from:
+
+ git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
+
+ - Create a new filesystem using the ext4dev filesystem type:
+
+ # mke2fs -t ext4dev /dev/hda1
+
+ Or configure an existing ext3 filesystem to support extents and set
+ the test_fs flag to indicate that it's ok for an in-development
+ filesystem to touch this filesystem:
- - mount /dev/hda1 /wherever -t ext4dev
+ # tune2fs -O extents -E test_fs /dev/hda1
- - To enable extents,
+ If the filesystem was created with 128 byte inodes, it can be
+ converted to use 256 byte for greater efficiency via:
- mount /dev/hda1 /wherever -t ext4dev -o extents
+ # tune2fs -I 256 /dev/hda1
- - The filesystem is compatible with the ext3 driver until you add a file
- which has extents (ie: `mount -o extents', then create a file).
+ (Note: we currently do not have tools to convert an ext4dev
+ filesystem back to ext3; so please do not do try this on production
+ filesystems.)
- NOTE: The "extents" mount flag is temporary. It will soon go away and
- extents will be enabled by the "-o extents" flag to mke2fs or tune2fs
+ - Mounting:
+
+ # mount -t ext4dev /dev/hda1 /wherever
- When comparing performance with other filesystems, remember that
- ext3/4 by default offers higher data integrity guarantees than most. So
- when comparing with a metadata-only journalling filesystem, use `mount -o
- data=writeback'. And you might as well use `mount -o nobh' too along
- with it. Making the journal larger than the mke2fs default often helps
- performance with metadata-intensive workloads.
+ ext3/4 by default offers higher data integrity guarantees than most.
+ So when comparing with a metadata-only journalling filesystem, such
+ as ext3, use `mount -o data=writeback'. And you might as well use
+ `mount -o nobh' too along with it. Making the journal larger than
+ the mke2fs default often helps performance with metadata-intensive
+ workloads.
2. Features
===========
2.1 Currently available
-* ability to use filesystems > 16TB
+* ability to use filesystems > 16TB (e2fsprogs support not available yet)
* extent format reduces metadata overhead (RAM, IO for access, transactions)
* extent format more robust in face of on-disk corruption due to magics,
* internal redunancy in tree
-
-2.1 Previously available, soon to be enabled by default by "mkefs.ext4":
-
-* dir_index and resize inode will be on by default
-* large inodes will be used by default for fast EAs, nsec timestamps, etc
+* improved file allocation (multi-block alloc)
+* fix 32000 subdirectory limit
+* nsec timestamps for mtime, atime, ctime, create time
+* inode version field on disk (NFSv4, Lustre)
+* reduced e2fsck time via uninit_bg feature
+* journal checksumming for robustness, performance
+* persistent file preallocation (e.g for streaming media, databases)
+* ability to pack bitmaps and inode tables into larger virtual groups via the
+ flex_bg feature
+* large file support
+* Inode allocation using large virtual block groups via flex_bg
+* delayed allocation
+* large block (up to pagesize) support
+* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
+ the ordering)
2.2 Candidate features for future inclusion
-There are several under discussion, whether they all make it in is
-partly a function of how much time everyone has to work on them:
+* Online defrag (patches available but not well tested)
+* reduced mke2fs time via lazy itable initialization in conjuction with
+ the uninit_bg feature (capability to do this is available in e2fsprogs
+ but a kernel thread to do lazy zeroing of unused inode table blocks
+ after filesystem is first mounted is required for safety)
-* improved file allocation (multi-block alloc, delayed alloc; basically done)
-* fix 32000 subdirectory limit (patch exists, needs some e2fsck work)
-* nsec timestamps for mtime, atime, ctime, create time (patch exists,
- needs some e2fsck work)
-* inode version field on disk (NFSv4, Lustre; prototype exists)
-* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
-* journal checksumming for robustness, performance (prototype exists)
-* persistent file preallocation (e.g for streaming media, databases)
+There are several others under discussion, whether they all make it in is
+partly a function of how much time everyone has to work on them. Features like
+metadata checksumming have been discussed and planned for a bit but no patches
+exist yet so I'm not sure they're in the near-term roadmap.
-Features like metadata checksumming have been discussed and planned for
-a bit but no patches exist yet so I'm not sure they're in the near-term
-roadmap.
+The big performance win will come with mballoc, delalloc and flex_bg
+grouping of bitmaps and inode tables. Some test results available here:
-The big performance win will come with mballoc and delalloc. CFS has
-been using mballoc for a few years already with Lustre, and IBM + Bull
-did a lot of benchmarking on it. The reason it isn't in the first set of
-patches is partly a manageability issue, and partly because it doesn't
-directly affect the on-disk format (outside of much better allocation)
-so it isn't critical to get into the first round of changes. I believe
-Alex is working on a new set of patches right now.
+ - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
+ - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
3. Options
==========
@@ -222,9 +243,11 @@ stripe=n Number of filesystem blocks that mballoc will try
to use for allocation size and alignment. For RAID5/6
systems this should be the number of data
disks * RAID chunk size in file system blocks.
-
+delalloc (*) Deferring block allocation until write-out time.
+nodelalloc Disable delayed allocation. Blocks are allocation
+ when data is copied from user to page cache.
Data Mode
----------
+=========
There are 3 different data modes:
* writeback mode
@@ -236,10 +259,10 @@ typically provide the best ext4 performance.
* ordered mode
In data=ordered mode, ext4 only officially journals metadata, but it logically
-groups metadata and data blocks into a single unit called a transaction. When
-it's time to write the new metadata out to disk, the associated data blocks
-are written first. In general, this mode performs slightly slower than
-writeback but significantly faster than journal mode.
+groups metadata information related to data changes with the data blocks into a
+single unit called a transaction. When it's time to write the new metadata
+out to disk, the associated data blocks are written first. In general,
+this mode performs slightly slower than writeback but significantly faster than journal mode.
* journal mode
data=journal mode provides full data and metadata journaling. All new data is
@@ -247,7 +270,8 @@ written to the journal first, and then to its final location.
In the event of a crash, the journal can be replayed, bringing both data and
metadata into a consistent state. This mode is the slowest except when data
needs to be read from and written to disk at the same time where it
-outperforms all others modes.
+outperforms all others modes. Curently ext4 does not have delayed
+allocation support if this data journalling mode is selected.
References
==========
@@ -256,7 +280,8 @@ kernel source: <file:fs/ext4/>
<file:fs/jbd2/>
programs: http://e2fsprogs.sourceforge.net/
- http://ext2resize.sourceforge.net
useful links: http://fedoraproject.org/wiki/ext3-devel
http://www.bullopensource.org/ext4/
+ http://ext4.wiki.kernel.org/index.php/Main_Page
+ http://fedoraproject.org/wiki/Features/Ext4
diff --git a/fs/buffer.c b/fs/buffer.c
index 0f51c0f7c26..5fa1512cd9a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+ } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+ buffer_dirty(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, 1);
if (err)
goto recover;
+ clear_buffer_delay(bh);
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
bh = head;
/* Recovery: lock and submit the mapped buffers */
do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
+ if (buffer_mapped(bh) && buffer_dirty(bh) &&
+ !buffer_delay(bh)) {
lock_buffer(bh);
mark_buffer_async_write(bh);
} else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
+ int i_size_changed = 0;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
*/
if (pos+copied > inode->i_size) {
i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
+ i_size_changed = 1;
}
unlock_page(page);
page_cache_release(page);
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
return copied;
}
EXPORT_SYMBOL(generic_write_end);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d..495ab21b983 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
ext4_group_t block_group)
{
ext4_group_t actual_group;
- ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
+ ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
if (actual_group == block_group)
return 1;
return 0;
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
}
} else { /* For META_BG_BLOCK_GROUPS */
- int group_rel = (block_group -
- le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
- EXT4_DESC_PER_BLOCK(sb);
- if (group_rel == 0 || group_rel == 1 ||
- (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
- bit_max += 1;
+ bit_max += ext4_bg_num_gdb(sb, block_group);
}
if (block_group == sbi->s_groups_count - 1) {
@@ -295,7 +290,7 @@ err_out:
return 0;
}
/**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
* @sb: super block
* @block_group: given block group
*
@@ -305,7 +300,7 @@ err_out:
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc * desc;
struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
prev = rsv;
}
printk("Window map complete.\n");
- if (bad)
- BUG();
+ BUG_ON(bad);
}
#define rsv_window_dump(root, verbose) \
__rsv_window_dump((root), (verbose), __func__)
@@ -694,7 +688,7 @@ do_more:
count -= overflow;
}
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1598,23 +1599,35 @@ out:
/**
* ext4_has_free_blocks()
- * @sbi: in-core super block structure.
+ * @sbi: in-core super block structure.
+ * @nblocks: number of neeed blocks
*
- * Check if filesystem has at least 1 free block available for allocation.
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks avaible for allocation for this request
+ * On success, return nblocks
*/
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks)
{
- ext4_fsblk_t free_blocks, root_blocks;
+ ext4_fsblk_t free_blocks;
+ ext4_fsblk_t root_blocks = 0;
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- root_blocks = ext4_r_blocks_count(sbi->s_es);
- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+
+ if (!capable(CAP_SYS_RESOURCE) &&
sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
- return 0;
- }
- return 1;
-}
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+ if (free_blocks - root_blocks < FBC_BATCH)
+ free_blocks =
+ percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+ if (free_blocks - root_blocks < nblocks)
+ return free_blocks - root_blocks;
+ return nblocks;
+ }
+
/**
* ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+ if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
}
/**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
* @count: target number of blocks to allocate
* @errp: error code
*
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and look up
+ * the block bitmap directly to do block allocation. It tries to
+ * allocate block(s) from the block group contains the goal block first. If
+ * that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when -o nomballoc mount option is enabled
*
*/
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
ext4_group_t ngroups;
unsigned long num = *count;
- *errp = -ENOSPC;
sb = inode->i_sb;
if (!sb) {
+ *errp = -ENODEV;
printk("ext4_new_block: nonexistent device");
return 0;
}
+ sbi = EXT4_SB(sb);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ *count = ext4_has_free_blocks(sbi, *count);
+ }
+ if (*count == 0) {
+ *errp = -ENOSPC;
+ return 0; /*return with ENOSPC error */
+ }
+ num = *count;
+
/*
* Check quota for allocation of this block.
*/
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
my_rsv = &block_i->rsv_window_node;
- if (!ext4_has_free_blocks(sbi)) {
- *errp = -ENOSPC;
- goto out;
- }
-
/*
* First, test whether the goal block is free.
*/
@@ -1734,7 +1759,7 @@ retry_alloc:
my_rsv = NULL;
if (free_blocks > 0) {
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
continue;
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
/*
@@ -1882,7 +1907,15 @@ allocated:
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= num;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
return 0;
}
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp)
+#define EXT4_META_BLOCK 0x1
+
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp, int flags)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
if (!test_opt(inode->i_sb, MBALLOC)) {
- unsigned long count = 1;
- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
- return ret;
+ return ext4_old_new_blocks(handle, inode, goal, count, errp);
}
memset(&ar, 0, sizeof(ar));
+ /* Fill with neighbour allocated blocks */
+
ar.inode = inode;
ar.goal = goal;
- ar.len = 1;
+ ar.len = *count;
+ ar.logical = iblock;
+
+ if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+ /* enable in-core preallocation for data block allocation */
+ ar.flags = EXT4_MB_HINT_DATA;
+ else
+ /* disable in-core preallocation for non-regular files */
+ ar.flags = 0;
+
ret = ext4_mb_new_blocks(handle, &ar, errp);
+ *count = ar.len;
return ret;
}
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks need
+ * @errp: error code
+ *
+ * Return 1st allocated block numberon success, *count stores total account
+ * error stores in errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
- struct ext4_allocation_request ar;
ext4_fsblk_t ret;
-
- if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
+ ret = do_blk_alloc(handle, inode, 0, goal,
+ count, errp, EXT4_META_BLOCK);
+ /*
+ * Account for the allocated meta blocks
+ */
+ if (!(*errp)) {
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ EXT4_I(inode)->i_allocated_meta_blocks += *count;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
-
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = *count;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- *count = ar.len;
return ret;
}
+/*
+ * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @errp: error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, int *errp)
+{
+ unsigned long count = 1;
+ return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks need
+ * @errp: error code
+ *
+ * Return 1st allocated block numberon success, *count stores total account
+ * error stores in errp pointer
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp)
+{
+ return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
/**
* ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
continue;
desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, i);
+ bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea19..d3d23d73c08 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
struct buffer_head *bh = NULL;
map_bh.b_state = 0;
- err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+ err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+ 0, 0, 0);
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
while (n) {
/* Do the node's children first */
- if ((n)->rb_left) {
+ if (n->rb_left) {
n = n->rb_left;
continue;
}
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
parent->rb_right = NULL;
n = parent;
}
- root->rb_node = NULL;
}
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
{
struct dir_private_info *p;
- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->root.rb_node = NULL;
- p->curr_node = NULL;
- p->extra_fname = NULL;
- p->last_pos = 0;
p->curr_hash = pos2maj_hash(pos);
p->curr_minor_hash = pos2min_hash(pos);
- p->next_hash = 0;
return p;
}
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
int ret;
if (!info) {
- info = create_dir_info(filp->f_pos);
+ info = ext4_htree_create_dir_info(filp->f_pos);
if (!info)
return -ENOMEM;
filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac..303e41cf7b1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
#include "ext4_i.h"
/*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
*/
/*
@@ -45,7 +45,7 @@
#define ext4_debug(f, a...) \
do { \
printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
- __FILE__, __LINE__, __FUNCTION__); \
+ __FILE__, __LINE__, __func__); \
printk (KERN_DEBUG f, ## a); \
} while (0)
#else
@@ -74,6 +74,9 @@
#define EXT4_MB_HINT_GOAL_ONLY 256
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL 512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED 1024
+
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
__u32 bg_reserved2[3];
};
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+ __u32 free_inodes;
+ __u32 free_blocks;
+};
+
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do { \
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
__le16 s_mmp_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad2;
+ __le16 s_reserved_pad;
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};
#ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, int *errp);
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+ ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
+ ext4_grpblk_t add);
/* inode.c */
+void ext4_da_release_space(struct inode *inode, int used, int to_free);
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
extern struct inode *ext4_iget(struct super_block *, unsigned long);
extern int ext4_write_inode (struct inode *, int);
extern int ext4_setattr (struct dentry *, struct iattr *);
+extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
extern void ext4_delete_inode (struct inode *);
extern int ext4_sync_inode (handle_t *, struct inode *);
extern void ext4_discard_reservation (struct inode *);
extern void ext4_dirty_inode(struct inode *);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate (struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
}
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+ ext4_group_t block_group)
+{
+ return block_group >> sbi->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+ return 1 << sbi->s_log_groups_per_flex;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
- __ext4_std_error((sb), __FUNCTION__, (errno)); \
+ __ext4_std_error((sb), __func__, (errno)); \
} while (0)
/*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock,
unsigned long max_blocks, struct buffer_head *bh_result,
int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
sector_t block, unsigned long max_blocks,
struct buffer_head *bh, int create,
- int extend_disksize);
+ int extend_disksize, int flag);
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fa..6c166c0a54b 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d7..ef7409f0e7e 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
};
/*
- * third extended file system inode data in memory
+ * fourth extended file system inode data in memory
*/
struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
*/
struct rw_semaphore i_data_sem;
struct inode vfs_inode;
+ struct jbd2_inode jinode;
unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
/* mballoc */
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+
+ /* allocation reservation info for delalloc */
+ unsigned long i_reserved_data_blocks;
+ unsigned long i_reserved_meta_blocks;
+ unsigned long i_allocated_meta_blocks;
+ unsigned short i_delalloc_reserved_flag;
+ spinlock_t i_block_reservation_lock;
};
#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b2..eb8bc3afe6e 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
handle_t *handle, struct buffer_head *bh);
#define ext4_journal_get_undo_access(handle, bh) \
- __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_undo_access(__func__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
- __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_write_access(__func__, (handle), (bh))
#define ext4_journal_revoke(handle, blocknr, bh) \
- __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
#define ext4_journal_get_create_access(handle, bh) \
- __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_create_access(__func__, (handle), (bh))
#define ext4_journal_dirty_metadata(handle, bh) \
- __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+ __ext4_journal_dirty_metadata(__func__, (handle), (bh))
#define ext4_journal_forget(handle, bh) \
- __ext4_journal_forget(__FUNCTION__, (handle), (bh))
-
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+ __ext4_journal_forget(__func__, (handle), (bh))
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
}
#define ext4_journal_stop(handle) \
- __ext4_journal_stop(__FUNCTION__, (handle))
+ __ext4_journal_stop(__func__, (handle))
static inline handle_t *ext4_journal_current_handle(void)
{
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
return jbd2_journal_force_commit(journal);
}
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+ return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+}
+
/* super.c */
int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f219..6300226d553 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
#include <linux/rbtree.h>
/*
- * third extended-fs super-block data in memory
+ * fourth extended-fs super-block data in memory
*/
struct ext4_sb_info {
unsigned long s_desc_size; /* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
/* locality groups */
struct ext4_locality_group *s_locality_groups;
+
+ unsigned int s_log_groups_per_flex;
+ struct flex_groups *s_flex_groups;
};
#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3da..42c4c0c892e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_journal_restart(handle_t *handle, int needed)
{
int err;
if (handle->h_buffer_credits > needed)
- return handle;
- if (!ext4_journal_extend(handle, needed))
- return handle;
- err = ext4_journal_restart(handle, needed);
-
- return handle;
+ return 0;
+ err = ext4_journal_extend(handle, needed);
+ if (err)
+ return err;
+ return ext4_journal_restart(handle, needed);
}
/*
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
return bg_start + colour + block;
}
+/*
+ * Allocation for a meta data block
+ */
static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex, int *err)
{
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
- newblock = ext4_new_block(handle, inode, goal, err);
+ newblock = ext4_new_meta_block(handle, inode, goal, err);
return newblock;
}
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
return size;
}
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks
+ * Worse case is one block per extent
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int lcap, icap, rcap, leafs, idxs, num;
+ int newextents = blocks;
+
+ rcap = ext4_ext_space_root_idx(inode);
+ lcap = ext4_ext_space_block(inode);
+ icap = ext4_ext_space_block_idx(inode);
+
+ /* number of new leaf blocks needed */
+ num = leafs = (newextents + lcap - 1) / lcap;
+
+ /*
+ * Worse case, we need separate index block(s)
+ * to link all new leaf blocks
+ */
+ idxs = (leafs + icap - 1) / icap;
+ do {
+ num += idxs;
+ idxs = (idxs + icap - 1) / icap;
+ } while (idxs > rcap);
+
+ return num;
+}
+
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
alloc = 1;
}
path[0].p_hdr = eh;
+ path[0].p_bh = NULL;
i = depth;
/* walk through the tree */
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
}
path[ppos].p_depth = i;
- path[ppos].p_hdr = eh;
path[ppos].p_ext = NULL;
path[ppos].p_idx = NULL;
/* find extent */
ext4_ext_binsearch(inode, path + ppos, block);
+ /* if not an empty leaf */
+ if (path[ppos].p_ext)
+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
ext4_ext_show_path(inode, path);
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* allocate all needed blocks */
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newext, &err);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock;
int err = 0;
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
if (newblock == 0)
return err;
@@ -981,6 +1017,8 @@ repeat:
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
err = ext4_ext_split(handle, inode, path, newext, i);
+ if (err)
+ goto out;
/* refill path */
ext4_ext_drop_refs(path);
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
#endif
- handle = ext4_ext_journal_restart(handle, credits);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
+ err = ext4_ext_journal_restart(handle, credits);
+ if (err)
goto out;
- }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret;
unsigned long allocated = 0;
struct ext4_allocation_request ar;
+ loff_t disksize;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
if (allocated > max_blocks)
allocated = max_blocks;
- /* mark the buffer unwritten */
- __set_bit(BH_Unwritten, &bh_result->b_state);
+ set_buffer_unwritten(bh_result);
goto out2;
}
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
goto out2;
}
- if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = inode->i_size;
-
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
outnew:
- __set_bit(BH_New, &bh_result->b_state);
+ if (extend_disksize) {
+ disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ }
+
+ set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */
if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2774,7 @@ out:
if (allocated > max_blocks)
allocated = max_blocks;
ext4_ext_show_leaf(inode, path);
- __set_bit(BH_Mapped, &bh_result->b_state);
+ set_buffer_mapped(bh_result);
bh_result->b_bdev = inode->i_sb->s_bdev;
bh_result->b_blocknr = newblock;
out2:
@@ -2744,7 +2785,7 @@ out2:
return err ? err : allocated;
}
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode *inode)
{
struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
*/
err = ext4_writepage_trans_blocks(inode) + 3;
handle = ext4_journal_start(inode, err);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return;
- }
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
+
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_invalidate_cache(inode);
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
* Probably we need not scan at all,
* because page truncation is enough.
*/
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
/* we have to know where to truncate from in crash case */
EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
handle->h_sync = 1;
out_stop:
+ up_write(&EXT4_I(inode)->i_data_sem);
/*
* If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
@@ -2808,7 +2844,6 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- up_write(&EXT4_I(inode)->i_data_sem);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
@@ -2911,7 +2946,7 @@ retry:
}
ret = ext4_get_blocks_wrap(handle, inode, block,
max_blocks, &map_bh,
- EXT4_CREATE_UNINITIALIZED_EXT, 0);
+ EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366a..430eb7978db 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ force_commit:
return ret;
}
+static struct vm_operations_struct ext4_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ext4_page_mkwrite,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ext4_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
+
const struct file_operations ext4_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ext4_file_mmap,
.open = generic_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
const struct inode_operations ext4_file_inode_operations = {
.truncate = ext4_truncate,
.setattr = ext4_setattr,
+ .getattr = ext4_getattr,
#ifdef CONFIG_EXT4DEV_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8..a45c3737ad3 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,7 @@
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/jbd2.h>
+#include <linux/blkdev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -45,6 +46,7 @@
int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
.nr_to_write = 0, /* sys_fsync did this */
};
ret = sync_inode(inode, &wbc);
+ if (journal && (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
}
out:
return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7ee..c2c0a8d06d0 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
-struct buffer_head *read_block_bitmap(struct super_block *sb,
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
ext4_group_t block_group);
extern unsigned ext4_init_block_bitmap(struct super_block *sb,
struct buffer_head *bh,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c80..a92eb305344 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
struct ext4_super_block * es;
struct ext4_sb_info *sbi;
int fatal = 0, err;
+ ext4_group_t flex_group;
if (atomic_read(&inode->i_count) > 1) {
printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
if (is_directory)
percpu_counter_dec(&sbi->s_dirs_counter);
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes++;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
}
BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
return ret;
}
+#define free_block_ratio 10
+
+static int find_group_flex(struct super_block *sb, struct inode *parent,
+ ext4_group_t *best_group)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
+ struct flex_groups *flex_group = sbi->s_flex_groups;
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+ ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+ ext4_group_t ngroups = sbi->s_groups_count;
+ int flex_size = ext4_flex_bg_size(sbi);
+ ext4_group_t best_flex = parent_fbg_group;
+ int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+ int flexbg_free_blocks;
+ int flex_freeb_ratio;
+ ext4_group_t n_fbg_groups;
+ ext4_group_t i;
+
+ n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+
+find_close_to_parent:
+ flexbg_free_blocks = flex_group[best_flex].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+ if (flex_group[best_flex].free_inodes &&
+ flex_freeb_ratio > free_block_ratio)
+ goto found_flexbg;
+
+ if (best_flex && best_flex == parent_fbg_group) {
+ best_flex--;
+ goto find_close_to_parent;
+ }
+
+ for (i = 0; i < n_fbg_groups; i++) {
+ if (i == parent_fbg_group || i == parent_fbg_group - 1)
+ continue;
+
+ flexbg_free_blocks = flex_group[i].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+
+ if (flex_freeb_ratio > free_block_ratio &&
+ flex_group[i].free_inodes) {
+ best_flex = i;
+ goto found_flexbg;
+ }
+
+ if (best_flex < 0 ||
+ (flex_group[i].free_blocks >
+ flex_group[best_flex].free_blocks &&
+ flex_group[i].free_inodes))
+ best_flex = i;
+ }
+
+ if (!flex_group[best_flex].free_inodes ||
+ !flex_group[best_flex].free_blocks)
+ return -1;
+
+found_flexbg:
+ for (i = best_flex * flex_size; i < ngroups &&
+ i < (best_flex + 1) * flex_size; i++) {
+ desc = ext4_get_group_desc(sb, i, &bh);
+ if (le16_to_cpu(desc->bg_free_inodes_count)) {
+ *best_group = i;
+ goto out;
+ }
+ }
+
+ return -1;
+out:
+ return 0;
+}
+
/*
* Orlov's allocator for directories.
*
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
struct inode *ret;
ext4_group_t i;
int free = 0;
+ ext4_group_t flex_group;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;
+
+ if (sbi->s_log_groups_per_flex) {
+ ret2 = find_group_flex(sb, dir, &group);
+ goto got_group;
+ }
+
if (S_ISDIR(mode)) {
if (test_opt (sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
} else
ret2 = find_group_other(sb, dir, &group);
+got_group:
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -600,7 +689,7 @@ got:
/* We may have to initialize the block bitmap if it isn't already */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- struct buffer_head *block_bh = read_block_bitmap(sb, group);
+ struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
BUFFER_TRACE(block_bh, "get block bitmap access");
err = ext4_journal_get_write_access(handle, block_bh);
@@ -676,6 +765,13 @@ got:
percpu_counter_inc(&sbi->s_dirs_counter);
sb->s_dirt = 1;
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes--;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
inode->i_uid = current->fsuid;
if (test_opt (sb, GRPID))
inode->i_gid = dir->i_gid;
@@ -740,14 +836,10 @@ got:
goto fail_free_drop;
if (test_opt(sb, EXTENTS)) {
- /* set extent flag only for diretory, file and normal symlink*/
+ /* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
ext4_ext_tree_init(handle, inode);
- err = ext4_update_incompat_feature(handle, sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS);
- if (err)
- goto fail_free_drop;
}
}
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
if (IS_ERR(inode))
goto iget_failed;
+ /*
+ * If the orphans has i_nlinks > 0 then it should be able to be
+ * truncated, otherwise it won't be removed from the orphan list
+ * during processing and an infinite loop will result.
+ */
+ if (inode->i_nlink && !ext4_can_truncate(inode))
+ goto bad_orphan;
+
if (NEXT_ORPHAN(inode) > max_ino)
goto bad_orphan;
brelse(bitmap_bh);
@@ -838,6 +938,7 @@ bad_orphan:
printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+ printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d970774641..8ca2763df09 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "ext4_extents.h"
+
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
+{
+ return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+ new_size);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
/*
* Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
{
handle_t *handle;
+ if (ext4_should_order_data(inode))
+ ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
* direct blocks
*/
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ int indirect_blks, int blks,
+ ext4_fsblk_t new_blocks[4], int *err)
{
int target, i;
- unsigned long count = 0;
+ unsigned long count = 0, blk_allocated = 0;
int index = 0;
ext4_fsblk_t current_block = 0;
int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
* the first direct block of this branch. That's the
* minimum number of blocks need to allocate(required)
*/
- target = blks + indirect_blks;
-
- while (1) {
+ /* first we try to allocate the indirect blocks */
+ target = indirect_blks;
+ while (target > 0) {
count = target;
/* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_blocks(handle,inode,goal,&count,err);
+ current_block = ext4_new_meta_blocks(handle, inode,
+ goal, &count, err);
if (*err)
goto failed_out;
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
new_blocks[index++] = current_block++;
count--;
}
-
- if (count > 0)
+ if (count > 0) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ printk(KERN_INFO "%s returned more blocks than "
+ "requested\n", __func__);
+ WARN_ON(1);
break;
+ }
}
- /* save the new block number for the first direct block */
- new_blocks[index] = current_block;
-
+ target = blks - count ;
+ blk_allocated = count;
+ if (!target)
+ goto allocated;
+ /* Now allocate data blocks */
+ count = target;
+ /* allocating blocks for data blocks */
+ current_block = ext4_new_blocks(handle, inode, iblock,
+ goal, &count, err);
+ if (*err && (target == blks)) {
+ /*
+ * if the allocation failed and we didn't allocate
+ * any blocks before
+ */
+ goto failed_out;
+ }
+ if (!*err) {
+ if (target == blks) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ }
+ blk_allocated += count;
+ }
+allocated:
/* total number of blocks allocated for direct blocks */
- ret = count;
+ ret = blk_allocated;
*err = 0;
return ret;
failed_out:
@@ -584,8 +631,9 @@ failed_out:
* as described above and return 0.
*/
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- int indirect_blks, int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
+ ext4_lblk_t iblock, int indirect_blks,
+ int *blks, ext4_fsblk_t goal,
+ ext4_lblk_t *offsets, Indirect *branch)
{
int blocksize = inode->i_sb->s_blocksize;
int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
ext4_fsblk_t new_blocks[4];
ext4_fsblk_t current_block;
- num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
+ num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
*blks, new_blocks, &err);
if (err)
return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
struct ext4_inode_info *ei = EXT4_I(inode);
int count = 0;
ext4_fsblk_t first_block = 0;
+ loff_t disksize;
J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
/*
* Block out ext4_truncate while we alter the tree
*/
- err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
- offsets + (partial - chain), partial);
+ err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+ &count, goal,
+ offsets + (partial - chain), partial);
/*
* The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
* protect it if you're about to implement concurrent
* ext4_get_block() -bzzz
*/
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
+ if (!err && extend_disksize) {
+ disksize = ((loff_t) iblock + count) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > ei->i_disksize)
+ ei->i_disksize = disksize;
+ }
if (err)
goto cleanup;
@@ -934,7 +989,7 @@ out:
*/
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
unsigned long max_blocks, struct buffer_head *bh,
- int create, int extend_disksize)
+ int create, int extend_disksize, int flag)
{
int retval;
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
* with create == 1 flag.
*/
down_write((&EXT4_I(inode)->i_data_sem));
+
+ /*
+ * if the caller is from delayed allocation writeout path
+ * we have already reserved fs blocks for allocation
+ * let the underlying get_block() function know to
+ * avoid double accounting
+ */
+ if (flag)
+ EXT4_I(inode)->i_delalloc_reserved_flag = 1;
/*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
~EXT4_EXT_MIGRATE;
}
}
+
+ if (flag) {
+ EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+ /*
+ * Update reserved blocks/metadata blocks
+ * after successful block allocation
+ * which were deferred till now
+ */
+ if ((retval > 0) && buffer_delay(bh))
+ ext4_da_release_space(inode, retval, 0);
+ }
+
up_write((&EXT4_I(inode)->i_data_sem));
return retval;
}
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
}
ret = ext4_get_blocks_wrap(handle, inode, iblock,
- max_blocks, bh_result, create, 0);
+ max_blocks, bh_result, create, 0, 0);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history);
err = ext4_get_blocks_wrap(handle, inode, block, 1,
- &dummy, create, 1);
+ &dummy, create, 1, 0);
/*
* ext4_get_blocks_handle() returns number of blocks
* mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
to = from + len;
retry:
- page = __grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
-
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
- unlock_page(page);
- page_cache_release(page);
ret = PTR_ERR(handle);
goto out;
}
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ ext4_journal_stop(handle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ *pagep = page;
+
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
ext4_get_block);
@@ -1225,8 +1302,8 @@ retry:
}
if (ret) {
- ext4_journal_stop(handle);
unlock_page(page);
+ ext4_journal_stop(handle);
page_cache_release(page);
}
@@ -1236,15 +1313,6 @@ out:
return ret;
}
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- int err = jbd2_journal_dirty_data(handle, bh);
- if (err)
- ext4_journal_abort_handle(__func__, __func__,
- bh, handle, err);
- return err;
-}
-
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
}
/*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = file->f_mapping->host;
-
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
- }
-
- return copied;
-}
-
-/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
unsigned from, to;
int ret = 0, ret2;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext4_journal_dirty_data);
+ ret = ext4_jbd2_file_inode(handle, inode);
if (ret == 0) {
/*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
new_i_size = pos + copied;
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
int ret = 0, ret2;
loff_t new_i_size;
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
ret = ret2;
}
+ unlock_page(page);
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
page_cache_release(page);
return ret ? ret : copied;
}
+/*
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate @blocks for non extent file based file
+ */
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int ind_blks, dind_blks, tind_blks;
+
+ /* number of new indirect blocks needed */
+ ind_blks = (blocks + icap - 1) / icap;
+
+ dind_blks = (ind_blks + icap - 1) / icap;
+
+ tind_blks = 1;
+
+ return ind_blks + dind_blks + tind_blks;
+}
+
+/*
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate given number of blocks
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_calc_metadata_amount(inode, blocks);
+
+ return ext4_indirect_calc_metadata_amount(inode, blocks);
+}
+
+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned long md_needed, mdblocks, total = 0;
+
+ /*
+ * recalculate the amount of metadata blocks to reserve
+ * in order to allocate nrblocks
+ * worse case is one extent per block
+ */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+ mdblocks = ext4_calc_metadata_amount(inode, total);
+ BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+
+ md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+ total = md_needed + nrblocks;
+
+ if (ext4_has_free_blocks(sbi, total) < total) {
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return -ENOSPC;
+ }
+
+ /* reduce fs free blocks counter */
+ percpu_counter_sub(&sbi->s_freeblocks_counter, total);
+
+ EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+ EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return 0; /* success */
+}
+
+void ext4_da_release_space(struct inode *inode, int used, int to_free)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int total, mdb, mdb_free, release;
+
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ /* recalculate the number of metablocks still need to be reserved */
+ total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
+ mdb = ext4_calc_metadata_amount(inode, total);
+
+ /* figure out how many metablocks to release */
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+
+ /* Account for allocated meta_blocks */
+ mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+ release = to_free + mdb_free;
+
+ /* update fs free blocks counter for truncate case */
+ percpu_counter_add(&sbi->s_freeblocks_counter, release);
+
+ /* update per-inode reservations */
+ BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
+ EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
+
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+ EXT4_I(inode)->i_allocated_meta_blocks = 0;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+
+static void ext4_da_page_release_reservation(struct page *page,
+ unsigned long offset)
+{
+ int to_release = 0;
+ struct buffer_head *head, *bh;
+ unsigned int curr_off = 0;
+
+ head = page_buffers(page);
+ bh = head;
+ do {
+ unsigned int next_off = curr_off + bh->b_size;
+
+ if ((offset <= curr_off) && (buffer_delay(bh))) {
+ to_release++;
+ clear_buffer_delay(bh);
+ }
+ curr_off = next_off;
+ } while ((bh = bh->b_this_page) != head);
+ ext4_da_release_space(page->mapping->host, 0, to_release);
+}
+
+/*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+ struct inode *inode;
+ struct buffer_head lbh; /* extent of blocks */
+ unsigned long first_page, next_page; /* extent of pages */
+ get_block_t *get_block;
+ struct writeback_control *wbc;
+};
+
+/*
+ * mpage_da_submit_io - walks through extent of pages and try to write
+ * them with __mpage_writepage()
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated. this may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ struct mpage_data mpd_pp = {
+ .bio = NULL,
+ .last_block_in_bio = 0,
+ .get_block = mpd->get_block,
+ .use_writepage = 1,
+ };
+ int ret = 0, err, nr_pages, i;
+ unsigned long index, end;
+ struct pagevec pvec;
+
+ BUG_ON(mpd->next_page <= mpd->first_page);
+
+ pagevec_init(&pvec, 0);
+ index = mpd->first_page;
+ end = mpd->next_page - 1;
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+
+ /*
+ * In error case, we have to continue because
+ * remaining pages are still locked
+ * XXX: unlock and re-dirty them?
+ */
+ if (ret == 0)
+ ret = err;
+ }
+ pagevec_release(&pvec);
+ }
+ if (mpd_pp.bio)
+ mpage_bio_submit(WRITE, mpd_pp.bio);
+
+ return ret;
+}
+
+/*
+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+ *
+ * @mpd->inode - inode to walk through
+ * @exbh->b_blocknr - first block on a disk
+ * @exbh->b_size - amount of space in bytes
+ * @logical - first logical block to start assignment with
+ *
+ * the function goes through all passed space and put actual disk
+ * block numbers into buffer heads, dropping BH_Delay
+ */
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
+ struct buffer_head *exbh)
+{
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
+ int blocks = exbh->b_size >> inode->i_blkbits;
+ sector_t pblock = exbh->b_blocknr, cur_logical;
+ struct buffer_head *head, *bh;
+ unsigned long index, end;
+ struct pagevec pvec;
+ int nr_pages, i;
+
+ index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ pagevec_init(&pvec, 0);
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+ BUG_ON(!page_has_buffers(page));
+
+ bh = page_buffers(page);
+ head = bh;
+
+ /* skip blocks out of the range */
+ do {
+ if (cur_logical >= logical)
+ break;
+ cur_logical++;
+ } while ((bh = bh->b_this_page) != head);
+
+ do {
+ if (cur_logical >= logical + blocks)
+ break;
+ if (buffer_delay(bh)) {
+ bh->b_blocknr = pblock;
+ clear_buffer_delay(bh);
+ } else if (buffer_mapped(bh))
+ BUG_ON(bh->b_blocknr != pblock);
+
+ cur_logical++;
+ pblock++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+ pagevec_release(&pvec);
+ }
+}
+
+
+/*
+ * __unmap_underlying_blocks - just a helper function to unmap
+ * set of blocks described by @bh
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+ struct buffer_head *bh)
+{
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int blocks, i;
+
+ blocks = bh->b_size >> inode->i_blkbits;
+ for (i = 0; i < blocks; i++)
+ unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+}
+
+/*
+ * mpage_da_map_blocks - go through given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * The function ignores errors ->get_block() returns, thus real
+ * error handling is postponed to __mpage_writepage()
+ */
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+ struct buffer_head *lbh = &mpd->lbh;
+ int err = 0, remain = lbh->b_size;
+ sector_t next = lbh->b_blocknr;
+ struct buffer_head new;
+
+ /*
+ * We consider only non-mapped and non-allocated blocks
+ */
+ if (buffer_mapped(lbh) && !buffer_delay(lbh))
+ return;
+
+ while (remain) {
+ new.b_state = lbh->b_state;
+ new.b_blocknr = 0;
+ new.b_size = remain;
+ err = mpd->get_block(mpd->inode, next, &new, 1);
+ if (err) {
+ /*
+ * Rather than implement own error handling
+ * here, we just leave remaining blocks
+ * unallocated and try again with ->writepage()
+ */
+ break;
+ }
+ BUG_ON(new.b_size == 0);
+
+ if (buffer_new(&new))
+ __unmap_underlying_blocks(mpd->inode, &new);
+
+ /*
+ * If blocks are delayed marked, we need to
+ * put actual blocknr and drop delayed bit
+ */
+ if (buffer_delay(lbh))
+ mpage_put_bnr_to_bhs(mpd, next, &new);
+
+ /* go for the remaining blocks */
+ next += new.b_size >> mpd->inode->i_blkbits;
+ remain -= new.b_size;
+ }
+}
+
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+
+/*
+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
+ *
+ * @mpd->lbh - extent of blocks
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block (used to access block's state)
+ *
+ * the function is used to collect contig. blocks in same state
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+ sector_t logical, struct buffer_head *bh)
+{
+ struct buffer_head *lbh = &mpd->lbh;
+ sector_t next;
+
+ next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
+
+ /*
+ * First block in the extent
+ */
+ if (lbh->b_size == 0) {
+ lbh->b_blocknr = logical;
+ lbh->b_size = bh->b_size;
+ lbh->b_state = bh->b_state & BH_FLAGS;
+ return;
+ }
+
+ /*
+ * Can we merge the block to our big extent?
+ */
+ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+ lbh->b_size += bh->b_size;
+ return;
+ }
+
+ /*
+ * We couldn't merge the block to our extent, so we
+ * need to flush current extent and start new one
+ */
+ mpage_da_map_blocks(mpd);
+
+ /*
+ * Now start a new extent
+ */
+ lbh->b_size = bh->b_size;
+ lbh->b_state = bh->b_state & BH_FLAGS;
+ lbh->b_blocknr = logical;
+}
+
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scan them for all blocks.
+ */
+static int __mpage_da_writepage(struct page *page,
+ struct writeback_control *wbc, void *data)
+{
+ struct mpage_da_data *mpd = data;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *bh, *head, fake;
+ sector_t logical;
+
+ /*
+ * Can we merge this page to current extent?
+ */
+ if (mpd->next_page != page->index) {
+ /*
+ * Nope, we can't. So, we map non-allocated blocks
+ * and start IO on them using __mpage_writepage()
+ */
+ if (mpd->next_page != mpd->first_page) {
+ mpage_da_map_blocks(mpd);
+ mpage_da_submit_io(mpd);
+ }
+
+ /*
+ * Start next extent of pages ...
+ */
+ mpd->first_page = page->index;
+
+ /*
+ * ... and blocks
+ */
+ mpd->lbh.b_size = 0;
+ mpd->lbh.b_state = 0;
+ mpd->lbh.b_blocknr = 0;
+ }
+
+ mpd->next_page = page->index + 1;
+ logical = (sector_t) page->index <<
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ if (!page_has_buffers(page)) {
+ /*
+ * There is no attached buffer heads yet (mmap?)
+ * we treat the page asfull of dirty blocks
+ */
+ bh = &fake;
+ bh->b_size = PAGE_CACHE_SIZE;
+ bh->b_state = 0;
+ set_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ } else {
+ /*
+ * Page with regular buffer heads, just add all dirty ones
+ */
+ head = page_buffers(page);
+ bh = head;
+ do {
+ BUG_ON(buffer_locked(bh));
+ if (buffer_dirty(bh))
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ logical++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+
+ return 0;
+}
+
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocates non-allocated blocks, maps newly-allocated
+ * blocks to existing bhs and issue IO them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bio per page, etc, we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
+ *
+ * See comments to mpage_writepages()
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ get_block_t get_block)
+{
+ struct mpage_da_data mpd;
+ int ret;
+
+ if (!get_block)
+ return generic_writepages(mapping, wbc);
+
+ mpd.wbc = wbc;
+ mpd.inode = mapping->host;
+ mpd.lbh.b_size = 0;
+ mpd.lbh.b_state = 0;
+ mpd.lbh.b_blocknr = 0;
+ mpd.first_page = 0;
+ mpd.next_page = 0;
+ mpd.get_block = get_block;
+
+ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+
+ /*
+ * Handle last extent of pages
+ */
+ if (mpd.next_page != mpd.first_page) {
+ mpage_da_map_blocks(&mpd);
+ mpage_da_submit_io(&mpd);
+ }
+
+ return ret;
+}
+
+/*
+ * this is a special callback for ->write_begin() only
+ * it's intention is to return mapped block or reserve space
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+
+ BUG_ON(create == 0);
+ BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+ /*
+ * first, we need to know whether the block is allocated already
+ * preallocated blocks are unmapped but should treated
+ * the same as allocated blocks.
+ */
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
+ if ((ret == 0) && !buffer_delay(bh_result)) {
+ /* the block isn't (pre)allocated yet, let's reserve space */
+ /*
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
+ */
+ ret = ext4_da_reserve_space(inode, 1);
+ if (ret)
+ /* not enough space to reserve */
+ return ret;
+
+ map_bh(bh_result, inode->i_sb, 0);
+ set_buffer_new(bh_result);
+ set_buffer_delay(bh_result);
+ } else if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+
+ return ret;
+}
+#define EXT4_DELALLOC_RSVED 1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ loff_t disksize = EXT4_I(inode)->i_disksize;
+ handle_t *handle = NULL;
+
+ handle = ext4_journal_current_handle();
+ if (!handle) {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, 0, 0, 0);
+ BUG_ON(!ret);
+ } else {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
+ }
+
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+
+ /*
+ * Update on-disk size along with block allocation
+ * we don't use 'extend_disksize' as size may change
+ * within already allocated block -bzzz
+ */
+ disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ /*
+ * XXX: replace with spinlock if seen contended -bzzz
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+ if (EXT4_I(inode)->i_disksize == disksize) {
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
+ }
+ }
+ ret = 0;
+ }
+ return ret;
+}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+ /*
+ * unmapped buffer is possible for holes.
+ * delay buffer is possible with delayed allocation
+ */
+ return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
+}
+
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+
+ /*
+ * we don't want to do block allocation in writepage
+ * so call get_block_wrap with create = 0
+ */
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
+ bh_result, 0, 0, 0);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ return ret;
+}
+
+/*
+ * get called vi ext4_da_writepages after taking page lock (have journal handle)
+ * get called via journal_submit_inode_data_buffers (no journal handle)
+ * get called via shrink_page_list via pdflush (no journal handle)
+ * or grab_page_cache when doing write_begin (have journal handle)
+ */
+static int ext4_da_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ loff_t size;
+ unsigned long len;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;
+
+ size = i_size_read(inode);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ page_bufs = page_buffers(page);
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ /*
+ * We don't want to do block allocation
+ * So redirty the page and return
+ * We may reach here when we do a journal commit
+ * via journal_submit_inode_data_buffers.
+ * If we don't have mapping block we just ignore
+ * them. We can also reach here via shrink_page_list
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * The test for page_has_buffers() is subtle:
+ * We know the page is dirty but it lost buffers. That means
+ * that at some moment in time after write_begin()/write_end()
+ * has been called all buffers have been clean and thus they
+ * must have been written at least once. So they are all
+ * mapped and we can happily proceed with mapping them
+ * and writing the page.
+ *
+ * Try to initialize the buffer_heads and check whether
+ * all are mapped and non delay. We don't want to
+ * do block allocation here.
+ */
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (!ret) {
+ page_bufs = page_buffers(page);
+ /* check whether all are mapped and non delay */
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * We can't do block allocation here
+ * so just redity the page and unlock
+ * and return
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+ else
+ ret = block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+
+ return ret;
+}
+
+/*
+ * For now just follow the DIO way to estimate the max credits
+ * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * todo: need to calculate the max credits need for
+ * extent based files, currently the DIO credits is based on
+ * indirect-blocks mapping way.
+ *
+ * Probably should have a generic way to calculate credits
+ * for DIO, writepages, and truncate
+ */
+#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS
+#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS
+
+static int ext4_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ handle_t *handle = NULL;
+ int needed_blocks;
+ int ret = 0;
+ long to_write;
+ loff_t range_start = 0;
+
+ /*
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
+ */
+ if (!mapping->nrpages)
+ return 0;
+
+ /*
+ * Estimate the worse case needed credits to write out
+ * EXT4_MAX_BUF_BLOCKS pages
+ */
+ needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+
+ to_write = wbc->nr_to_write;
+ if (!wbc->range_cyclic) {
+ /*
+ * If range_cyclic is not set force range_cont
+ * and save the old writeback_index
+ */
+ wbc->range_cont = 1;
+ range_start = wbc->range_start;
+ }
+
+ while (!ret && to_write) {
+ /* start a new transaction*/
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ if (ext4_should_order_data(inode)) {
+ /*
+ * With ordered mode we need to add
+ * the inode to the journal handle
+ * when we do block allocation.
+ */
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out_writepages;
+ }
+
+ }
+ /*
+ * set the max dirty pages could be write at a time
+ * to fit into the reserved transaction credits
+ */
+ if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+ wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+
+ to_write -= wbc->nr_to_write;
+ ret = mpage_da_writepages(mapping, wbc,
+ ext4_da_get_block_write);
+ ext4_journal_stop(handle);
+ if (wbc->nr_to_write) {
+ /*
+ * There is no more writeout needed
+ * or we requested for a noblocking writeout
+ * and we found the device congested
+ */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
+ }
+
+out_writepages:
+ wbc->nr_to_write = to_write;
+ if (range_start)
+ wbc->range_start = range_start;
+ return ret;
+}
+
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret, retries = 0;
+ struct page *page;
+ pgoff_t index;
+ unsigned from, to;
+ struct inode *inode = mapping->host;
+ handle_t *handle;
+
+ index = pos >> PAGE_CACHE_SHIFT;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
+
+retry:
+ /*
+ * With delayed allocation, we don't log the i_disksize update
+ * if there is delayed block allocation. But we still need
+ * to journalling the i_disksize update if writes to the end
+ * of file which has an already mapped buffer.
+ */
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ page = __grab_cache_page(mapping, index);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ext4_da_get_block_prep);
+ if (ret < 0) {
+ unlock_page(page);
+ ext4_journal_stop(handle);
+ page_cache_release(page);
+ }
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+out:
+ return ret;
+}
+
+/*
+ * Check if we should update i_disksize
+ * when write to the end of file but not require block allocation
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+ unsigned long offset)
+{
+ struct buffer_head *bh;
+ struct inode *inode = page->mapping->host;
+ unsigned int idx;
+ int i;
+
+ bh = page_buffers(page);
+ idx = offset >> inode->i_blkbits;
+
+ for (i=0; i < idx; i++)
+ bh = bh->b_this_page;
+
+ if (!buffer_mapped(bh) || (buffer_delay(bh)))
+ return 0;
+ return 1;
+}
+
+static int ext4_da_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+ handle_t *handle = ext4_journal_current_handle();
+ loff_t new_i_size;
+ unsigned long start, end;
+
+ start = pos & (PAGE_CACHE_SIZE - 1);
+ end = start + copied -1;
+
+ /*
+ * generic_write_end() will run mark_inode_dirty() if i_size
+ * changes. So let's piggyback the i_disksize mark_inode_dirty
+ * into that.
+ */
+
+ new_i_size = pos + copied;
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ if (ext4_da_should_update_i_disksize(page, end)) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ /*
+ * Updating i_disksize when extending file
+ * without needing block allocation
+ */
+ if (ext4_should_order_data(inode))
+ ret = ext4_jbd2_file_inode(handle,
+ inode);
+
+ EXT4_I(inode)->i_disksize = new_i_size;
+ }
+ up_write(&EXT4_I(inode)->i_data_sem);
+ }
+ }
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
+ page, fsdata);
+ copied = ret2;
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = ext4_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+
+ return ret ? ret : copied;
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+ /*
+ * Drop reserved blocks
+ */
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ goto out;
+
+ ext4_da_page_release_reservation(page, offset);
+
+out:
+ ext4_invalidatepage(page, offset);
+
+ return;
+}
+
/*
* bmap() is special. It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ test_opt(inode->i_sb, DELALLOC)) {
+ /*
+ * With delalloc we want to sync the file
+ * so that we can make sure we allocate
+ * blocks for file
+ */
+ filemap_write_and_wait(mapping);
+ }
+
if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
/*
* This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
- if (buffer_mapped(bh))
- return ext4_journal_dirty_data(handle, bh);
- return 0;
-}
-
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext4_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
- * refers to old data.
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), noone guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
*
- * In all journalling modes block_write_full_page() will start the I/O.
+ * In all journaling modes block_write_full_page() will start the I/O.
*
* Problem:
*
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
* disastrous. Any write() or metadata operation will sync the fs for
* us.
*
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
-static int ext4_ordered_writepage(struct page *page,
+static int __ext4_normal_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- struct buffer_head *page_bufs;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
-
- J_ASSERT(PageLocked(page));
-
- /*
- * We give up here if we're reentered, because it might be for a
- * different filesystem.
- */
- if (ext4_journal_current_handle())
- goto out_fail;
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (test_opt(inode->i_sb, NOBH))
+ return nobh_writepage(page,
+ ext4_normal_get_block_write, wbc);
+ else
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+}
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
- }
+static int ext4_normal_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* if page has buffers it should all be mapped
+ * and allocated. If there are not buffers attached
+ * to the page we know the page is dirty but it lost
+ * buffers. That means that at some moment in time
+ * after write_begin() / write_end() has been called
+ * all buffers have been clean and thus they must have been
+ * written at least once. So they are all mapped and we can
+ * happily proceed with mapping them and writing the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
}
- page_bufs = page_buffers(page);
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bget_one);
-
- ret = block_write_full_page(page, ext4_get_block, wbc);
- /*
- * The page can become unlocked at any point now, and
- * truncate can then come in and change things. So we
- * can't touch *page from now on. But *page_bufs is
- * safe due to elevated refcount.
- */
+ if (!ext4_journal_current_handle())
+ return __ext4_normal_writepage(page, wbc);
- /*
- * And attach them to the current transaction. But only if
- * block_write_full_page() succeeded. Otherwise they are unmapped,
- * and generally junk.
- */
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
- NULL, jbd2_journal_dirty_data_fn);
- if (!ret)
- ret = err;
- }
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bput_one);
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-
-out_fail:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
- return ret;
+ return 0;
}
-static int ext4_writeback_writepage(struct page *page,
+static int __ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head *page_bufs;
handle_t *handle = NULL;
int ret = 0;
int err;
- if (ext4_journal_current_handle())
- goto out_fail;
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (ret != 0)
+ goto out_unlock;
+
+ page_bufs = page_buffers(page);
+ walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+ bget_one);
+ /* As soon as we unlock the page, it can go away, but we have
+ * references to buffers so we are safe */
+ unlock_page(page);
handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_fail;
+ goto out;
}
- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
- else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ ret = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+ err = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, write_end_fn);
+ if (ret == 0)
+ ret = err;
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
- return ret;
-out_fail:
- redirty_page_for_writepage(wbc, page);
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bput_one);
+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ goto out;
+
+out_unlock:
unlock_page(page);
+out:
return ret;
}
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (ext4_journal_current_handle())
- goto no_write;
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* if page has buffers it should all be mapped
+ * and allocated. If there are not buffers attached
+ * to the page we know the page is dirty but it lost
+ * buffers. That means that at some moment in time
+ * after write_begin() / write_end() has been called
+ * all buffers have been clean and thus they must have been
+ * written at least once. So they are all mapped and we can
+ * happily proceed with mapping them and writing the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
+ }
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
+ if (ext4_journal_current_handle())
goto no_write;
- }
- if (!page_has_buffers(page) || PageChecked(page)) {
+ if (PageChecked(page)) {
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext4_get_block);
- if (ret != 0) {
- ext4_journal_stop(handle);
- goto out_unlock;
- }
- ret = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
- err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, write_end_fn);
- if (ret == 0)
- ret = err;
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- unlock_page(page);
+ return __ext4_journalled_writepage(page, wbc);
} else {
/*
* It may be a page full of checkpoint-mode buffers. We don't
* really know unless we go poke around in the buffer_heads.
* But block_write_full_page will do the right thing.
*/
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
}
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
-out:
- return ret;
-
no_write:
redirty_page_for_writepage(wbc, page);
-out_unlock:
unlock_page(page);
- goto out;
+ return 0;
}
static int ext4_readpage(struct file *file, struct page *page)
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
static const struct address_space_operations ext4_ordered_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_ordered_writepage,
+ .writepage = ext4_normal_writepage,
.sync_page = block_sync_page,
.write_begin = ext4_write_begin,
.write_end = ext4_ordered_write_end,
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = {
static const struct address_space_operations ext4_writeback_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_writeback_writepage,
+ .writepage = ext4_normal_writepage,
.sync_page = block_sync_page,
.write_begin = ext4_write_begin,
.write_end = ext4_writeback_write_end,
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = {
.releasepage = ext4_releasepage,
};
+static const struct address_space_operations ext4_da_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_da_writepage,
+ .writepages = ext4_da_writepages,
+ .sync_page = block_sync_page,
+ .write_begin = ext4_da_write_begin,
+ .write_end = ext4_da_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_da_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+};
+
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
+ if (ext4_should_order_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else if (ext4_should_order_data(inode))
inode->i_mapping->a_ops = &ext4_ordered_aops;
+ else if (ext4_should_writeback_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
else if (ext4_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext4_writeback_aops;
else
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode)
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
*/
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
+int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
+ page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return -EINVAL;
+
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
err = ext4_journal_dirty_metadata(handle, bh);
} else {
if (ext4_should_order_data(inode))
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if (this_bh) {
BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, this_bh);
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if (bh2jh(this_bh))
+ ext4_journal_dirty_metadata(handle, this_bh);
+ else
+ ext4_error(inode->i_sb, __func__,
+ "circular indirect block detected, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long) this_bh->b_blocknr);
}
}
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
}
+int ext4_can_truncate(struct inode *inode)
+{
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return 0;
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext4_inode_is_fast_symlink(inode);
+ return 0;
+}
+
/*
* ext4_truncate()
*
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode)
int n;
ext4_lblk_t last_block;
unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (ext4_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ if (!ext4_can_truncate(inode))
return;
- /*
- * We have to lock the EOF page here, because lock_page() nests
- * outside jbd2_journal_start().
- */
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- return;
- }
-
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- ext4_ext_truncate(inode, page);
+ ext4_ext_truncate(inode);
return;
}
handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return; /* AKPM: return what? */
- }
last_block = (inode->i_size + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (blocksize - 1))
+ if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+ goto out_stop;
n = ext4_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode)
goto out_stop;
/*
+ * From here we block out all ext4_get_block() callers who want to
+ * modify the block allocation tree.
+ */
+ down_write(&ei->i_data_sem);
+ /*
* The orphan list entry will now protect us from any crash which
* occurs before the truncate completes, so it is now safe to propagate
* the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
*/
ei->i_disksize = inode->i_size;
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
-
if (n == 1) { /* direct blocks */
ext4_free_data(handle, inode, NULL, i_data+offsets[0],
i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait)
* be freed, so we have a strong guarantee that no future commit will
* leave these blocks visible to the user.)
*
- * Called with inode->sem down.
+ * Another thing we have to assure is that if we are in ordered mode
+ * and inode is still attached to the committing transaction, we must
+ * we start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
*/
int ext4_setattr(struct dentry *dentry, struct iattr *attr)
{
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (!error)
error = rc;
ext4_journal_stop(handle);
+
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (error) {
+ /* Do as much error cleanup as possible */
+ handle = ext4_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ goto err_out;
+ }
+ }
}
rc = inode_setattr(inode, attr);
@@ -3193,6 +4231,32 @@ err_out:
return error;
}
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode;
+ unsigned long delalloc_blocks;
+
+ inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+
+ /*
+ * We can't update i_blocks if the block allocation is delayed
+ * otherwise in the case of system crash before the real block
+ * allocation is done, we will have i_blocks inconsistent with
+ * on-disk file blocks.
+ * We always keep i_blocks updated together with real
+ * allocation. But to not confuse with user, stat
+ * will return the blocks that include the delayed allocation
+ * blocks for this file.
+ */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+ return 0;
+}
/*
* How many blocks doth make a writepage()?
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh);
+}
+
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ loff_t size;
+ unsigned long len;
+ int ret = -EINVAL;
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+ * get i_mutex because we are already holding mmap_sem.
+ */
+ down_read(&inode->i_alloc_sem);
+ size = i_size_read(inode);
+ if (page->mapping != mapping || size <= page_offset(page)
+ || !PageUptodate(page)) {
+ /* page got truncated from under us? */
+ goto out_unlock;
+ }
+ ret = 0;
+ if (PageMappedToDisk(page))
+ goto out_unlock;
+
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* return if we have all the buffers mapped */
+ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped))
+ goto out_unlock;
+ }
+ /*
+ * OK, we need to fill the hole... Do write_begin write_end
+ * to do block allocation/reservation.We are not holding
+ * inode.i__mutex here. That allow * parallel write_begin,
+ * write_end call. lock_page prevent this from happening
+ * on the same page though
+ */
+ ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
+ len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+ if (ret < 0)
+ goto out_unlock;
+ ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
+ len, len, page, NULL);
+ if (ret < 0)
+ goto out_unlock;
+ ret = 0;
+out_unlock:
+ up_read(&inode->i_alloc_sem);
+ return ret;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c9900aade15..8d141a25bbe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_zero_bit(addr, max, start) - fix;
+ ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static inline int mb_find_next_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_bit(addr, max, start) - fix;
+ ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (!buffer_uptodate(bh[i]))
goto out;
+ err = 0;
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
int group;
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
int pnum;
int poff;
struct page *page;
+ int ret;
mb_debug("load group %lu\n", group);
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
mb_cmp_bitmaps(e4b, page_address(page) +
(poff * sb->s_blocksize));
}
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
- if (!PageUptodate(page))
- ext4_mb_init_cache(page, e4b->bd_bitmap);
-
+ if (!PageUptodate(page)) {
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ }
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -962,7 +982,7 @@ err:
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
- return -EIO;
+ return ret;
}
static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
}
}
-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int first, int count)
{
int block = 0;
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
blocknr += block;
blocknr +=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
+ ext4_unlock_group(sb, e4b->bd_group);
ext4_error(sb, __func__, "double-free of inode"
" %lu's block %llu(bit %u in group %lu)\n",
inode ? inode->i_ino : 0, blocknr, block,
e4b->bd_group);
+ ext4_lock_group(sb, e4b->bd_group);
}
mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
} while (1);
}
mb_check_buddy(e4b);
-
- return 0;
}
static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
-
- /* searching for the right group start from the goal value specified */
- group = ac->ac_g_ex.fe_group;
-
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
repeat:
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
+ /*
+ * searching for the right group start
+ * from the goal value specified
+ */
+ group = ac->ac_g_ex.fe_group;
+
for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
struct ext4_group_info *grp;
struct ext4_group_desc *desc;
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
int rc;
int size;
+ if (unlikely(sbi->s_mb_history == NULL))
+ return -ENOMEM;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s == NULL)
return -ENOMEM;
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb)
sbi->s_mb_history_cur = 0;
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
- if (likely(sbi->s_mb_history != NULL))
- memset(sbi->s_mb_history, 0, i);
+ sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
/* if we can't allocate history, then we simple won't use it */
}
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
#define ext4_mb_history_init(sb)
#endif
+
+/* Create and initialize ext4_group_info data for the given group. */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ int i, len;
+ int metalen = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_info **meta_group_info;
+
+ /*
+ * First check if this group is the first of a reserved block.
+ * If it's true, we have to allocate a new table of pointers
+ * to ext4_group_info structures
+ */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+ metalen = sizeof(*meta_group_info) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ meta_group_info = kmalloc(metalen, GFP_KERNEL);
+ if (meta_group_info == NULL) {
+ printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+ "buddy group\n");
+ goto exit_meta_group_info;
+ }
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
+ meta_group_info;
+ }
+
+ /*
+ * calculate needed size. if change bb_counters size,
+ * don't forget about ext4_mb_generate_buddy()
+ */
+ len = offsetof(typeof(**meta_group_info),
+ bb_counters[sb->s_blocksize_bits + 2]);
+
+ meta_group_info =
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+ i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+
+ meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+ if (meta_group_info[i] == NULL) {
+ printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+ goto exit_group_info;
+ }
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+ &(meta_group_info[i]->bb_state));
+
+ /*
+ * initialize bb_free to be able to skip
+ * empty groups without initialization
+ */
+ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ meta_group_info[i]->bb_free =
+ ext4_free_blocks_after_init(sb, group, desc);
+ } else {
+ meta_group_info[i]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+
+#ifdef DOUBLE_CHECK
+ {
+ struct buffer_head *bh;
+ meta_group_info[i]->bb_bitmap =
+ kmalloc(sb->s_blocksize, GFP_KERNEL);
+ BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+ bh = ext4_read_block_bitmap(sb, group);
+ BUG_ON(bh == NULL);
+ memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+ sb->s_blocksize);
+ put_bh(bh);
+ }
+#endif
+
+ return 0;
+
+exit_group_info:
+ /* If a meta_group_info table has been allocated, release it now */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
+ kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+exit_meta_group_info:
+ return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
+
+/*
+ * Add a group to the existing groups.
+ * This function is used for online resize
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+ int err;
+
+ /* Add group based on group descriptor*/
+ err = ext4_mb_add_groupinfo(sb, group, desc);
+ if (err)
+ return err;
+
+ /*
+ * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
+ * datas) are set not up to date so that they will be re-initilaized
+ * during the next call to ext4_mb_load_buddy
+ */
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ return 0;
+}
+
+/*
+ * Update an existing group.
+ * This function is used for online resize
+ */
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
+{
+ grp->bb_free += add;
+}
+
static int ext4_mb_init_backend(struct super_block *sb)
{
ext4_group_t i;
- int j, len, metalen;
+ int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int num_meta_group_infos =
- (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
- EXT4_DESC_PER_BLOCK_BITS(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int num_meta_group_infos;
+ int num_meta_group_infos_max;
+ int array_size;
struct ext4_group_info **meta_group_info;
+ struct ext4_group_desc *desc;
+
+ /* This is the number of blocks used by GDT */
+ num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+ 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+
+ /*
+ * This is the total number of blocks used by GDT including
+ * the number of reserved blocks for GDT.
+ * The s_group_info array is allocated with this value
+ * to allow a clean online resize without a complex
+ * manipulation of pointer.
+ * The drawback is the unused memory when no resize
+ * occurs but it's very low in terms of pages
+ * (see comments below)
+ * Need to handle this properly when META_BG resizing is allowed
+ */
+ num_meta_group_infos_max = num_meta_group_infos +
+ le16_to_cpu(es->s_reserved_gdt_blocks);
+ /*
+ * array_size is the size of s_group_info array. We round it
+ * to the next power of two because this approximation is done
+ * internally by kmalloc so we can have some more memory
+ * for free here (e.g. may be used for META_BG resize).
+ */
+ array_size = 1;
+ while (array_size < sizeof(*sbi->s_group_info) *
+ num_meta_group_infos_max)
+ array_size = array_size << 1;
/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
* kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
* So a two level scheme suffices for now. */
- sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
- num_meta_group_infos, GFP_KERNEL);
+ sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
if (sbi->s_group_info == NULL) {
printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
return -ENOMEM;
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
sbi->s_group_info[i] = meta_group_info;
}
- /*
- * calculate needed size. if change bb_counters size,
- * don't forget about ext4_mb_generate_buddy()
- */
- len = sizeof(struct ext4_group_info);
- len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
for (i = 0; i < sbi->s_groups_count; i++) {
- struct ext4_group_desc *desc;
-
- meta_group_info =
- sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
- j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
-
- meta_group_info[j] = kzalloc(len, GFP_KERNEL);
- if (meta_group_info[j] == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
- goto err_freebuddy;
- }
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
printk(KERN_ERR
"EXT4-fs: can't read descriptor %lu\n", i);
- i++;
goto err_freebuddy;
}
- memset(meta_group_info[j], 0, len);
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
- &(meta_group_info[j]->bb_state));
-
- /*
- * initialize bb_free to be able to skip
- * empty groups without initialization
- */
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- meta_group_info[j]->bb_free =
- ext4_free_blocks_after_init(sb, i, desc);
- } else {
- meta_group_info[j]->bb_free =
- le16_to_cpu(desc->bg_free_blocks_count);
- }
-
- INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
-
-#ifdef DOUBLE_CHECK
- {
- struct buffer_head *bh;
- meta_group_info[j]->bb_bitmap =
- kmalloc(sb->s_blocksize, GFP_KERNEL);
- BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
- bh = read_block_bitmap(sb, i);
- BUG_ON(bh == NULL);
- memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
- sb->s_blocksize);
- put_bh(bh);
- }
-#endif
-
+ if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
+ goto err_freebuddy;
}
return 0;
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
unsigned i;
unsigned offset;
unsigned max;
+ int ret;
if (!test_opt(sb, MBALLOC))
return 0;
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
} while (i <= sb->s_blocksize_bits + 1);
/* init file for buddy data */
- i = ext4_mb_init_backend(sb);
- if (i) {
+ ret = ext4_mb_init_backend(sb);
+ if (ret != 0) {
clear_opt(sbi->s_mount_opt, MBALLOC);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
- return i;
+ return ret;
}
spin_lock_init(&sbi->s_md_lock);
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
ext4_lock_group(sb, md->group);
for (i = 0; i < md->num; i++) {
mb_debug(" %u", md->blocks[i]);
- err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
- BUG_ON(err != 0);
+ mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
}
mb_debug("\n");
ext4_unlock_group(sb, md->group);
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
-#define MB_PROC_VALUE_READ(name) \
-static int ext4_mb_read_##name(char *page, char **start, \
- off_t off, int count, int *eof, void *data) \
+#define MB_PROC_FOPS(name) \
+static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
{ \
- struct ext4_sb_info *sbi = data; \
- int len; \
- *eof = 1; \
- if (off != 0) \
- return 0; \
- len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
- *start = page; \
- return len; \
-}
-
-#define MB_PROC_VALUE_WRITE(name) \
-static int ext4_mb_write_##name(struct file *file, \
- const char __user *buf, unsigned long cnt, void *data) \
+ struct ext4_sb_info *sbi = m->private; \
+ \
+ seq_printf(m, "%ld\n", sbi->s_mb_##name); \
+ return 0; \
+} \
+ \
+static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
+{ \
+ return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
+} \
+ \
+static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
+ const char __user *buf, size_t cnt, loff_t *ppos) \
{ \
- struct ext4_sb_info *sbi = data; \
+ struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
char str[32]; \
long value; \
if (cnt >= sizeof(str)) \
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file, \
return -ERANGE; \
sbi->s_mb_##name = value; \
return cnt; \
-}
+} \
+ \
+static const struct file_operations ext4_mb_##name##_proc_fops = { \
+ .owner = THIS_MODULE, \
+ .open = ext4_mb_##name##_proc_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+ .write = ext4_mb_##name##_proc_write, \
+};
-MB_PROC_VALUE_READ(stats);
-MB_PROC_VALUE_WRITE(stats);
-MB_PROC_VALUE_READ(max_to_scan);
-MB_PROC_VALUE_WRITE(max_to_scan);
-MB_PROC_VALUE_READ(min_to_scan);
-MB_PROC_VALUE_WRITE(min_to_scan);
-MB_PROC_VALUE_READ(order2_reqs);
-MB_PROC_VALUE_WRITE(order2_reqs);
-MB_PROC_VALUE_READ(stream_request);
-MB_PROC_VALUE_WRITE(stream_request);
-MB_PROC_VALUE_READ(group_prealloc);
-MB_PROC_VALUE_WRITE(group_prealloc);
+MB_PROC_FOPS(stats);
+MB_PROC_FOPS(max_to_scan);
+MB_PROC_FOPS(min_to_scan);
+MB_PROC_FOPS(order2_reqs);
+MB_PROC_FOPS(stream_request);
+MB_PROC_FOPS(group_prealloc);
#define MB_PROC_HANDLER(name, var) \
do { \
- proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
+ proc = proc_create_data(name, mode, sbi->s_mb_proc, \
+ &ext4_mb_##var##_proc_fops, sbi); \
if (proc == NULL) { \
printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
goto err_out; \
} \
- proc->data = sbi; \
- proc->read_proc = ext4_mb_read_##var ; \
- proc->write_proc = ext4_mb_write_##var; \
} while (0)
static int ext4_mb_init_per_dev_proc(struct super_block *sb)
@@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
struct proc_dir_entry *proc;
char devname[64];
+ if (proc_root_ext4 == NULL) {
+ sbi->s_mb_proc = NULL;
+ return -EINVAL;
+ }
bdevname(sb->s_bdev, devname);
sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
@@ -2747,7 +2895,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
err = -EIO;
- bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
if (!bitmap_bh)
goto out_err;
@@ -2816,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+
+ /*
+ * free blocks account has already be reduced/reserved
+ * at write_begin() time for delayed allocation
+ * do not double accounting
+ */
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+ percpu_counter_sub(&sbi->s_freeblocks_counter,
+ ac->ac_b_ex.fe_len);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi,
+ ac->ac_b_ex.fe_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
if (err)
@@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (bit >= end)
break;
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
- if (next > end)
- next = end;
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
le32_to_cpu(sbi->s_es->s_first_data_block);
mb_debug(" free preallocated %u/%u in group %u\n",
@@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
if (list_empty(&grp->bb_prealloc_list))
return 0;
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
/* error handling here */
ext4_mb_release_desc(&e4b);
@@ -3743,7 +3905,7 @@ repeat:
err = ext4_mb_load_buddy(sb, group, &e4b);
BUG_ON(err != 0); /* error handling here */
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
/* error handling here */
ext4_mb_release_desc(&e4b);
@@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
sbi = EXT4_SB(sb);
if (!test_opt(sb, MBALLOC)) {
- block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+ block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
&(ar->len), errp);
return block;
}
+ if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ ar->len = ext4_has_free_blocks(sbi, ar->len);
+ }
+
+ if (ar->len == 0) {
+ *errp = -ENOSPC;
+ return 0;
+ }
while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
inquota = ar->len;
+ if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+ ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
+ ar->len = 0;
*errp = -ENOMEM;
- return 0;
+ goto out1;
}
ext4_mb_poll_new_transaction(sb, handle);
@@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
- goto out;
+ goto out2;
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
if (!ext4_mb_use_preallocated(ac)) {
-
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
repeat:
@@ -4085,11 +4261,12 @@ repeat:
ext4_mb_release_context(ac);
-out:
+out2:
+ kmem_cache_free(ext4_ac_cachep, ac);
+out1:
if (ar->len < inquota)
DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
- kmem_cache_free(ext4_ac_cachep, ac);
return block;
}
static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4242,7 +4419,7 @@ do_more:
overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4309,10 +4486,9 @@ do_more:
ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
} else {
ext4_lock_group(sb, block_group);
- err = mb_free_blocks(inode, &e4b, bit, count);
+ mb_free_blocks(inode, &e4b, bit, count);
ext4_mb_return_to_preallocation(inode, &e4b, block, count);
ext4_unlock_group(sb, block_group);
- BUG_ON(err != 0);
}
spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4321,6 +4497,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
ext4_mb_release_desc(&e4b);
*freed += count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830..387ad98350c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p)
+{
+ return (struct ext4_dir_entry_2 *)((char *)p +
+ ext4_rec_len_from_disk(p->rec_len));
+}
+
+/*
* Future: use high four bits of block for coalesce-on-delete flags
* Mask them off for now.
*/
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
EXT4_DIR_REC_LEN(2) - infosize;
- return 0? 20: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
static inline unsigned dx_node_limit (struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
- return 0? 22: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
/*
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
/*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
-{
- return (struct ext4_dir_entry_2 *)((char *)p +
- ext4_rec_len_from_disk(p->rec_len));
-}
-
-/*
* This function fills a red-black tree with information from a
* directory block. It returns the number directory entries loaded
* into the tree. If there is an error it is returned in err.
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de))
- if (ext4_match (namelen, name, de)) {
- if (!ext4_check_dir_entry("ext4_find_entry",
- dir, de, bh,
- (block<<EXT4_BLOCK_SIZE_BITS(sb))
- +((char *)de - bh->b_data))) {
- brelse (bh);
+ for (; de < top; de = ext4_next_entry(de)) {
+ int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
+ + ((char *) de - bh->b_data);
+
+ if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
+ brelse(bh);
*err = ERR_BAD_DX_DIR;
goto errout;
}
- *res_dir = de;
- dx_release (frames);
- return bh;
+
+ if (ext4_match(namelen, name, de)) {
+ *res_dir = de;
+ dx_release(frames);
+ return bh;
+ }
}
brelse (bh);
/* Check to see if we should continue to search */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9ff7b1c0423..f000fbe2cd9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
/*
+ * We can allocate memory for mb_alloc based on the new group
+ * descriptor
+ */
+ if (test_opt(sb, MBALLOC)) {
+ err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+ if (err)
+ goto exit_journal;
+ }
+ /*
* Make the new blocks and inodes valid next. We do this before
* increasing the group count so that once the group is enabled,
* all of its blocks and inodes are already valid.
@@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
handle_t *handle;
int err;
unsigned long freed_blocks;
+ ext4_group_t group;
+ struct ext4_group_info *grp;
/* We don't need to worry about locking wrt other resizers just
* yet: we're going to revalidate es->s_blocks_count after
@@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
/* Handle the remaining blocks in the last group only. */
- ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
if (last == 0) {
ext4_warning(sb, __func__,
@@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count + add);
if ((err = ext4_journal_stop(handle)))
goto exit_put;
+
+ /*
+ * Mark mballoc pages as not up to date so that they will be updated
+ * next time they are loaded by ext4_mb_load_buddy.
+ */
+ if (test_opt(sb, MBALLOC)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Get the info on the last group */
+ grp = ext4_get_group_info(sb, group);
+
+ /* Update free blocks in group info */
+ ext4_mb_update_group_info(grp, add);
+ }
+
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 02bf2434397..1cb371dcd60 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb)
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
+ kfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
+ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
+ ei->i_delalloc_reserved_flag = 0;
+ spin_lock_init(&(ei->i_block_reservation_lock));
return &ei->vfs_inode;
}
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
kfree(rsv);
+ jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+ &EXT4_I(inode)->jinode);
}
static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
unsigned long def_mount_opts;
struct super_block *sb = vfs->mnt_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- journal_t *journal = sbi->s_journal;
struct ext4_super_block *es = sbi->s_es;
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",nomballoc");
if (test_opt(sb, I_VERSION))
seq_puts(seq, ",i_version");
+ if (!test_opt(sb, DELALLOC))
+ seq_puts(seq, ",nodelalloc");
+
if (sbi->s_stripe)
seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -894,7 +906,7 @@ enum {
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- Opt_mballoc, Opt_nomballoc, Opt_stripe,
+ Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
};
static match_table_t tokens = {
@@ -953,6 +965,8 @@ static match_table_t tokens = {
{Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
+ {Opt_delalloc, "delalloc"},
+ {Opt_nodelalloc, "nodelalloc"},
{Opt_err, NULL},
};
@@ -990,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb,
int qtype, qfmt;
char *qname;
#endif
+ ext4_fsblk_t last_block;
if (!options)
return 1;
@@ -1309,15 +1324,39 @@ set_qf_format:
clear_opt(sbi->s_mount_opt, NOBH);
break;
case Opt_extents:
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_warning(sb, __func__,
+ "extents feature not enabled "
+ "on this filesystem, use tune2fs\n");
+ return 0;
+ }
set_opt (sbi->s_mount_opt, EXTENTS);
break;
case Opt_noextents:
+ /*
+ * When e2fsprogs support resizing an already existing
+ * ext3 file system to greater than 2**32 we need to
+ * add support to block allocator to handle growing
+ * already existing block mapped inode so that blocks
+ * allocated for them fall within 2**32
+ */
+ last_block = ext4_blocks_count(sbi->s_es) - 1;
+ if (last_block > 0xffffffffULL) {
+ printk(KERN_ERR "EXT4-fs: Filesystem too "
+ "large to mount with "
+ "-o noextents options\n");
+ return 0;
+ }
clear_opt (sbi->s_mount_opt, EXTENTS);
break;
case Opt_i_version:
set_opt(sbi->s_mount_opt, I_VERSION);
sb->s_flags |= MS_I_VERSION;
break;
+ case Opt_nodelalloc:
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ break;
case Opt_mballoc:
set_opt(sbi->s_mount_opt, MBALLOC);
break;
@@ -1331,6 +1370,9 @@ set_qf_format:
return 0;
sbi->s_stripe = option;
break;
+ case Opt_delalloc:
+ set_opt(sbi->s_mount_opt, DELALLOC);
+ break;
default:
printk (KERN_ERR
"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1443,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
return res;
}
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = NULL;
+ struct buffer_head *bh;
+ ext4_group_t flex_group_count;
+ ext4_group_t flex_group;
+ int groups_per_flex = 0;
+ __u64 block_bitmap = 0;
+ int i;
+
+ if (!sbi->s_es->s_log_groups_per_flex) {
+ sbi->s_log_groups_per_flex = 0;
+ return 1;
+ }
+
+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+ flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
+ groups_per_flex;
+ sbi->s_flex_groups = kmalloc(flex_group_count *
+ sizeof(struct flex_groups), GFP_KERNEL);
+ if (sbi->s_flex_groups == NULL) {
+ printk(KERN_ERR "EXT4-fs: not enough memory\n");
+ goto failed;
+ }
+ memset(sbi->s_flex_groups, 0, flex_group_count *
+ sizeof(struct flex_groups));
+
+ gdp = ext4_get_group_desc(sb, 1, &bh);
+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
+
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ gdp = ext4_get_group_desc(sb, i, &bh);
+
+ flex_group = ext4_flex_group(sbi, i);
+ sbi->s_flex_groups[flex_group].free_inodes +=
+ le16_to_cpu(gdp->bg_free_inodes_count);
+ sbi->s_flex_groups[flex_group].free_blocks +=
+ le16_to_cpu(gdp->bg_free_blocks_count);
+ }
+
+ return 1;
+failed:
+ return 0;
+}
+
__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
struct ext4_group_desc *gdp)
{
@@ -1810,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
}
static int ext4_fill_super (struct super_block *sb, void *data, int silent)
- __releases(kernel_sem)
- __acquires(kernel_sem)
+ __releases(kernel_lock)
+ __acquires(kernel_lock)
{
struct buffer_head * bh;
@@ -1851,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
goto out_fail;
}
- if (!sb_set_blocksize(sb, blocksize)) {
- printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
- goto out_fail;
- }
-
/*
* The ext4 superblock will not be buffer aligned for other than 1kB
* block sizes. We need to calculate the offset from buffer start.
@@ -1919,15 +2004,28 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
/*
* turn on extents feature by default in ext4 filesystem
- * User -o noextents to turn it off
+ * only if feature flag already set by mkfs or tune2fs.
+ * Use -o noextents to turn it off
*/
- set_opt(sbi->s_mount_opt, EXTENTS);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ set_opt(sbi->s_mount_opt, EXTENTS);
+ else
+ ext4_warning(sb, __func__,
+ "extents feature not enabled on this filesystem, "
+ "use tune2fs.\n");
/*
- * turn on mballoc feature by default in ext4 filesystem
- * User -o nomballoc to turn it off
+ * turn on mballoc code by default in ext4 filesystem
+ * Use -o nomballoc to turn it off
*/
set_opt(sbi->s_mount_opt, MBALLOC);
+ /*
+ * enable delayed allocation by default
+ * Use -o nodelalloc to turn it off
+ */
+ set_opt(sbi->s_mount_opt, DELALLOC);
+
+
if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
NULL, 0))
goto failed_mount;
@@ -2138,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
goto failed_mount2;
}
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (!ext4_fill_flex_info(sb)) {
+ printk(KERN_ERR
+ "EXT4-fs: unable to initialize "
+ "flex_bg meta info!\n");
+ goto failed_mount2;
+ }
+
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
@@ -2358,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+ "requested data journaling mode\n");
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ } else if (test_opt(sb, DELALLOC))
+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
ext4_ext_init(sb);
ext4_mb_init(sb, needs_recovery);
@@ -2372,6 +2485,7 @@ cantfind_ext4:
failed_mount4:
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
failed_mount3:
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -3325,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
err = ext4_journal_dirty_metadata(handle, bh);
else {
/* Always do at least ordered writes for quotas */
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
brelse(bh);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ff08633f398..93c5fdcdad2 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,7 +810,7 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_block(handle, inode,
+ ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
goal, &error);
if (error)
goto cleanup;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cad..ac1a52cf2a3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
-
static size_t
ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4..d91aa61b42a 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_USER_PREFIX "user."
-
static size_t
ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022c..91389c8aee8 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
- J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7cee..f8b3be87322 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
}
/*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * When an ext4 file is truncated, it is possible that some pages are not
+ * successfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
}
/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
- * return 0. j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
- if (!jbd_trylock_bh_state(bh)) {
- spin_unlock(&journal->j_list_lock);
- schedule();
- return 0;
- }
- return 1;
-}
-
-/*
* Done it all: now submit the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
* mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
struct buffer_head *bh;
int ret;
int barrier_done = 0;
+ struct timespec now = current_kernel_time();
if (is_journal_aborted(journal))
return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+ tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+ tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
if (JBD2_HAS_COMPAT_FEATURE(journal,
JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
}
/*
- * Wait for all submitted IO to complete.
+ * write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with dealyed allocation we may be doing
+ * block allocation in writepages().
*/
-static int journal_wait_on_locked_list(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
- int ret = 0;
- struct journal_head *jh;
-
- while (commit_transaction->t_locked_list) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_locked_list->b_tprev;
- bh = jh2bh(jh);
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- ret = -EIO;
- spin_lock(&journal->j_list_lock);
- }
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
- spin_lock(&journal->j_list_lock);
- continue;
- }
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- jbd2_journal_remove_journal_head(bh);
- put_bh(bh);
- } else {
- jbd_unlock_bh_state(bh);
- }
- put_bh(bh);
- cond_resched_lock(&journal->j_list_lock);
- }
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = mapping->nrpages * 2,
+ .range_start = 0,
+ .range_end = i_size_read(mapping->host),
+ .for_writepages = 1,
+ };
+
+ ret = generic_writepages(mapping, &wbc);
return ret;
- }
+}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
+ * operate on from being released while we write out pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- int i;
+ struct jbd2_inode *jinode;
+ int err, ret = 0;
+ struct address_space *mapping;
- for (i = 0; i < bufs; i++) {
- wbuf[i]->b_end_io = end_buffer_write_sync;
- /* We use-up our safety reference in submit_bh() */
- submit_bh(WRITE, wbuf[i]);
+ spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ mapping = jinode->i_vfs_inode->i_mapping;
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ /*
+ * submit the inode data buffers. We use writepage
+ * instead of writepages. Because writepages can do
+ * block allocation with delalloc. We need to write
+ * only allocated blocks here.
+ */
+ err = journal_submit_inode_data_buffers(mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ J_ASSERT(jinode->i_transaction == commit_transaction);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
+ spin_unlock(&journal->j_list_lock);
+ return ret;
}
/*
- * Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
*/
-static void journal_submit_data_buffers(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_finish_inode_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- struct journal_head *jh;
- struct buffer_head *bh;
- int locked;
- int bufs = 0;
- struct buffer_head **wbuf = journal->j_wbuf;
+ struct jbd2_inode *jinode, *next_i;
+ int err, ret = 0;
- /*
- * Whenever we unlock the journal and sleep, things can get added
- * onto ->t_sync_datalist, so we have to keep looping back to
- * write_out_data until we *know* that the list is empty.
- *
- * Cleanup any flushed data buffers from the data list. Even in
- * abort mode, we want to flush this out as soon as possible.
- */
-write_out_data:
- cond_resched();
+ /* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ }
- while (commit_transaction->t_sync_datalist) {
- jh = commit_transaction->t_sync_datalist;
- bh = jh2bh(jh);
- locked = 0;
-
- /* Get reference just to make sure buffer does not disappear
- * when we are forced to drop various locks */
- get_bh(bh);
- /* If the buffer is dirty, we need to submit IO and hence
- * we need the buffer lock. We try to lock the buffer without
- * blocking. If we fail, we need to drop j_list_lock and do
- * blocking lock_buffer().
- */
- if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
- BUFFER_TRACE(bh, "needs blocking lock");
- spin_unlock(&journal->j_list_lock);
- /* Write out all data to prevent deadlocks */
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- lock_buffer(bh);
- spin_lock(&journal->j_list_lock);
- }
- locked = 1;
- }
- /* We have to get bh_state lock. Again out of order, sigh. */
- if (!inverted_lock(journal, bh)) {
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- }
- /* Someone already cleaned up the buffer? */
- if (!buffer_jbd(bh)
- || jh->b_transaction != commit_transaction
- || jh->b_jlist != BJ_SyncData) {
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "already cleaned up");
- put_bh(bh);
- continue;
- }
- if (locked && test_clear_buffer_dirty(bh)) {
- BUFFER_TRACE(bh, "needs writeout, adding to array");
- wbuf[bufs++] = bh;
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- if (bufs == journal->j_wbufsize) {
- spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- goto write_out_data;
- }
- } else if (!locked && buffer_locked(bh)) {
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- put_bh(bh);
+ /* Now refile inode to proper lists */
+ list_for_each_entry_safe(jinode, next_i,
+ &commit_transaction->t_inode_list, i_list) {
+ list_del(&jinode->i_list);
+ if (jinode->i_next_transaction) {
+ jinode->i_transaction = jinode->i_next_transaction;
+ jinode->i_next_transaction = NULL;
+ list_add(&jinode->i_list,
+ &jinode->i_transaction->t_inode_list);
} else {
- BUFFER_TRACE(bh, "writeout complete: unfile");
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- jbd2_journal_remove_journal_head(bh);
- /* Once for our safety reference, once for
- * jbd2_journal_remove_journal_head() */
- put_bh(bh);
- put_bh(bh);
- }
-
- if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
+ jinode->i_transaction = NULL;
}
}
spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
+
+ return ret;
}
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
- err = 0;
- journal_submit_data_buffers(journal, commit_transaction);
-
- /*
- * Wait for all previously submitted IO to complete if commit
- * record is to be written synchronously.
- */
- spin_lock(&journal->j_list_lock);
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
-
- spin_unlock(&journal->j_list_lock);
-
+ err = journal_submit_data_buffers(journal, commit_transaction);
if (err)
jbd2_journal_abort(journal, err);
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD: commit phase 2\n");
/*
- * If we found any dirty or locked buffers, then we should have
- * looped back up to the write_out_data label. If there weren't
- * any then journal_clean_data_list should have wiped the list
- * clean by now, so check that it is in fact empty.
- */
- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
- jbd_debug (3, "JBD: commit phase 3\n");
-
- /*
* Way to go: we have now written out all of the data for a
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
+ err = 0;
descriptor = NULL;
bufs = 0;
while (commit_transaction->t_buffers) {
@@ -748,15 +660,19 @@ start_journal_io:
&cbh, crc32_sum);
if (err)
__jbd2_journal_abort_hard(journal);
-
- spin_lock(&journal->j_list_lock);
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
- spin_unlock(&journal->j_list_lock);
- if (err)
- __jbd2_journal_abort_hard(journal);
}
+ /*
+ * This is the right place to wait for data buffers both for ASYNC
+ * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+ * the commit block went to disk (which happens above). If commit is
+ * SYNC, we need to wait for data buffers before we start writing
+ * commit block, which happens below in such setting.
+ */
+ err = journal_finish_inode_data_buffers(journal, commit_transaction);
+ if (err)
+ jbd2_journal_abort(journal, err);
+
/* Lo and behold: we have just managed to send a transaction to
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
@@ -768,7 +684,7 @@ start_journal_io:
so we incur less scheduling load.
*/
- jbd_debug(3, "JBD: commit phase 4\n");
+ jbd_debug(3, "JBD: commit phase 3\n");
/*
* akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +743,7 @@ wait_for_iobuf:
J_ASSERT (commit_transaction->t_shadow_list == NULL);
- jbd_debug(3, "JBD: commit phase 5\n");
+ jbd_debug(3, "JBD: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
@@ -854,7 +770,7 @@ wait_for_iobuf:
/* AKPM: bforget here */
}
- jbd_debug(3, "JBD: commit phase 6\n");
+ jbd_debug(3, "JBD: commit phase 5\n");
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +790,9 @@ wait_for_iobuf:
transaction can be removed from any checkpoint list it was on
before. */
- jbd_debug(3, "JBD: commit phase 7\n");
+ jbd_debug(3, "JBD: commit phase 6\n");
- J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -997,7 +913,7 @@ restart_loop:
/* Done with this transaction! */
- jbd_debug(3, "JBD: commit phase 8\n");
+ jbd_debug(3, "JBD: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a7..b26c6d9fe6a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
-EXPORT_SYMBOL(jbd2_journal_dirty_data);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_release_buffer);
EXPORT_SYMBOL(jbd2_journal_forget);
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
+EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
}
/*
+ * Initialize jbd inode head
+ */
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
+{
+ jinode->i_transaction = NULL;
+ jinode->i_next_transaction = NULL;
+ jinode->i_vfs_inode = inode;
+ jinode->i_flags = 0;
+ INIT_LIST_HEAD(&jinode->i_list);
+}
+
+/*
+ * Function to be called before we start removing inode from memory (i.e.,
+ * clear_inode() is a fine place to be called from). It removes inode from
+ * transaction's lists.
+ */
+void jbd2_journal_release_jbd_inode(journal_t *journal,
+ struct jbd2_inode *jinode)
+{
+ int writeout = 0;
+
+ if (!journal)
+ return;
+restart:
+ spin_lock(&journal->j_list_lock);
+ /* Is commit writing out inode - we have to wait */
+ if (jinode->i_flags & JI_COMMIT_RUNNING) {
+ wait_queue_head_t *wq;
+ DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+ wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&journal->j_list_lock);
+ schedule();
+ finish_wait(wq, &wait.wait);
+ goto restart;
+ }
+
+ /* Do we need to wait for data writeback? */
+ if (journal->j_committing_transaction == jinode->i_transaction)
+ writeout = 1;
+ if (jinode->i_transaction) {
+ list_del(&jinode->i_list);
+ jinode->i_transaction = NULL;
+ }
+ spin_unlock(&journal->j_list_lock);
+}
+
+/*
* debugfs tunables
*/
#ifdef CONFIG_JBD2_DEBUG
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e6780..4f7cadbb19f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
* new transaction and we can't block without protecting against other
* processes trying to touch the journal while it is in transition.
*
- * Called under j_state_lock
*/
static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
+ INIT_LIST_HEAD(&transaction->t_inode_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -943,183 +943,6 @@ out:
}
/**
- * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
- * needs to be flushed before we can commit the
- * current transaction.
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- journal_t *journal = handle->h_transaction->t_journal;
- int need_brelse = 0;
- struct journal_head *jh;
-
- if (is_handle_aborted(handle))
- return 0;
-
- jh = jbd2_journal_add_journal_head(bh);
- JBUFFER_TRACE(jh, "entry");
-
- /*
- * The buffer could *already* be dirty. Writeout can start
- * at any time.
- */
- jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-
- /*
- * What if the buffer is already part of a running transaction?
- *
- * There are two cases:
- * 1) It is part of the current running transaction. Refile it,
- * just in case we have allocated it as metadata, deallocated
- * it, then reallocated it as data.
- * 2) It is part of the previous, still-committing transaction.
- * If all we want to do is to guarantee that the buffer will be
- * written to disk before this new transaction commits, then
- * being sure that the *previous* transaction has this same
- * property is sufficient for us! Just leave it on its old
- * transaction.
- *
- * In case (2), the buffer must not already exist as metadata
- * --- that would violate write ordering (a transaction is free
- * to write its data at any point, even before the previous
- * committing transaction has committed). The caller must
- * never, ever allow this to happen: there's nothing we can do
- * about it in this layer.
- */
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
-
- /* Now that we have bh_state locked, are we really still mapped? */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
- goto no_journal;
- }
-
- if (jh->b_transaction) {
- JBUFFER_TRACE(jh, "has transaction");
- if (jh->b_transaction != handle->h_transaction) {
- JBUFFER_TRACE(jh, "belongs to older transaction");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
-
- /* @@@ IS THIS TRUE ? */
- /*
- * Not any more. Scenario: someone does a write()
- * in data=journal mode. The buffer's transaction has
- * moved into commit. Then someone does another
- * write() to the file. We do the frozen data copyout
- * and set b_next_transaction to point to j_running_t.
- * And while we're in that state, someone does a
- * writepage() in an attempt to pageout the same area
- * of the file via a shared mapping. At present that
- * calls jbd2_journal_dirty_data(), and we get right here.
- * It may be too late to journal the data. Simply
- * falling through to the next test will suffice: the
- * data will be dirty and wil be checkpointed. The
- * ordering comments in the next comment block still
- * apply.
- */
- //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-
- /*
- * If we're journalling data, and this buffer was
- * subject to a write(), it could be metadata, forget
- * or shadow against the committing transaction. Now,
- * someone has dirtied the same darn page via a mapping
- * and it is being writepage()'d.
- * We *could* just steal the page from commit, with some
- * fancy locking there. Instead, we just skip it -
- * don't tie the page's buffers to the new transaction
- * at all.
- * Implication: if we crash before the writepage() data
- * is written into the filesystem, recovery will replay
- * the write() data.
- */
- if (jh->b_jlist != BJ_None &&
- jh->b_jlist != BJ_SyncData &&
- jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "Not stealing");
- goto no_journal;
- }
-
- /*
- * This buffer may be undergoing writeout in commit. We
- * can't return from here and let the caller dirty it
- * again because that can cause the write-out loop in
- * commit to never terminate.
- */
- if (buffer_dirty(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- need_brelse = 1;
- sync_dirty_buffer(bh);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- /* Since we dropped the lock... */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "buffer got unmapped");
- goto no_journal;
- }
- /* The buffer may become locked again at any
- time if it is redirtied */
- }
-
- /* journal_clean_data_list() may have got there first */
- if (jh->b_transaction != NULL) {
- JBUFFER_TRACE(jh, "unfile from commit");
- __jbd2_journal_temp_unlink_buffer(jh);
- /* It still points to the committing
- * transaction; move it to this one so
- * that the refile assert checks are
- * happy. */
- jh->b_transaction = handle->h_transaction;
- }
- /* The buffer will be refiled below */
-
- }
- /*
- * Special case --- the buffer might actually have been
- * allocated and then immediately deallocated in the previous,
- * committing transaction, so might still be left on that
- * transaction's metadata lists.
- */
- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "not on correct data list: unfile");
- J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
- __jbd2_journal_temp_unlink_buffer(jh);
- jh->b_transaction = handle->h_transaction;
- JBUFFER_TRACE(jh, "file as data");
- __jbd2_journal_file_buffer(jh, handle->h_transaction,
- BJ_SyncData);
- }
- } else {
- JBUFFER_TRACE(jh, "not on a transaction");
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
- }
-no_journal:
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- if (need_brelse) {
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- }
- JBUFFER_TRACE(jh, "exit");
- jbd2_journal_put_journal_head(jh);
- return 0;
-}
-
-/**
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
- * is holding onto a copy of one of thee pointers, it could go bad.
- * Generally the caller needs to re-read the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
+ * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
+ * of these pointers, it could go bad. Generally the caller needs to re-read
+ * the pointer from the transaction_t.
*
* Called under j_list_lock. The journal may not be locked.
*/
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
switch (jh->b_jlist) {
case BJ_None:
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers--;
J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
goto out;
spin_lock(&journal->j_list_lock);
- if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
- /* A written-back ordered data buffer */
- JBUFFER_TRACE(jh, "release data");
- __jbd2_journal_unfile_buffer(jh);
- jbd2_journal_remove_journal_head(bh);
- __brelse(bh);
- }
- } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+ if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
return;
}
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The later might still hold the
+ * reference count to the buffers when inspecting them on
+ * t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * try to free that buffer.
+ *
+ * Called with journal->j_state_lock hold.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+}
/**
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
*
*
* For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
/*
* We take our own ref against the journal_head here to avoid
* having to add tons of locking around each instance of
- * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+ * jbd2_journal_remove_journal_head() and
+ * jbd2_journal_put_journal_head().
*/
jh = jbd2_journal_grab_journal_head(bh);
if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where jbd2_journal_try_to_free_buffers()
+ * could race with jbd2_journal_commit_transaction(), the later still
+ * holds the reference to the buffers to free while processing them.
+ * try_to_free_buffers() failed to free those buffers. Some of the
+ * caller of releasepage() request page buffers to be dropped, otherwise
+ * treat the fail-to-free as errors (such as generic_file_direct_IO())
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour(i.e make sure buffers are dropped upon return),
+ * let's wait for the current transaction to finish flush of
+ * dirty data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+ jbd2_journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ }
+
busy:
return ret;
}
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
if (!buffer_jbd(bh))
goto zap_buffer_unlocked;
+ /* OK, we have data buffer in journaled mode */
spin_lock(&journal->j_state_lock);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
} else if (transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "on committing transaction");
- if (jh->b_jlist == BJ_Locked) {
- /*
- * The buffer is on the committing transaction's locked
- * list. We have the buffer locked, so I/O has
- * completed. So we can nail the buffer now.
- */
- may_free = __dispose_buffer(jh, transaction);
- goto zap_buffer;
- }
/*
* If it is committing, we simply cannot touch it. We
* can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
J_ASSERT_JH(jh, !jh->b_committed_data);
J_ASSERT_JH(jh, !jh->b_frozen_data);
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers++;
list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
spin_unlock(&journal->j_list_lock);
__brelse(bh);
}
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+
+ if (is_handle_aborted(handle))
+ return -EIO;
+
+ jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+ transaction->t_tid);
+
+ /*
+ * First check whether inode isn't already on the transaction's
+ * lists without taking the lock. Note that this check is safe
+ * without the lock as we cannot race with somebody removing inode
+ * from the transaction. The reason is that we remove inode from the
+ * transaction only in journal_release_jbd_inode() and when we commit
+ * the transaction. We are guarded from the first case by holding
+ * a reference to the inode. We are safe against the second case
+ * because if jinode->i_transaction == transaction, commit code
+ * cannot touch the transaction because we hold reference to it,
+ * and if jinode->i_next_transaction == transaction, commit code
+ * will only file the inode where we want it.
+ */
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ return 0;
+
+ spin_lock(&journal->j_list_lock);
+
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ goto done;
+
+ /* On some different transaction's list - should be
+ * the committing one */
+ if (jinode->i_transaction) {
+ J_ASSERT(jinode->i_next_transaction == NULL);
+ J_ASSERT(jinode->i_transaction ==
+ journal->j_committing_transaction);
+ jinode->i_next_transaction = transaction;
+ goto done;
+ }
+ /* Not on any transaction list... */
+ J_ASSERT(!jinode->i_next_transaction);
+ jinode->i_transaction = transaction;
+ list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+ spin_unlock(&journal->j_list_lock);
+
+ return 0;
+}
+
+/*
+ * This function must be called when inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of truncated part in
+ * case it is in the committing transaction so that we stand to ordered
+ * mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+ loff_t new_size)
+{
+ journal_t *journal;
+ transaction_t *commit_trans;
+ int ret = 0;
+
+ if (!inode->i_transaction && !inode->i_next_transaction)
+ goto out;
+ journal = inode->i_transaction->t_journal;
+ spin_lock(&journal->j_state_lock);
+ commit_trans = journal->j_committing_transaction;
+ spin_unlock(&journal->j_state_lock);
+ if (inode->i_transaction == commit_trans) {
+ ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+ new_size, LLONG_MAX);
+ if (ret)
+ jbd2_journal_abort(journal, ret);
+ }
+out:
+ return ret;
+}
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3873a..dbcc7af76a1 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
bio_put(bio);
}
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
bio->bi_end_io = mpage_end_io_read;
if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
submit_bio(rw, bio);
return NULL;
}
+EXPORT_SYMBOL(mpage_bio_submit);
static struct bio *
mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
* written, so it can intelligently allocate a suitably-sized BIO. For now,
* just allocate full-size (16-page) BIOs.
*/
-struct mpage_data {
- struct bio *bio;
- sector_t last_block_in_bio;
- get_block_t *get_block;
- unsigned use_writepage;
-};
-static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
{
struct mpage_data *mpd = data;
struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
mpd->bio = bio;
return ret;
}
+EXPORT_SYMBOL(__mpage_writepage);
/**
* mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/include/linux/fs.h b/include/linux/fs.h
index faac13e2cc5..52e510a0aec 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1740,6 +1740,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping,
pgoff_t start, pgoff_t end);
extern int __filemap_fdatawrite_range(struct address_space *mapping,
loff_t start, loff_t end, int sync_mode);
+extern int filemap_fdatawrite_range(struct address_space *mapping,
+ loff_t start, loff_t end);
extern long do_fsync(struct file *file, int datasync);
extern void sync_supers(void);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index d147f0f9036..3dd20900709 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -168,6 +168,8 @@ struct commit_header {
unsigned char h_chksum_size;
unsigned char h_padding[2];
__be32 h_chksum[JBD2_CHECKSUM_BYTES];
+ __be64 h_commit_sec;
+ __be32 h_commit_nsec;
};
/*
@@ -379,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
bit_spin_unlock(BH_JournalHead, &bh->b_state);
}
+/* Flags in jbd_inode->i_flags */
+#define __JI_COMMIT_RUNNING 0
+/* Commit of the inode data in progress. We use this flag to protect us from
+ * concurrent deletion of inode. We cannot use reference to inode for this
+ * since we cannot afford doing last iput() on behalf of kjournald
+ */
+#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
+
+/**
+ * struct jbd_inode is the structure linking inodes in ordered mode
+ * present in a transaction so that we can sync them during commit.
+ */
+struct jbd2_inode {
+ /* Which transaction does this inode belong to? Either the running
+ * transaction or the committing one. [j_list_lock] */
+ transaction_t *i_transaction;
+
+ /* Pointer to the running transaction modifying inode's data in case
+ * there is already a committing transaction touching it. [j_list_lock] */
+ transaction_t *i_next_transaction;
+
+ /* List of inodes in the i_transaction [j_list_lock] */
+ struct list_head i_list;
+
+ /* VFS inode this inode belongs to [constant during the lifetime
+ * of the structure] */
+ struct inode *i_vfs_inode;
+
+ /* Flags of inode [j_list_lock] */
+ unsigned int i_flags;
+};
+
struct jbd2_revoke_table_s;
/**
@@ -509,24 +543,12 @@ struct transaction_s
struct journal_head *t_reserved_list;
/*
- * Doubly-linked circular list of all buffers under writeout during
- * commit [j_list_lock]
- */
- struct journal_head *t_locked_list;
-
- /*
* Doubly-linked circular list of all metadata buffers owned by this
* transaction [j_list_lock]
*/
struct journal_head *t_buffers;
/*
- * Doubly-linked circular list of all data buffers still to be
- * flushed before this transaction can be committed [j_list_lock]
- */
- struct journal_head *t_sync_datalist;
-
- /*
* Doubly-linked circular list of all forget buffers (superseded
* buffers which we can un-checkpoint once this transaction commits)
* [j_list_lock]
@@ -565,6 +587,12 @@ struct transaction_s
struct journal_head *t_log_list;
/*
+ * List of inodes whose data we've modified in data=ordered mode.
+ * [j_list_lock]
+ */
+ struct list_head t_inode_list;
+
+ /*
* Protects info related to handles
*/
spinlock_t t_handle_lock;
@@ -1004,7 +1032,6 @@ extern int jbd2_journal_extend (handle_t *, int nblocks);
extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
-extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *);
extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *);
extern int jbd2_journal_forget (handle_t *, struct buffer_head *);
@@ -1044,6 +1071,10 @@ extern void jbd2_journal_ack_err (journal_t *);
extern int jbd2_journal_clear_err (journal_t *);
extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
extern int jbd2_journal_force_commit(journal_t *);
+extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
+extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
+extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
+extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
/*
* journal_head management
@@ -1179,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal)
/* journaling buffer types */
#define BJ_None 0 /* Not journaled */
-#define BJ_SyncData 1 /* Normal data: flush before commit */
-#define BJ_Metadata 2 /* Normal journaled metadata */
-#define BJ_Forget 3 /* Buffer superseded by this transaction */
-#define BJ_IO 4 /* Buffer is for temporary IO use */
-#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */
-#define BJ_LogCtl 6 /* Buffer contains log descriptors */
-#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
-#define BJ_Locked 8 /* Locked for I/O during commit */
-#define BJ_Types 9
+#define BJ_Metadata 1 /* Normal journaled metadata */
+#define BJ_Forget 2 /* Buffer superseded by this transaction */
+#define BJ_IO 3 /* Buffer is for temporary IO use */
+#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */
+#define BJ_LogCtl 5 /* Buffer contains log descriptors */
+#define BJ_Reserved 6 /* Buffer is reserved for access by journal */
+#define BJ_Types 7
extern int jbd_blocks_per_page(struct inode *inode);
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index 068a0c9946a..5c42821da2d 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -11,11 +11,21 @@
*/
#ifdef CONFIG_BLOCK
+struct mpage_data {
+ struct bio *bio;
+ sector_t last_block_in_bio;
+ get_block_t *get_block;
+ unsigned use_writepage;
+};
+
struct writeback_control;
+struct bio *mpage_bio_submit(int rw, struct bio *bio);
int mpage_readpages(struct address_space *mapping, struct list_head *pages,
unsigned nr_pages, get_block_t get_block);
int mpage_readpage(struct page *page, get_block_t get_block);
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+ void *data);
int mpage_writepages(struct address_space *mapping,
struct writeback_control *wbc, get_block_t get_block);
int mpage_writepage(struct page *page, get_block_t *get_block,
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccdfc11..20838883535 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
void percpu_counter_destroy(struct percpu_counter *fbc);
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
-s64 __percpu_counter_sum(struct percpu_counter *fbc);
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
@@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
- s64 ret = __percpu_counter_sum(fbc);
+ s64 ret = __percpu_counter_sum(fbc, 0);
return ret < 0 ? 0 : ret;
}
+static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
+{
+ return __percpu_counter_sum(fbc, 1);
+}
+
+
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
- return __percpu_counter_sum(fbc);
+ return __percpu_counter_sum(fbc, 0);
}
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index bd91987c065..12b15c561a1 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,6 +63,7 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
+ unsigned range_cont:1;
};
/*
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 119174494cb..4a8ba4bf5f6 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
* Add up all the per-cpu counts, return the result. This is a more accurate
* but much slower version of percpu_counter_read_positive()
*/
-s64 __percpu_counter_sum(struct percpu_counter *fbc)
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
{
s64 ret;
int cpu;
@@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
for_each_online_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
ret += *pcount;
+ if (set)
+ *pcount = 0;
}
+ if (set)
+ fbc->count = ret;
+
spin_unlock(&fbc->lock);
return ret;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6a7d34874..65d9d9e2b75 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_fdatawrite);
-static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end)
{
return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
+EXPORT_SYMBOL(filemap_fdatawrite_range);
/**
* filemap_flush - mostly a non-blocking flush
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b38f700825f..94c6d8988ab 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -960,6 +960,9 @@ retry:
}
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
+
+ if (wbc->range_cont)
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);