From e219cca082f52e7dfea41f3be264b7b5eb204227 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 6 Nov 2008 22:37:59 -0500 Subject: jbd: don't give up looking for space so easily in __log_wait_for_space Commit be07c4ed introducd a regression because it assumed that if there were no transactions ready to be checkpointed, that no progress could be made on making space available in the journal, and so the journal should be aborted. This assumption is false; it could be the case that simply calling cleanup_journal_tail() will recover the necessary space, or, for small journals, the currently committing transaction could be responsible for chewing up the required space in the log, so we need to wait for the currently committing transaction to finish before trying to force a checkpoint operation. This patch fixes the bug reported by Meelis Roos at: http://bugzilla.kernel.org/show_bug.cgi?id=11937 Signed-off-by: "Theodore Ts'o" Cc: Duane Griffin Cc: Toshiyuki Okajima --- fs/jbd/checkpoint.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 1bd8d4acc6f..61f32f3868c 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -115,7 +115,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh) */ void __log_wait_for_space(journal_t *journal) { - int nblocks; + int nblocks, space_left; assert_spin_locked(&journal->j_state_lock); nblocks = jbd_space_needed(journal); @@ -128,25 +128,42 @@ void __log_wait_for_space(journal_t *journal) /* * Test again, another process may have checkpointed while we * were waiting for the checkpoint lock. If there are no - * outstanding transactions there is nothing to checkpoint and - * we can't make progress. Abort the journal in this case. + * transactions ready to be checkpointed, try to recover + * journal space by calling cleanup_journal_tail(), and if + * that doesn't work, by waiting for the currently committing + * transaction to complete. If there is absolutely no way + * to make progress, this is either a BUG or corrupted + * filesystem, so abort the journal and leave a stack + * trace for forensic evidence. */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); - if (__log_space_left(journal) < nblocks) { + space_left = __log_space_left(journal); + if (space_left < nblocks) { int chkpt = journal->j_checkpoint_transactions != NULL; + tid_t tid = 0; + if (journal->j_committing_transaction) + tid = journal->j_committing_transaction->t_tid; spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_state_lock); if (chkpt) { log_do_checkpoint(journal); + } else if (cleanup_journal_tail(journal) == 0) { + /* We were able to recover space; yay! */ + ; + } else if (tid) { + log_wait_commit(journal, tid); } else { - printk(KERN_ERR "%s: no transactions\n", - __func__); + printk(KERN_ERR "%s: needed %d blocks and " + "only had %d space available\n", + __func__, nblocks, space_left); + printk(KERN_ERR "%s: no way to get more " + "journal space\n", __func__); + WARN_ON(1); journal_abort(journal, 0); } - spin_lock(&journal->j_state_lock); } else { spin_unlock(&journal->j_list_lock); -- cgit v1.2.3 From 8c3f25d8950c3e9fe6c9849f88679b3f2a071550 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 6 Nov 2008 22:38:07 -0500 Subject: jbd2: don't give up looking for space so easily in __jbd2_log_wait_for_space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 23f8b79e introducd a regression because it assumed that if there were no transactions ready to be checkpointed, that no progress could be made on making space available in the journal, and so the journal should be aborted. This assumption is false; it could be the case that simply calling jbd2_cleanup_journal_tail() will recover the necessary space, or, for small journals, the currently committing transaction could be responsible for chewing up the required space in the log, so we need to wait for the currently committing transaction to finish before trying to force a checkpoint operation. This patch fixes a bug reported by Mihai Harpau at: https://bugzilla.redhat.com/show_bug.cgi?id=469582 This patch fixes a bug reported by François Valenduc at: http://bugzilla.kernel.org/show_bug.cgi?id=11840 Signed-off-by: "Theodore Ts'o" Cc: Duane Griffin Cc: Toshiyuki Okajima --- fs/jbd2/checkpoint.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 9203c3332f1..9497718fe92 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -116,7 +116,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh) */ void __jbd2_log_wait_for_space(journal_t *journal) { - int nblocks; + int nblocks, space_left; assert_spin_locked(&journal->j_state_lock); nblocks = jbd_space_needed(journal); @@ -129,25 +129,43 @@ void __jbd2_log_wait_for_space(journal_t *journal) /* * Test again, another process may have checkpointed while we * were waiting for the checkpoint lock. If there are no - * outstanding transactions there is nothing to checkpoint and - * we can't make progress. Abort the journal in this case. + * transactions ready to be checkpointed, try to recover + * journal space by calling cleanup_journal_tail(), and if + * that doesn't work, by waiting for the currently committing + * transaction to complete. If there is absolutely no way + * to make progress, this is either a BUG or corrupted + * filesystem, so abort the journal and leave a stack + * trace for forensic evidence. */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); - if (__jbd2_log_space_left(journal) < nblocks) { + space_left = __jbd2_log_space_left(journal); + if (space_left < nblocks) { int chkpt = journal->j_checkpoint_transactions != NULL; + tid_t tid = 0; + if (journal->j_committing_transaction) + tid = journal->j_committing_transaction->t_tid; spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_state_lock); if (chkpt) { jbd2_log_do_checkpoint(journal); + } else if (jbd2_cleanup_journal_tail(journal) == 0) { + /* We were able to recover space; yay! */ + ; + } else if (tid) { + jbd2_log_wait_commit(journal, tid); } else { - printk(KERN_ERR "%s: no transactions\n", - __func__); + printk(KERN_ERR "%s: needed %d blocks and " + "only had %d space available\n", + __func__, nblocks, space_left); + printk(KERN_ERR "%s: no way to get more " + "journal space in %s\n", __func__, + journal->j_devname); + WARN_ON(1); jbd2_journal_abort(journal, 0); } - spin_lock(&journal->j_state_lock); } else { spin_unlock(&journal->j_list_lock); -- cgit v1.2.3 From 2423840ded13e6d3b52d88aff8d033bb78fafd08 Mon Sep 17 00:00:00 2001 From: Sami Liedes Date: Sun, 2 Nov 2008 19:23:30 -0500 Subject: jbd2: deregister proc on failure in jbd2_journal_init_inode jbd2_journal_init_inode() does not call jbd2_stats_proc_exit() on all failure paths after calling jbd2_stats_proc_init(). This leaves dangling references to the fs in proc. This patch fixes a bug reported by Sami Leides at: http://bugzilla.kernel.org/show_bug.cgi?id=11493 Signed-off-by: Sami Liedes Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 783de118de9..e70d657a19f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1089,6 +1089,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", __func__); + jbd2_stats_proc_exit(journal); kfree(journal); return NULL; } @@ -1098,6 +1099,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) if (err) { printk(KERN_ERR "%s: Cannnot locate journal superblock\n", __func__); + jbd2_stats_proc_exit(journal); kfree(journal); return NULL; } -- cgit v1.2.3 From ae2d9fb18e575ed37ffc241ece4bf68f0be4ae32 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 4 Nov 2008 09:10:50 -0500 Subject: ext4: fix missing ext4_unlock_group in error path If we try to free a block which is already freed, the code was returning without first unlocking the group. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dfe17a13405..444ad998f72 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4441,6 +4441,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, else if (block >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else { + ext4_unlock_group(sb, group); ext4_error(sb, __func__, "Double free of blocks %d (%d %d)\n", block, entry->start_blk, entry->count); -- cgit v1.2.3 From d94e99a64c3beece22dbfb2b335771a59184eb0a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 4 Nov 2008 09:11:26 -0500 Subject: ext4: Convert to host order before using the values. Use le16_to_cpu to read the s_reserved_gdt_blocks values from super block. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 994859df010..e27acd18b4b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1458,9 +1458,8 @@ static int ext4_fill_flex_info(struct super_block *sb) /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + - ((sbi->s_es->s_reserved_gdt_blocks +1 ) << - EXT4_DESC_PER_BLOCK_BITS(sb))) / - groups_per_flex; + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << + EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; sbi->s_flex_groups = kzalloc(flex_group_count * sizeof(struct flex_groups), GFP_KERNEL); if (sbi->s_flex_groups == NULL) { -- cgit v1.2.3 From 14ce0cb411c88681ab8f3a4c9caa7f42e97a3184 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 3 Nov 2008 18:10:55 -0500 Subject: ext4: wait on all pending commits in ext4_sync_fs() In ext4_sync_fs, we only wait for a commit to finish if we started it, but there may be one already in progress which will not be synced. In the case of a data=ordered umount with pending long symlinks which are delayed due to a long list of other I/O on the backing block device, this causes the buffer associated with the long symlinks to not be moved to the inode dirty list in the second phase of fsync_super. Then, before they can be dirtied again, kjournald exits, seeing the UMOUNT flag and the dirty pages are never written to the backing block device, causing long symlink corruption and exposing new or previously freed block data to userspace. To ensure all commits are synced, we flush all journal commits now when sync_fs'ing ext4. Signed-off-by: Arthur Jones Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" Cc: Eric Sandeen Cc: --- fs/ext4/super.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e27acd18b4b..e4a241c65db 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2884,12 +2884,9 @@ int ext4_force_commit(struct super_block *sb) /* * Ext4 always journals updates to the superblock itself, so we don't * have to propagate any other updates to the superblock on disk at this - * point. Just start an async writeback to get the buffers on their way - * to the disk. - * - * This implicitly triggers the writebehind on sync(). + * point. (We can probably nuke this function altogether, and remove + * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...) */ - static void ext4_write_super(struct super_block *sb) { if (mutex_trylock(&sb->s_lock) != 0) @@ -2899,15 +2896,15 @@ static void ext4_write_super(struct super_block *sb) static int ext4_sync_fs(struct super_block *sb, int wait) { - tid_t target; + int ret = 0; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { - if (wait) - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); - } - return 0; + if (wait) + ret = ext4_force_commit(sb); + else + jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); + return ret; } /* -- cgit v1.2.3 From ac51d83705c2a38c71f39cde99708b14e6212a60 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 6 Nov 2008 16:49:36 -0500 Subject: ext4: calculate journal credits correctly This fixes a 2.6.27 regression which was introduced in commit a02908f1. We weren't passing the chunk parameter down to the two subections, ext4_indirect_trans_blocks() and ext4_ext_index_trans_blocks(), with the result that massively overestimate the amount of credits needed by ext4_da_writepages, especially in the non-extents case. This causes failures especially on /boot partitions, which tend to be small and non-extent using since GRUB doesn't handle extents. This patch fixes the bug reported by Joseph Fannin at: http://bugzilla.kernel.org/show_bug.cgi?id=11964 Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8dbf6953845..5a130b56f1c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4580,9 +4580,10 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) - return ext4_indirect_trans_blocks(inode, nrblocks, 0); - return ext4_ext_index_trans_blocks(inode, nrblocks, 0); + return ext4_indirect_trans_blocks(inode, nrblocks, chunk); + return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); } + /* * Account for index blocks, block groups bitmaps and block group * descriptor blocks if modify datablocks and index blocks -- cgit v1.2.3 From ed9b3e3379731e9f9d2f73f3d7fd9e7d2ce3df4a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 7 Nov 2008 09:06:45 -0500 Subject: ext4: Mark the buffer_heads as dirty and uptodate after prepare_write We need to make sure we mark the buffer_heads as dirty and uptodate so that block_write_full_page write them correctly. This fixes mmap corruptions that can occur in low memory situations. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5a130b56f1c..be21a5ae33c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2329,6 +2329,8 @@ static int ext4_da_writepage(struct page *page, unlock_page(page); return 0; } + /* now mark the buffer_heads as dirty and uptodate */ + block_commit_write(page, 0, PAGE_CACHE_SIZE); } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) -- cgit v1.2.3 From 23712a9c28b9f80a8cf70c8490358d5f562d2465 Mon Sep 17 00:00:00 2001 From: Frederic Bohe Date: Fri, 7 Nov 2008 09:21:01 -0500 Subject: ext4: add checksum calculation when clearing UNINIT flag in ext4_new_inode When initializing an uninitialized block group in ext4_new_inode(), its block group checksum must be re-calculated. This fixes a race when several threads try to allocate a new inode in an UNINIT'd group. There is some question whether we need to be initializing the block bitmap in ext4_new_inode() at all, but for now, if we are going to init the block group, let's eliminate the race. Signed-off-by: Frederic Bohe Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index fe34d74cfb1..2a117e286e5 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -718,6 +718,8 @@ got: gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); free = ext4_free_blocks_after_init(sb, group, gdp); gdp->bg_free_blocks_count = cpu_to_le16(free); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, + gdp); } spin_unlock(sb_bgl_lock(sbi, group)); -- cgit v1.2.3