From f73953c0656f2db9073c585c4df2884a8ecd101e Mon Sep 17 00:00:00 2001 From: Thiemo Nagel Date: Tue, 7 Apr 2009 18:46:47 -0400 Subject: ext4: Fix big-endian problem in __ext4_check_blockref() Commit fe2c8191 introduced a regression on big-endian system, because the checks to make sure block references in non-extent inodes are valid failed to use le32_to_cpu(). Reported-by: Alexander Beregalov Signed-off-by: Thiemo Nagel Tested-by: Alexander Beregalov Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a2e7952bc5f..c6bd6ced3bb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -372,16 +372,16 @@ static int ext4_block_to_path(struct inode *inode, } static int __ext4_check_blockref(const char *function, struct inode *inode, - unsigned int *p, unsigned int max) { + __le32 *p, unsigned int max) { unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); - unsigned int *bref = p; + __le32 *bref = p; while (bref < p+max) { - if (unlikely(*bref >= maxblocks)) { + if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { ext4_error(inode->i_sb, function, "block reference %u >= max (%u) " "in inode #%lu, offset=%d", - *bref, maxblocks, + le32_to_cpu(*bref), maxblocks, inode->i_ino, (int)(bref-p)); return -EIO; } -- cgit v1.2.3 From e44543b83bf4ab84dc6bd5b88158c78b1ed1c208 Mon Sep 17 00:00:00 2001 From: Thiemo Nagel Date: Sat, 4 Apr 2009 23:30:44 -0400 Subject: ext4: Fix off-by-one-error in ext4_valid_extent_idx() Signed-off-by: Thiemo Nagel Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ac77d8b8251..6132353dcf6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -342,7 +342,7 @@ static int ext4_valid_extent_idx(struct inode *inode, ext4_fsblk_t block = idx_pblock(ext_idx); struct 
ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - (block > ext4_blocks_count(es)))) + (block >= ext4_blocks_count(es)))) return 0; else return 1; -- cgit v1.2.3 From a6cb767e24b1dbedfcfa8077eab0aa2eab224038 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 6 Apr 2009 18:39:27 +0200 Subject: xfs: validate log feature fields correctly If the large log sector size feature bit is set in the superblock by accident (say disk corruption), then the fields that are now considered valid are not checked on production kernels. The checks are present as ASSERT statements so cause a panic on a debug kernel. Change this so that the fields are validity checked if the feature bit is set and abort the log mount if the fields do not contain valid values. Reported-by: Eric Sesterhenn Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_log.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f76c6d7cea2..8016d304074 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -562,9 +562,8 @@ xfs_log_mount( } mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); - if (!mp->m_log) { - cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!"); - error = ENOMEM; + if (IS_ERR(mp->m_log)) { + error = -PTR_ERR(mp->m_log); goto out; } @@ -1180,10 +1179,13 @@ xlog_alloc_log(xfs_mount_t *mp, xfs_buf_t *bp; int i; int iclogsize; + int error = ENOMEM; log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); - if (!log) - return NULL; + if (!log) { + xlog_warn("XFS: Log allocation failed: No memory!"); + goto out; + } log->l_mp = mp; log->l_targ = log_target; @@ -1201,19 +1203,35 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_grant_reserve_cycle = 1; log->l_grant_write_cycle = 1; + error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; 
- ASSERT(log->l_sectbb_log <= mp->m_sectbb_log); + if (log->l_sectbb_log < 0 || + log->l_sectbb_log > mp->m_sectbb_log) { + xlog_warn("XFS: Log sector size (0x%x) out of range.", + log->l_sectbb_log); + goto out_free_log; + } + /* for larger sector sizes, must have v2 or external log */ - ASSERT(log->l_sectbb_log == 0 || - log->l_logBBstart == 0 || - xfs_sb_version_haslogv2(&mp->m_sb)); - ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT); + if (log->l_sectbb_log != 0 && + (log->l_logBBstart != 0 && + !xfs_sb_version_haslogv2(&mp->m_sb))) { + xlog_warn("XFS: log sector size (0x%x) invalid " + "for configuration.", log->l_sectbb_log); + goto out_free_log; + } + if (mp->m_sb.sb_logsectlog < BBSHIFT) { + xlog_warn("XFS: Log sector log (0x%x) too small.", + mp->m_sb.sb_logsectlog); + goto out_free_log; + } } log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; xlog_get_iclog_buffer_size(mp, log); + error = ENOMEM; bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); if (!bp) goto out_free_log; @@ -1313,7 +1331,8 @@ out_free_iclog: xfs_buf_free(log->l_xbuf); out_free_log: kmem_free(log); - return NULL; +out: + return ERR_PTR(-error); } /* xlog_alloc_log */ -- cgit v1.2.3 From 705db3fd4660174a27418bbcb874d209a76044eb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 6 Apr 2009 18:40:17 +0200 Subject: xfs: fix double free of inode If we fail to initialise the VFS inode in inode_init_always(), it will call ->delete_inode internally resulting in the inode being freed. Hence we need to delay the call to inode_init_always() until after the XFS inode is sufficiently set up to handle a call to ->delete_inode, and then if that fails do not touch the inode again at all as it has been freed. 
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_iget.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 478e587087f..89b81eedce6 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -69,15 +69,6 @@ xfs_inode_alloc( ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - /* - * initialise the VFS inode here to get failures - * out of the way early. - */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) { - kmem_zone_free(xfs_inode_zone, ip); - return NULL; - } - /* initialise the xfs inode */ ip->i_ino = ino; ip->i_mount = mp; @@ -113,6 +104,20 @@ xfs_inode_alloc( #ifdef XFS_DIR2_TRACE ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); #endif + /* + * Now initialise the VFS inode. We do this after the xfs_inode + * initialisation as internal failures will result in ->destroy_inode + * being called and that will pass down through the reclaim path and + * free the XFS inode. This path requires the XFS inode to already be + * initialised. Hence if this call fails, the xfs_inode has already + * been freed and we should not reference it at all in the error + * handling. + */ + if (!inode_init_always(mp->m_super, VFS_I(ip))) + return NULL; + + /* prevent anyone from using this yet */ + VFS_I(ip)->i_state = I_NEW|I_LOCK; return ip; } -- cgit v1.2.3 From c626d174cfe38e7f0545d074c299527892cd8c45 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 6 Apr 2009 18:42:11 +0200 Subject: xfs: prevent unwritten extent conversion from blocking I/O completion Unwritten extent conversion can recurse back into the filesystem due to memory allocation. Memory reclaim requires I/O completions to be processed to allow the callers to make progress. If the I/O completion workqueue thread is doing the recursion, then we have a deadlock situation. 
Move unwritten extent completion into its own workqueue so it doesn't block I/O completions for normal delayed allocation or overwrite data. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_aops.c | 38 +++++++++++++++++++++----------------- fs/xfs/linux-2.6/xfs_aops.h | 1 + fs/xfs/linux-2.6/xfs_buf.c | 9 +++++++++ 3 files changed, 31 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c13f67300fe..7ec89fc05b2 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -152,23 +152,6 @@ xfs_find_bdev_for_inode( return mp->m_ddev_targp->bt_bdev; } -/* - * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. If we are asked to wait, - * flush the workqueue. - */ -STATIC void -xfs_finish_ioend( - xfs_ioend_t *ioend, - int wait) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) { - queue_work(xfsdatad_workqueue, &ioend->io_work); - if (wait) - flush_workqueue(xfsdatad_workqueue); - } -} - /* * We're now finished for good with this ioend structure. * Update the page state via the associated buffer_heads, @@ -309,6 +292,27 @@ xfs_end_bio_read( xfs_destroy_ioend(ioend); } +/* + * Schedule IO completion handling on a xfsdatad if this was + * the final hold on this ioend. If we are asked to wait, + * flush the workqueue. + */ +STATIC void +xfs_finish_ioend( + xfs_ioend_t *ioend, + int wait) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + struct workqueue_struct *wq = xfsdatad_workqueue; + if (ioend->io_work.func == xfs_end_bio_unwritten) + wq = xfsconvertd_workqueue; + + queue_work(wq, &ioend->io_work); + if (wait) + flush_workqueue(wq); + } +} + /* * Allocate and initialise an IO completion structure. * We need to track unwritten extent write completion here initially. 
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 1dd52884975..221b3e66cee 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -19,6 +19,7 @@ #define __XFS_AOPS_H__ extern struct workqueue_struct *xfsdatad_workqueue; +extern struct workqueue_struct *xfsconvertd_workqueue; extern mempool_t *xfs_ioend_pool; /* diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index aa1016bb913..e28800a9f2b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -51,6 +51,7 @@ static struct shrinker xfs_buf_shake = { static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; +struct workqueue_struct *xfsconvertd_workqueue; #ifdef XFS_BUF_TRACE void @@ -1775,6 +1776,7 @@ xfs_flush_buftarg( xfs_buf_t *bp, *n; int pincount = 0; + xfs_buf_runall_queues(xfsconvertd_workqueue); xfs_buf_runall_queues(xfsdatad_workqueue); xfs_buf_runall_queues(xfslogd_workqueue); @@ -1831,9 +1833,15 @@ xfs_buf_init(void) if (!xfsdatad_workqueue) goto out_destroy_xfslogd_workqueue; + xfsconvertd_workqueue = create_workqueue("xfsconvertd"); + if (!xfsconvertd_workqueue) + goto out_destroy_xfsdatad_workqueue; + register_shrinker(&xfs_buf_shake); return 0; + out_destroy_xfsdatad_workqueue: + destroy_workqueue(xfsdatad_workqueue); out_destroy_xfslogd_workqueue: destroy_workqueue(xfslogd_workqueue); out_free_buf_zone: @@ -1849,6 +1857,7 @@ void xfs_buf_terminate(void) { unregister_shrinker(&xfs_buf_shake); + destroy_workqueue(xfsconvertd_workqueue); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); -- cgit v1.2.3 From 9d7fef74b23fe57803c5f71fab11630d9ec2cb4b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 6 Apr 2009 18:42:59 +0200 Subject: xfs: inform the xfsaild of the push target before sleeping When trying to reserve log space, we find the amount of space we need, then go to sleep waiting for space. 
When we are woken, we try to push the tail of the log forward to make sure we have space available. Unfortunately, this means that if there is not space available, and everyone who needs space goes to sleep there is no-one left to push the tail of the log to make space available. Once we have a thread waiting for space to become available, the others queue up behind it in a FIFO, and none of them push the tail of the log. This can result in everyone going to sleep in xlog_grant_log_space() if the first sleeper races with the last I/O that moves the tail of the log forward. With no further I/O to move the tail of the log, there is nothing to wake the sleepers and hence all transactions just stop. Fix this by making sure the xfsaild will create enough space for the transaction that is about to sleep by moving the push target far enough forwards to ensure that the current process will have enough space available when it is woken. That is, we push the AIL before we go to sleep. Because we've inserted the log ticket into the queue before we've pushed and gone to sleep, subsequent transactions will wait behind this one. Hence we are guaranteed to have space available when we are woken. 
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_log.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8016d304074..3750f04ede0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2560,18 +2560,19 @@ redo: xlog_ins_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: sleep 2"); + spin_unlock(&log->l_grant_lock); + xlog_grant_push_ail(log->l_mp, need_bytes); + spin_lock(&log->l_grant_lock); + XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); - if (XLOG_FORCED_SHUTDOWN(log)) { - spin_lock(&log->l_grant_lock); + spin_lock(&log->l_grant_lock); + if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - } xlog_trace_loggrant(log, tic, "xlog_grant_log_space: wake 2"); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_reserve_headq, tic); @@ -2650,7 +2651,7 @@ xlog_regrant_write_log_space(xlog_t *log, * for more free space, otherwise try to get some space for * this transaction. 
*/ - + need_bytes = tic->t_unit_res; if ((ntic = log->l_write_headq)) { free_bytes = xlog_space_left(log, log->l_grant_write_cycle, log->l_grant_write_bytes); @@ -2670,26 +2671,25 @@ xlog_regrant_write_log_space(xlog_t *log, xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: sleep 1"); + spin_unlock(&log->l_grant_lock); + xlog_grant_push_ail(log->l_mp, need_bytes); + spin_lock(&log->l_grant_lock); + XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already * off the queue */ - if (XLOG_FORCED_SHUTDOWN(log)) { - spin_lock(&log->l_grant_lock); + spin_lock(&log->l_grant_lock); + if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - } xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: wake 1"); - xlog_grant_push_ail(log->l_mp, tic->t_unit_res); - spin_lock(&log->l_grant_lock); } } - need_bytes = tic->t_unit_res; - redo: if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; @@ -2699,19 +2699,20 @@ redo: if (free_bytes < need_bytes) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) xlog_ins_ticketq(&log->l_write_headq, tic); + spin_unlock(&log->l_grant_lock); + xlog_grant_push_ail(log->l_mp, need_bytes); + spin_lock(&log->l_grant_lock); + XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already off the queue */ - if (XLOG_FORCED_SHUTDOWN(log)) { - spin_lock(&log->l_grant_lock); + spin_lock(&log->l_grant_lock); + if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - } xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: wake 2"); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_write_headq, tic); -- cgit v1.2.3 From a8d770d987ee20b59fba6c37d7f0f2a351913c4b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 6 Apr 2009 18:44:54 +0200 Subject: xfs: use xfs_sync_inodes() for device 
flushing Currently xfs_device_flush calls sync_blockdev() which is a no-op for XFS as all its metadata is held in a different address space to the one sync_blockdev() works on. Call xfs_sync_inodes() instead to flush all the delayed allocation blocks out. To do this as efficiently as possible, do it via two passes - one to do an async flush of all the dirty blocks and a second to wait for all the IO to complete. This requires some modification to the xfs_sync_inodes_ag() flush code to do efficiently. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_fs_subr.c | 14 +++++++------- fs/xfs/linux-2.6/xfs_sync.c | 43 ++++++++++++++++++++++++------------------ fs/xfs/linux-2.6/xfs_sync.h | 7 ++++--- fs/xfs/xfs_iomap.c | 2 +- fs/xfs/xfs_mount.h | 2 +- 5 files changed, 38 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index 5aeb7777696..08be36d7326 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -74,14 +74,14 @@ xfs_flush_pages( if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = filemap_fdatawrite(mapping); - if (flags & XFS_B_ASYNC) - return -ret; - ret2 = filemap_fdatawait(mapping); - if (!ret) - ret = ret2; + ret = -filemap_fdatawrite(mapping); } - return -ret; + if (flags & XFS_B_ASYNC) + return ret; + ret2 = xfs_wait_on_pages(ip, first, last); + if (!ret) + ret = ret2; + return ret; } int diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index a608e72fa40..88caafc8ef1 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -62,12 +62,6 @@ xfs_sync_inodes_ag( uint32_t first_index = 0; int error = 0; int last_error = 0; - int fflag = XFS_B_ASYNC; - - if (flags & SYNC_DELWRI) - fflag = XFS_B_DELWRI; - if (flags & SYNC_WAIT) - fflag = 0; /* synchronous overrides all */ do { struct inode *inode; @@ -128,11 +122,23 @@ xfs_sync_inodes_ag( * If we 
have to flush data or wait for I/O completion * we need to hold the iolock. */ - if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) { - xfs_ilock(ip, XFS_IOLOCK_SHARED); - lock_flags |= XFS_IOLOCK_SHARED; - error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE); - if (flags & SYNC_IOWAIT) + if (flags & SYNC_DELWRI) { + if (VN_DIRTY(inode)) { + if (flags & SYNC_TRYLOCK) { + if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + lock_flags |= XFS_IOLOCK_SHARED; + } else { + xfs_ilock(ip, XFS_IOLOCK_SHARED); + lock_flags |= XFS_IOLOCK_SHARED; + } + if (lock_flags & XFS_IOLOCK_SHARED) { + error = xfs_flush_pages(ip, 0, -1, + (flags & SYNC_WAIT) ? 0 + : XFS_B_ASYNC, + FI_NONE); + } + } + if (VN_CACHED(inode) && (flags & SYNC_IOWAIT)) xfs_ioend_wait(ip); } xfs_ilock(ip, XFS_ILOCK_SHARED); @@ -400,9 +406,9 @@ xfs_syncd_queue_work( void *data, void (*syncer)(struct xfs_mount *, void *)) { - struct bhv_vfs_sync_work *work; + struct xfs_sync_work *work; - work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP); + work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP); INIT_LIST_HEAD(&work->w_list); work->w_syncer = syncer; work->w_data = data; @@ -445,23 +451,24 @@ xfs_flush_inode( * (IOW, "If at first you don't succeed, use a Bigger Hammer"). 
*/ STATIC void -xfs_flush_device_work( +xfs_flush_inodes_work( struct xfs_mount *mp, void *arg) { struct inode *inode = arg; - sync_blockdev(mp->m_super->s_bdev); + xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK); + xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT); iput(inode); } void -xfs_flush_device( +xfs_flush_inodes( xfs_inode_t *ip) { struct inode *inode = VFS_I(ip); igrab(inode); - xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work); + xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work); delay(msecs_to_jiffies(500)); xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); } @@ -497,7 +504,7 @@ xfssyncd( { struct xfs_mount *mp = arg; long timeleft; - bhv_vfs_sync_work_t *work, *n; + xfs_sync_work_t *work, *n; LIST_HEAD (tmp); set_freezable(); diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 04f058c848a..ec95e264805 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -21,18 +21,19 @@ struct xfs_mount; struct xfs_perag; -typedef struct bhv_vfs_sync_work { +typedef struct xfs_sync_work { struct list_head w_list; struct xfs_mount *w_mount; void *w_data; /* syncer routine argument */ void (*w_syncer)(struct xfs_mount *, void *); -} bhv_vfs_sync_work_t; +} xfs_sync_work_t; #define SYNC_ATTR 0x0001 /* sync attributes */ #define SYNC_DELWRI 0x0002 /* look at delayed writes */ #define SYNC_WAIT 0x0004 /* wait for i/o to complete */ #define SYNC_BDFLUSH 0x0008 /* BDFLUSH is calling -- don't block */ #define SYNC_IOWAIT 0x0010 /* wait for all I/O to complete */ +#define SYNC_TRYLOCK 0x0020 /* only try to lock inodes */ int xfs_syncd_init(struct xfs_mount *mp); void xfs_syncd_stop(struct xfs_mount *mp); @@ -44,7 +45,7 @@ int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); void xfs_flush_inode(struct xfs_inode *ip); -void xfs_flush_device(struct xfs_inode *ip); +void xfs_flush_inodes(struct xfs_inode *ip); int 
xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 08ce72316bf..8b97d82d7a8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -361,7 +361,7 @@ xfs_flush_space( return 0; case 2: xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_device(ip); + xfs_flush_inodes(ip); xfs_ilock(ip, XFS_ILOCK_EXCL); *fsynced = 3; return 0; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7af44adffc8..d6a64392f98 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -313,7 +313,7 @@ typedef struct xfs_mount { #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct task_struct *m_sync_task; /* generalised sync thread */ - bhv_vfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ + xfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ struct list_head m_sync_list; /* sync thread work item list */ spinlock_t m_sync_lock; /* work item list lock */ int m_sync_seq; /* sync thread generation no. */ -- cgit v1.2.3 From 5825294edd3364cbba6514f70d88debec4f6cec7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 6 Apr 2009 18:45:44 +0200 Subject: xfs: make inode flush at ENOSPC synchronous When we are writing to a single file and hit ENOSPC, we trigger a background flush of the inode and try again. Because we hold page locks and the iolock, the flush won't proceed until after we release these locks. This occurs once we've given up and ENOSPC has been reported. Hence if this one is the only dirty inode in the system, we'll get an ENOSPC prematurely. To fix this, remove the async flush from the allocation routines and move it to the top of the write path where we can do a synchronous flush and retry the write again. Only retry once as a second ENOSPC indicates that we really are ENOSPC. 
This avoids a page cache deadlock when trying to do this flush synchronously in the allocation layer that was identified by Mikulas Patocka. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_lrw.c | 18 +++++++++++++++++- fs/xfs/linux-2.6/xfs_sync.c | 25 ------------------------- fs/xfs/linux-2.6/xfs_sync.h | 1 - fs/xfs/xfs_iomap.c | 2 +- 4 files changed, 18 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 7e90daa0d1d..9142192ccbe 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -751,10 +751,26 @@ start: goto relock; } } else { + int enospc = 0; + ssize_t ret2 = 0; + +write_retry: xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs, *offset, ioflags); - ret = generic_file_buffered_write(iocb, iovp, segs, + ret2 = generic_file_buffered_write(iocb, iovp, segs, pos, offset, count, ret); + /* + * if we just got an ENOSPC, flush the inode now we + * aren't holding any page locks and retry *once* + */ + if (ret2 == -ENOSPC && !enospc) { + error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE); + if (error) + goto out_unlock_internal; + enospc = 1; + goto write_retry; + } + ret = ret2; } current->backing_dev_info = NULL; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 88caafc8ef1..73cf8dc1973 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -426,31 +426,6 @@ xfs_syncd_queue_work( * heads, looking about for more room... */ STATIC void -xfs_flush_inode_work( - struct xfs_mount *mp, - void *arg) -{ - struct inode *inode = arg; - filemap_flush(inode->i_mapping); - iput(inode); -} - -void -xfs_flush_inode( - xfs_inode_t *ip) -{ - struct inode *inode = VFS_I(ip); - - igrab(inode); - xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work); - delay(msecs_to_jiffies(500)); -} - -/* - * This is the "bigger hammer" version of xfs_flush_inode_work... 
- * (IOW, "If at first you don't succeed, use a Bigger Hammer"). - */ -STATIC void xfs_flush_inodes_work( struct xfs_mount *mp, void *arg) diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index ec95e264805..6e83a35626e 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -44,7 +44,6 @@ int xfs_sync_fsdata(struct xfs_mount *mp, int flags); int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); -void xfs_flush_inode(struct xfs_inode *ip); void xfs_flush_inodes(struct xfs_inode *ip); int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 8b97d82d7a8..7b8b1707103 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -347,7 +347,7 @@ xfs_flush_space( case 0: if (ip->i_delayed_blks) { xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inode(ip); + delay(1); xfs_ilock(ip, XFS_ILOCK_EXCL); *fsynced = 1; } else { -- cgit v1.2.3 From e43afd72d2455defd63a3f94f22fa09b586e58ed Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 6 Apr 2009 18:47:27 +0200 Subject: xfs: block callers of xfs_flush_inodes() correctly xfs_flush_inodes() currently uses a magic timeout to wait for some inodes to be flushed before returning. This isn't really reliable but used to be the best that could be done due to deadlock potential of waiting for the entire flush. Now the inode flush is safe to execute while we hold page and inode locks, we can wait for all the inodes to flush synchronously. Convert the wait mechanism to a completion to do this efficiently. This should remove all remaining spurious ENOSPC errors from the delayed allocation reservation path. This is extracted almost line for line from a larger patch from Mikulas Patocka. 
Signed-off-by: Mikulas Patocka Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_sync.c | 12 +++++++++--- fs/xfs/linux-2.6/xfs_sync.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 73cf8dc1973..f7ba76633c2 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -404,7 +404,8 @@ STATIC void xfs_syncd_queue_work( struct xfs_mount *mp, void *data, - void (*syncer)(struct xfs_mount *, void *)) + void (*syncer)(struct xfs_mount *, void *), + struct completion *completion) { struct xfs_sync_work *work; @@ -413,6 +414,7 @@ xfs_syncd_queue_work( work->w_syncer = syncer; work->w_data = data; work->w_mount = mp; + work->w_completion = completion; spin_lock(&mp->m_sync_lock); list_add_tail(&work->w_list, &mp->m_sync_list); spin_unlock(&mp->m_sync_lock); @@ -441,10 +443,11 @@ xfs_flush_inodes( xfs_inode_t *ip) { struct inode *inode = VFS_I(ip); + DECLARE_COMPLETION_ONSTACK(completion); igrab(inode); - xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work); - delay(msecs_to_jiffies(500)); + xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); + wait_for_completion(&completion); xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); } @@ -514,6 +517,8 @@ xfssyncd( list_del(&work->w_list); if (work == &mp->m_sync_work) continue; + if (work->w_completion) + complete(work->w_completion); kmem_free(work); } } @@ -527,6 +532,7 @@ xfs_syncd_init( { mp->m_sync_work.w_syncer = xfs_sync_worker; mp->m_sync_work.w_mount = mp; + mp->m_sync_work.w_completion = NULL; mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); if (IS_ERR(mp->m_sync_task)) return -PTR_ERR(mp->m_sync_task); diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 6e83a35626e..308d5bf6dfb 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -26,6 +26,7 @@ typedef 
struct xfs_sync_work { struct xfs_mount *w_mount; void *w_data; /* syncer routine argument */ void (*w_syncer)(struct xfs_mount *, void *); + struct completion *w_completion; } xfs_sync_work_t; #define SYNC_ATTR 0x0001 /* sync attributes */ -- cgit v1.2.3 From 153fec43ce5264dfe9f3530b281a2e940b25a0a8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 6 Apr 2009 18:48:30 +0200 Subject: xfs: flush delayed allocation blocks on ENOSPC in create If we are creating lots of small files, we can fail to get a reservation for inode create earlier than we should due to EOF preallocation done during delayed allocation reservation. Hence on the first reservation ENOSPC failure flush all the delayed allocation blocks out of the system and retry. This fixes the last commonly triggered spurious ENOSPC issue that has been reported. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_vnodeops.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 7394c7af5de..19cf90a9c76 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1457,6 +1457,13 @@ xfs_create( error = xfs_trans_reserve(tp, resblks, log_res, 0, XFS_TRANS_PERM_LOG_RES, log_count); if (error == ENOSPC) { + /* flush outstanding delalloc blocks and retry */ + xfs_flush_inodes(dp); + error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); + } + if (error == ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ resblks = 0; error = xfs_trans_reserve(tp, 0, log_res, 0, XFS_TRANS_PERM_LOG_RES, log_count); -- cgit v1.2.3 From 8de2bf937a6bea8f0f775fd5399ba20c1a0c3d77 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 6 Apr 2009 18:49:12 +0200 Subject: xfs: remove xfs_flush_space The only thing we need to do now when we get an ENOSPC condition during delayed allocation reservation is flush all the other inodes with delalloc blocks on them and
retry without EOF preallocation. Remove the unneeded mess that is xfs_flush_space() and just call xfs_flush_inodes() directly from xfs_iomap_write_delay(). Also, change the location of the retry label to avoid trying to do EOF preallocation because we don't want to do that at ENOSPC. This enables us to remove the BMAPI_SYNC flag as it is no longer used. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_iomap.c | 61 ++++++++++++++---------------------------------------- fs/xfs/xfs_iomap.h | 3 +-- 2 files changed, 16 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 7b8b1707103..5aaa2d7ec15 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -337,38 +337,6 @@ xfs_iomap_eof_align_last_fsb( return 0; } -STATIC int -xfs_flush_space( - xfs_inode_t *ip, - int *fsynced, - int *ioflags) -{ - switch (*fsynced) { - case 0: - if (ip->i_delayed_blks) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - delay(1); - xfs_ilock(ip, XFS_ILOCK_EXCL); - *fsynced = 1; - } else { - *ioflags |= BMAPI_SYNC; - *fsynced = 2; - } - return 0; - case 1: - *fsynced = 2; - *ioflags |= BMAPI_SYNC; - return 0; - case 2: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inodes(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); - *fsynced = 3; - return 0; - } - return 1; -} - STATIC int xfs_cmn_err_fsblock_zero( xfs_inode_t *ip, @@ -538,15 +506,9 @@ error_out: } /* - * If the caller is doing a write at the end of the file, - * then extend the allocation out to the file system's write - * iosize. We clean up any extra space left over when the - * file is closed in xfs_inactive(). - * - * For sync writes, we are flushing delayed allocate space to - * try to make additional space available for allocation near - * the filesystem full boundary - preallocation hurts in that - * situation, of course. + * If the caller is doing a write at the end of the file, then extend the + * allocation out to the file system's write iosize. 
We clean up any extra + * space left over when the file is closed in xfs_inactive(). */ STATIC int xfs_iomap_eof_want_preallocate( @@ -565,7 +527,7 @@ xfs_iomap_eof_want_preallocate( int n, error, imaps; *prealloc = 0; - if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size) + if ((offset + count) <= ip->i_size) return 0; /* @@ -611,7 +573,7 @@ xfs_iomap_write_delay( xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int prealloc, fsynced = 0; + int prealloc, flushed = 0; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -627,12 +589,12 @@ xfs_iomap_write_delay( extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); -retry: error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, ioflag, imap, XFS_WRITE_IMAPS, &prealloc); if (error) return error; +retry: if (prealloc) { aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); @@ -659,15 +621,22 @@ retry: /* * If bmapi returned us nothing, and if we didn't get back EDQUOT, - * then we must have run out of space - flush delalloc, and retry.. + * then we must have run out of space - flush all other inodes with + * delalloc blocks and retry without EOF preallocation. 
*/ if (nimaps == 0) { xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE, ip, offset, count); - if (xfs_flush_space(ip, &fsynced, &ioflag)) + if (flushed) return XFS_ERROR(ENOSPC); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_inodes(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + + flushed = 1; error = 0; + prealloc = 0; goto retry; } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index a1cc1322fc0..fdcf7b82747 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -40,8 +40,7 @@ typedef enum { BMAPI_IGNSTATE = (1 << 4), /* ignore unwritten state on read */ BMAPI_DIRECT = (1 << 5), /* direct instead of buffered write */ BMAPI_MMAP = (1 << 6), /* allocate for mmap write */ - BMAPI_SYNC = (1 << 7), /* sync write to flush delalloc space */ - BMAPI_TRYLOCK = (1 << 8), /* non-blocking request */ + BMAPI_TRYLOCK = (1 << 7), /* non-blocking request */ } bmapi_flags_t; -- cgit v1.2.3 From 0f2ddca66d70c8ccba7486cf2d79c6b60e777abd Mon Sep 17 00:00:00 2001 From: "From: Thiemo Nagel" Date: Tue, 7 Apr 2009 14:07:47 -0400 Subject: ext4: check block device size on mount Signed-off-by: Thiemo Nagel Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9987bba99db..2958f4e6f22 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2508,6 +2508,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (EXT4_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext4; + /* check blocks count against device size */ + blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu " + "exceeds size of device (%llu blocks)\n", + ext4_blocks_count(es), blocks_count); + goto failed_mount; + } + /* * It makes no sense for the first data block to be beyond the end * of the filesystem. 
-- cgit v1.2.3 From 6e34eeddf7deec1444bbddab533f03f520d8458c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 7 Apr 2009 18:12:43 -0400 Subject: block_write_full_page: switch synchronous writes to use WRITE_SYNC_PLUG Now that we have a distinction between WRITE_SYNC and WRITE_SYNC_PLUG, use WRITE_SYNC_PLUG in __block_write_full_page() to avoid unplugging the block device I/O queue between each page that gets flushed out. Otherwise, when we run sync() or fsync() and we need to write out a large number of pages, the block device queue will get unplugged for every page that is flushed out, which will be a pretty serious performance regression caused by commit a64c8610. Signed-off-by: "Theodore Ts'o" --- fs/buffer.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 6e35762b616..13edf7ad3ff 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1596,6 +1596,16 @@ EXPORT_SYMBOL(unmap_underlying_metadata); * locked buffer. This only can happen if someone has written the buffer * directly, with submit_bh(). At the address_space level PageWriteback * prevents this contention from occurring. + * + * If block_write_full_page() is called with wbc->sync_mode == + * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this + * causes the writes to be flagged as synchronous writes, but the + * block device queue will NOT be unplugged, since usually many pages + * will be pushed out before the higher-level caller actually + * waits for the writes to be completed. The various wait functions, + * such as wait_on_writeback_range() will ultimately call sync_page() + * which will ultimately call blk_run_backing_dev(), which will end up + * unplugging the device queue.
*/ static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc) @@ -1606,7 +1616,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; const unsigned blocksize = 1 << inode->i_blkbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC_PLUG : WRITE); BUG_ON(!PageLocked(page)); -- cgit v1.2.3 From 430db323fae7665da721768949ade6304811c648 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 7 Apr 2009 18:25:01 -0400 Subject: ext3: Try to avoid starting a transaction in writepage for data=writeback This does the same as commit 9e80d407736161d9b8b0c5a0d44f786e44c322ea (avoid starting a transaction when no block allocation is needed) but for data=writeback mode of ext3. We also clean up the data=ordered case a bit to stick to coding style... Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext3/inode.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 466a332e0bd..fcfa2436185 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1521,12 +1521,16 @@ static int ext3_ordered_writepage(struct page *page, if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); - } else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { - /* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */ - return block_write_full_page(page, NULL, wbc); + page_bufs = page_buffers(page); + } else { + page_bufs = page_buffers(page); + if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return
block_write_full_page(page, NULL, wbc); + } } - page_bufs = page_buffers(page); - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { @@ -1581,6 +1585,15 @@ static int ext3_writeback_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); -- cgit v1.2.3 From 2b3fffefea993a94c386b2d96de2d09469c343d1 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Tue, 7 Apr 2009 21:21:42 -0700 Subject: befs: fix build on parisc fs/befs/super.c:85: error: 'PAGE_SIZE' undeclared Signed-off-by: Alexander Beregalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/befs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/befs/super.c b/fs/befs/super.c index 41f2b4d0093..ca40f828f64 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -8,6 +8,7 @@ */ #include <linux/fs.h> +#include <asm/page.h> /* for PAGE_SIZE */ #include "befs.h" #include "super.h" -- cgit v1.2.3 From 4c967291fc875a53de7126d256ad5e48f42a6521 Mon Sep 17 00:00:00 2001 From: Nobuhiro Iwamatsu Date: Tue, 7 Apr 2009 21:21:43 -0700 Subject: nommu: fix typo vma->pg_off to vma->vm_pgoff 6260a4b0521a41189b2c2a8119096c1e21dbdf2c ("/proc/pid/maps: don't show pgoff of pure ANON VMAs") had a typo.
fs/proc/task_nommu.c:138: error: 'struct vm_area_struct' has no member named 'pg_off' distcc[21484] ERROR: compile fs/proc/task_nommu.c on sprygo/32 failed Signed-off-by: Nobuhiro Iwamatsu Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_nommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 12c20377772..64a72e2e765 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -135,7 +135,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; - pgoff = (loff_t)vma->pg_off << PAGE_SHIFT; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; } seq_printf(m, -- cgit v1.2.3 From ce60a2f15764f296b0467960759351702c7d2986 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 9 Apr 2009 17:37:52 +0200 Subject: fuse: fix argument type in fuse_get_user_pages() Fix the following warning: fs/fuse/file.c: In function 'fuse_direct_io': fs/fuse/file.c:1002: warning: passing argument 3 of 'fuse_get_user_pages' from incompatible pointer type This was introduced by commit f4975c67 "fuse: allow kernel to access "direct_io" files". 
Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2b25133524a..0946861b10b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -938,9 +938,9 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, - unsigned *nbytesp, int write) + size_t *nbytesp, int write) { - unsigned nbytes = *nbytesp; + size_t nbytes = *nbytesp; unsigned long user_addr = (unsigned long) buf; unsigned offset = user_addr & ~PAGE_MASK; int npages; @@ -955,7 +955,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, return 0; } - nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); + nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); down_read(&current->mm->mmap_sem); -- cgit v1.2.3 From 3121bfe7631126d1b13064855ac2cfa164381bb0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 9 Apr 2009 17:37:53 +0200 Subject: fuse: fix "direct_io" private mmap MAP_PRIVATE mmap could return stale data from the cache for "direct_io" files. Fix this by flushing the cache on mmap. Found with a slightly modified fsx-linux.
Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0946861b10b..06f30e96567 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1298,6 +1298,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_MAYSHARE) return -ENODEV; + invalidate_inode_pages2(file->f_mapping); + return generic_file_mmap(file, vma); } -- cgit v1.2.3 From 11ff5f6affe9b75f115a900a5584db339d46002b Mon Sep 17 00:00:00 2001 From: Stoyan Gaydarov Date: Thu, 9 Apr 2009 17:10:28 +0100 Subject: afs: BUG to BUG_ON changes Signed-off-by: Stoyan Gaydarov Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/netdevices.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c index 49f18942306..7ad36506c25 100644 --- a/fs/afs/netdevices.c +++ b/fs/afs/netdevices.c @@ -20,8 +20,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen) struct net_device *dev; int ret = -ENODEV; - if (maclen != ETH_ALEN) - BUG(); + BUG_ON(maclen != ETH_ALEN); rtnl_lock(); dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER); -- cgit v1.2.3 From c306af23e19d3c94c9229263c39fe487e915e774 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 26 Mar 2009 10:16:57 +0900 Subject: nilfs2: return f_fsid for statfs2 This follows the change of Coly Li's series ("fs: return f_fsid for statfs(2)"), and make nilfs2 return f_fsid info for statfs(2). 
Acked-by: Coly Li Signed-off-by: Ryusuke Konishi --- fs/nilfs2/super.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index e117e1ea9bf..8a965f9523a 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -476,11 +476,12 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct the_nilfs *nilfs = sbi->s_nilfs; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); unsigned long long blocks; unsigned long overhead; unsigned long nrsvblocks; sector_t nfreeblocks; - struct the_nilfs *nilfs = sbi->s_nilfs; int err; /* @@ -514,6 +515,9 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = atomic_read(&sbi->s_inodes_count); buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ buf->f_namelen = NILFS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; } -- cgit v1.2.3 From bcb48891b05b4179edc86298d3dccb2ce90d5413 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 27 Mar 2009 02:51:39 +0900 Subject: nilfs2: fix lockdep recursive locking warning on bmap The bmap semaphore of DAT file can be held while a bmap of other files is locked. This has caused the following false detection of lockdep check: mount.nilfs2/4667 is trying to acquire lock: (&bmap->b_sem){..--}, at: [] nilfs_bmap_lookup_at_level+0x1a/0x74 [nilfs2] but task is already holding lock: (&bmap->b_sem){..--}, at: [] nilfs_bmap_lookup_at_level+0x1a/0x74 [nilfs2] This will fix the false detection by distinguishing semaphores of the DAT and other files. 
Signed-off-by: Ryusuke Konishi --- fs/nilfs2/bmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 24638e059bf..064279e33bb 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -688,6 +688,8 @@ static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = { .bpop_translate = NULL, }; +static struct lock_class_key nilfs_bmap_dat_lock_key; + /** * nilfs_bmap_read - read a bmap from an inode * @bmap: bmap @@ -715,6 +717,7 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) bmap->b_pops = &nilfs_bmap_ptr_ops_p; bmap->b_last_allocated_key = 0; /* XXX: use macro */ bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); break; case NILFS_CPFILE_INO: case NILFS_SUFILE_INO: @@ -772,6 +775,7 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) { memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union)); init_rwsem(&gcbmap->b_sem); + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; } @@ -779,5 +783,6 @@ void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) { memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union)); init_rwsem(&bmap->b_sem); + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; } -- cgit v1.2.3 From c2698e50e304cd29a7836f05452359a3306a405e Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 27 Mar 2009 02:53:12 +0900 Subject: nilfs2: fix lockdep recursive locking warning on meta data files This fixes the following false detection of lockdep against nilfs meta data files: ============================================= [ INFO: possible recursive locking detected ] 2.6.29 #26 --------------------------------------------- mount.nilfs2/4185 is trying to acquire lock: (&mi->mi_sem){----}, at: [] nilfs_sufile_get_stat+0x1e/0x105 [nilfs2] 
but task is already holding lock: (&mi->mi_sem){----}, at: [] nilfs_count_free_blocks+0x48/0x84 [nilfs2] Signed-off-by: Ryusuke Konishi --- fs/nilfs2/the_nilfs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 33400cf0bbe..7f65b3be4aa 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -115,6 +115,7 @@ void put_nilfs(struct the_nilfs *nilfs) static int nilfs_load_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, sector_t sr_block) { + static struct lock_class_key dat_lock_key; struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; struct nilfs_super_block **sbp = nilfs->ns_sbp; @@ -163,6 +164,9 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, if (unlikely(err)) goto failed_sufile; + lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key); + lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key); + nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat); nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size, sizeof(struct nilfs_cpfile_header)); -- cgit v1.2.3 From e7a7402c0d392dcadc74cae8922f8fae4667605a Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 27 Mar 2009 10:49:11 +0900 Subject: nilfs2: remove module version A MODULE_VERSION() macro has been used in out-of-tree nilfs modules, but it's needless and not updated in tree. So, this removes it along with the version declaration. 
Signed-off-by: Ryusuke Konishi --- fs/nilfs2/nilfs.h | 5 ----- fs/nilfs2/super.c | 1 - 2 files changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 7558c977db0..3d0c18a16db 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -34,11 +34,6 @@ #include "bmap.h" #include "bmap_union.h" -/* - * NILFS filesystem version - */ -#define NILFS_VERSION "2.0.5" - /* * nilfs inode data in memory */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8a965f9523a..6989b03e97a 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -63,7 +63,6 @@ MODULE_AUTHOR("NTT Corp."); MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); -MODULE_VERSION(NILFS_VERSION); MODULE_LICENSE("GPL"); static int nilfs_remount(struct super_block *sb, int *flags, char *data); -- cgit v1.2.3 From 3efb55b496952e0d29a9ec66d0ceaab175c4e8ca Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 30 Mar 2009 00:50:19 +0900 Subject: nilfs2: simplify handling of active state of segments fix This fixes a bug in the ("nilfs2: simplify handling of active state of segments") patch. The patch did not take into account that a base index is increased in the nilfs_sufile_get_suinfo() function if requested entries cross a block boundary on sufile. Due to this bug, the active flag sometimes appears on wrong segments and has induced malfunction of garbage collection.
Signed-off-by: Ryusuke Konishi --- fs/nilfs2/sufile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index c774cf397e2..1ef2b4d9d79 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -625,7 +625,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks); si[i + j].sui_flags = le32_to_cpu(su->su_flags) & ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); - if (nilfs_segment_is_active(nilfs, segnum + i + j)) + if (nilfs_segment_is_active(nilfs, segnum + j)) si[i + j].sui_flags |= (1UL << NILFS_SEGMENT_USAGE_ACTIVE); } -- cgit v1.2.3 From 88072faf9a32c92f37c15065496bb6eb309aebe3 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 5 Apr 2009 15:03:16 +0900 Subject: nilfs2: fix wrong accounting and duplicate brelse in nilfs_sufile_set_error The nilfs_sufile_set_error() function wrongly adjusts the number of dirty segments instead of the number of clean segments. In addition, the function calls brelse() twice for the same buffer head. This fixes these bugs. 
Signed-off-by: Ryusuke Konishi --- fs/nilfs2/sufile.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1ef2b4d9d79..8b2f93ca1e1 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -523,7 +523,7 @@ int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) struct nilfs_segment_usage *su; struct nilfs_sufile_header *header; void *kaddr; - int ret; + int suclean, ret; if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { printk(KERN_WARNING "%s: invalid segment number: %llu\n", @@ -546,16 +546,19 @@ int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) brelse(su_bh); goto out_header; } + suclean = nilfs_segment_usage_clean(su); nilfs_segment_usage_set_error(su); kunmap_atomic(kaddr, KM_USER0); - brelse(su_bh); - kaddr = kmap_atomic(header_bh->b_page, KM_USER0); - header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); - le64_add_cpu(&header->sh_ndirtysegs, -1); - kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(header_bh); + if (suclean) { + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, + kaddr); + le64_add_cpu(&header->sh_ncleansegs, -1); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(header_bh); + } nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); brelse(su_bh); -- cgit v1.2.3 From a703018f7bbec8109419318f5d51f235fdce5155 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 5 Apr 2009 18:24:11 +0900 Subject: nilfs2: segment usage file cleanups This will simplify sufile.c by sharing common code which repeatedly appears in routines updating a segment usage entry; a wrapper function nilfs_sufile_update() is introduced for the purpose, and counter modifications are integrated to a new function nilfs_sufile_mod_counter(). 
This is a preparation for the successive bugfix patch ("nilfs2: fix possible mismatch of sufile counters on recovery"). Signed-off-by: Ryusuke Konishi --- fs/nilfs2/sufile.c | 268 +++++++++++++++-------------------------------------- fs/nilfs2/sufile.h | 67 +++++++++++++- 2 files changed, 140 insertions(+), 195 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 8b2f93ca1e1..07013f58dfe 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -93,6 +93,52 @@ nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, create, NULL, bhp); } +static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, + u64 ncleanadd, u64 ndirtyadd) +{ + struct nilfs_sufile_header *header; + void *kaddr; + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + le64_add_cpu(&header->sh_ncleansegs, ncleanadd); + le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(header_bh); +} + +int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)) +{ + struct buffer_head *header_bh, *bh; + int ret; + + if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { + printk(KERN_WARNING "%s: invalid segment number: %llu\n", + __func__, (unsigned long long)segnum); + return -EINVAL; + } + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh); + if (!ret) { + dofunc(sufile, segnum, header_bh, bh); + brelse(bh); + } + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + /** * nilfs_sufile_alloc - allocate a segment * @sufile: inode of segment usage file @@ -113,7 +159,6 @@ nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, int 
nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) { struct buffer_head *header_bh, *su_bh; - struct the_nilfs *nilfs; struct nilfs_sufile_header *header; struct nilfs_segment_usage *su; size_t susz = NILFS_MDT(sufile)->mi_entry_size; @@ -124,8 +169,6 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) down_write(&NILFS_MDT(sufile)->mi_sem); - nilfs = NILFS_MDT(sufile)->mi_nilfs; - ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; @@ -192,165 +235,55 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) return ret; } -/** - * nilfs_sufile_cancel_free - - * @sufile: inode of segment usage file - * @segnum: segment number - * - * Description: - * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - */ -int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum) +void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) { - struct buffer_head *header_bh, *su_bh; - struct the_nilfs *nilfs; - struct nilfs_sufile_header *header; struct nilfs_segment_usage *su; void *kaddr; - int ret; - - down_write(&NILFS_MDT(sufile)->mi_sem); - - nilfs = NILFS_MDT(sufile)->mi_nilfs; - - ret = nilfs_sufile_get_header_block(sufile, &header_bh); - if (ret < 0) - goto out_sem; - - ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); - if (ret < 0) - goto out_header; kaddr = kmap_atomic(su_bh->b_page, KM_USER0); - su = nilfs_sufile_block_get_segment_usage( - sufile, segnum, su_bh, kaddr); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (unlikely(!nilfs_segment_usage_clean(su))) { printk(KERN_WARNING "%s: segment %llu must be clean\n", __func__, (unsigned long long)segnum); kunmap_atomic(kaddr, KM_USER0); - goto out_su_bh; + return; } 
nilfs_segment_usage_set_dirty(su); kunmap_atomic(kaddr, KM_USER0); - kaddr = kmap_atomic(header_bh->b_page, KM_USER0); - header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); - le64_add_cpu(&header->sh_ncleansegs, -1); - le64_add_cpu(&header->sh_ndirtysegs, 1); - kunmap_atomic(kaddr, KM_USER0); - - nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_sufile_mod_counter(header_bh, -1, 1); nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); - - out_su_bh: - brelse(su_bh); - out_header: - brelse(header_bh); - out_sem: - up_write(&NILFS_MDT(sufile)->mi_sem); - return ret; } -/** - * nilfs_sufile_freev - free segments - * @sufile: inode of segment usage file - * @segnum: array of segment numbers - * @nsegs: number of segments - * - * Description: nilfs_sufile_freev() frees segments specified by @segnum and - * @nsegs, which must have been returned by a previous call to - * nilfs_sufile_alloc(). - * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. 
- */ -#define NILFS_SUFILE_FREEV_PREALLOC 16 -int nilfs_sufile_freev(struct inode *sufile, __u64 *segnum, size_t nsegs) +void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) { - struct buffer_head *header_bh, **su_bh, - *su_bh_prealloc[NILFS_SUFILE_FREEV_PREALLOC]; - struct the_nilfs *nilfs; - struct nilfs_sufile_header *header; struct nilfs_segment_usage *su; void *kaddr; - int ret, i; + int sudirty; - down_write(&NILFS_MDT(sufile)->mi_sem); - - nilfs = NILFS_MDT(sufile)->mi_nilfs; - - /* prepare resources */ - if (nsegs <= NILFS_SUFILE_FREEV_PREALLOC) - su_bh = su_bh_prealloc; - else { - su_bh = kmalloc(sizeof(*su_bh) * nsegs, GFP_NOFS); - if (su_bh == NULL) { - ret = -ENOMEM; - goto out_sem; - } - } - - ret = nilfs_sufile_get_header_block(sufile, &header_bh); - if (ret < 0) - goto out_su_bh; - for (i = 0; i < nsegs; i++) { - ret = nilfs_sufile_get_segment_usage_block(sufile, segnum[i], - 0, &su_bh[i]); - if (ret < 0) - goto out_bh; - } - - /* free segments */ - for (i = 0; i < nsegs; i++) { - kaddr = kmap_atomic(su_bh[i]->b_page, KM_USER0); - su = nilfs_sufile_block_get_segment_usage( - sufile, segnum[i], su_bh[i], kaddr); - WARN_ON(nilfs_segment_usage_error(su)); - nilfs_segment_usage_set_clean(su); + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_clean(su)) { + printk(KERN_WARNING "%s: segment %llu is already clean\n", + __func__, (unsigned long long)segnum); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(su_bh[i]); + return; } - kaddr = kmap_atomic(header_bh->b_page, KM_USER0); - header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); - le64_add_cpu(&header->sh_ncleansegs, nsegs); - le64_add_cpu(&header->sh_ndirtysegs, -(u64)nsegs); - kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(header_bh); - nilfs_mdt_mark_dirty(sufile); - - out_bh: - for (i--; i >= 
0; i--) - brelse(su_bh[i]); - brelse(header_bh); + WARN_ON(nilfs_segment_usage_error(su)); + WARN_ON(!nilfs_segment_usage_dirty(su)); - out_su_bh: - if (su_bh != su_bh_prealloc) - kfree(su_bh); - - out_sem: - up_write(&NILFS_MDT(sufile)->mi_sem); - return ret; -} + sudirty = nilfs_segment_usage_dirty(su); + nilfs_segment_usage_set_clean(su); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(su_bh); -/** - * nilfs_sufile_free - - * @sufile: - * @segnum: - */ -int nilfs_sufile_free(struct inode *sufile, __u64 segnum) -{ - return nilfs_sufile_freev(sufile, &segnum, 1); + nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); + nilfs_mdt_mark_dirty(sufile); } /** @@ -500,75 +433,28 @@ int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp) return ret; } -/** - * nilfs_sufile_set_error - mark a segment as erroneous - * @sufile: inode of segment usage file - * @segnum: segment number - * - * Description: nilfs_sufile_set_error() marks the segment specified by - * @segnum as erroneous. The error segment will never be used again. - * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid segment usage number. 
- */ -int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) +void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) { - struct buffer_head *header_bh, *su_bh; struct nilfs_segment_usage *su; - struct nilfs_sufile_header *header; void *kaddr; - int suclean, ret; - - if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { - printk(KERN_WARNING "%s: invalid segment number: %llu\n", - __func__, (unsigned long long)segnum); - return -EINVAL; - } - down_write(&NILFS_MDT(sufile)->mi_sem); - - ret = nilfs_sufile_get_header_block(sufile, &header_bh); - if (ret < 0) - goto out_sem; - ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); - if (ret < 0) - goto out_header; + int suclean; kaddr = kmap_atomic(su_bh->b_page, KM_USER0); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (nilfs_segment_usage_error(su)) { kunmap_atomic(kaddr, KM_USER0); - brelse(su_bh); - goto out_header; + return; } suclean = nilfs_segment_usage_clean(su); - nilfs_segment_usage_set_error(su); kunmap_atomic(kaddr, KM_USER0); - if (suclean) { - kaddr = kmap_atomic(header_bh->b_page, KM_USER0); - header = nilfs_sufile_block_get_header(sufile, header_bh, - kaddr); - le64_add_cpu(&header->sh_ncleansegs, -1); - kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(header_bh); - } + if (suclean) + nilfs_sufile_mod_counter(header_bh, -1, 0); nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); - brelse(su_bh); - - out_header: - brelse(header_bh); - - out_sem: - up_write(&NILFS_MDT(sufile)->mi_sem); - return ret; } /** diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index d595f33a768..449a6e2671b 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -36,9 +36,6 @@ static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) } int nilfs_sufile_alloc(struct inode *, __u64 *); -int nilfs_sufile_cancel_free(struct inode *, __u64); -int 
nilfs_sufile_freev(struct inode *, __u64 *, size_t); -int nilfs_sufile_free(struct inode *, __u64); int nilfs_sufile_get_segment_usage(struct inode *, __u64, struct nilfs_segment_usage **, struct buffer_head **); @@ -46,9 +43,71 @@ void nilfs_sufile_put_segment_usage(struct inode *, __u64, struct buffer_head *); int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *); -int nilfs_sufile_set_error(struct inode *, __u64); ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *, size_t); +int nilfs_sufile_update(struct inode *, __u64, int, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)); +void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); + +/** + * nilfs_sufile_cancel_free - + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. 
+ */ +static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, + nilfs_sufile_do_cancel_free); +} + +/** + * nilfs_sufile_free - free segment + * @sufile: inode of segment usage file + * @segnum: segment number to be freed + */ +static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free); +} + +/** + * nilfs_sufile_set_error - mark a segment as erroneous + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: nilfs_sufile_set_error() marks the segment specified by + * @segnum as erroneous. The error segment will never be used again. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid segment usage number. + */ +static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, + nilfs_sufile_do_set_error); +} #endif /* _NILFS_SUFILE_H */ -- cgit v1.2.3 From c85399c2da8b86de8f6877980294fa1a4a88a5a4 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 5 Apr 2009 18:30:58 +0900 Subject: nilfs2: fix possible mismatch of sufile counters on recovery On-disk counters ndirtysegs and ncleansegs of sufile, can go wrong after roll-forward recovery because nilfs_prepare_segment_for_recovery() function marks segments dirty without adjusting value of these counters. This fixes the problem by adding a function to sufile which does the operation adjusting the counters, and by letting the recovery function use it. 
Signed-off-by: Ryusuke Konishi --- fs/nilfs2/recovery.c | 20 ++++---------------- fs/nilfs2/sufile.c | 29 +++++++++++++++++++++++++++++ fs/nilfs2/sufile.h | 12 ++++++++++++ 3 files changed, 45 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 6ade0963fc1..4fc081e47d7 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -413,7 +413,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, struct nilfs_segment_entry *ent, *n; struct inode *sufile = nilfs->ns_sufile; __u64 segnum[4]; - time_t mtime; int err; int i; @@ -442,24 +441,13 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, * Collecting segments written after the latest super root. * These are marked dirty to avoid being reallocated in the next write. */ - mtime = get_seconds(); list_for_each_entry_safe(ent, n, head, list) { - if (ent->segnum == segnum[0]) { - list_del(&ent->list); - nilfs_free_segment_entry(ent); - continue; - } - err = nilfs_open_segment_entry(ent, sufile); - if (unlikely(err)) - goto failed; - if (!nilfs_segment_usage_dirty(ent->raw_su)) { - /* make the segment garbage */ - ent->raw_su->su_nblocks = cpu_to_le32(0); - ent->raw_su->su_lastmod = cpu_to_le32(mtime); - nilfs_segment_usage_set_dirty(ent->raw_su); + if (ent->segnum != segnum[0]) { + err = nilfs_sufile_scrap(sufile, ent->segnum); + if (unlikely(err)) + goto failed; } list_del(&ent->list); - nilfs_close_segment_entry(ent, sufile); nilfs_free_segment_entry(ent); } diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 07013f58dfe..98e68677f04 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -258,6 +258,35 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, nilfs_mdt_mark_dirty(sufile); } +void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int clean, dirty; + + 
kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) && + su->su_nblocks == cpu_to_le32(0)) { + kunmap_atomic(kaddr, KM_USER0); + return; + } + clean = nilfs_segment_usage_clean(su); + dirty = nilfs_segment_usage_dirty(su); + + /* make the segment garbage */ + su->su_lastmod = cpu_to_le64(0); + su->su_nblocks = cpu_to_le32(0); + su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 449a6e2671b..a2e2efd4ade 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -52,6 +52,8 @@ int nilfs_sufile_update(struct inode *, __u64, int, struct buffer_head *)); void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, struct buffer_head *); +void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *, struct buffer_head *); void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, @@ -77,6 +79,16 @@ static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum) nilfs_sufile_do_cancel_free); } +/** + * nilfs_sufile_scrap - make a segment garbage + * @sufile: inode of segment usage file + * @segnum: segment number to be made garbage + */ +static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap); +} + /** * nilfs_sufile_free - free segment * @sufile: inode of segment usage file -- cgit v1.2.3 From
eb2e5f452a1456c7a20f7566a24d5a8f6ef3edd5 Mon Sep 17 00:00:00 2001 From: Dave Anderson Date: Mon, 13 Apr 2009 14:39:46 -0700 Subject: hfs: fix memory leak when unmounting When an HFS filesystem is unmounted, it leaks a 2-page bitmap. Also, under extreme memory pressure, it's possible that hfs_releasepage() may use a tree pointer that has not been initialized, and if so, the release request should just be rejected. [akpm@linux-foundation.org: free_pages(0) is legal, remove obvious comment] Signed-off-by: Dave Anderson Tested-by: Eugene Teo Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/inode.c | 4 ++++ fs/hfs/mdb.c | 1 + 2 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 9435dda8f1e..a1cbff2b4d9 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -70,6 +70,10 @@ static int hfs_releasepage(struct page *page, gfp_t mask) BUG(); return 0; } + + if (!tree) + return 0; + if (tree->node_size >= PAGE_CACHE_SIZE) { nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); spin_lock(&tree->hash_lock); diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 36ca2e1a4fa..7b6165f25fb 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -349,6 +349,7 @@ void hfs_mdb_put(struct super_block *sb) if (HFS_SB(sb)->nls_disk) unload_nls(HFS_SB(sb)->nls_disk); + free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0); kfree(HFS_SB(sb)); sb->s_fs_info = NULL; } -- cgit v1.2.3 From 32433879480d13bc019d5a067ce884064a93dd63 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 13 Apr 2009 14:40:06 -0700 Subject: jbd: update locking comments Update information about locking in JBD revoke code. Reported-by: Lin Tan .
Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/revoke.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index c7bd649bbbd..3e9afc2a91d 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -55,6 +55,25 @@ * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. + * + * Finally, also replay code uses the hash tables but at this moment no one else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. */ #ifndef __KERNEL__ @@ -402,8 +421,6 @@ int journal_revoke(handle_t *handle, unsigned long blocknr, * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. - * - * The caller must have the journal locked.
*/ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { @@ -481,10 +498,7 @@ void journal_switch_revoke_table(journal_t *journal) /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. - * - * Called with the journal lock held. */ - void journal_write_revoke_records(journal_t *journal, transaction_t *transaction) { -- cgit v1.2.3 From 316cb4ef3eb2ad6e35e15cc56d39c6cda58c093a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 13 Apr 2009 14:40:14 -0700 Subject: ext2: fix data corruption for racing writes If two writers allocating blocks to file race with each other (e.g. because writepages races with ordinary write or two writepages race with each other), ext2_getblock() can be called on the same inode in parallel. Before we are going to allocate new blocks, we have to recheck the block chain we have obtained so far without holding truncate_mutex. Otherwise we could overwrite the indirect block pointer set by the other writer leading to data loss. The below test program by Ying is able to reproduce the data loss with ext2 on in BRD in a few minutes if the machine is under memory pressure: long kMemSize = 50 << 20; int kPageSize = 4096; int main(int argc, char **argv) { int status; int count = 0; int i; char *fname = "/mnt/test.mmap"; char *mem; unlink(fname); int fd = open(fname, O_CREAT | O_EXCL | O_RDWR, 0600); status = ftruncate(fd, kMemSize); mem = mmap(0, kMemSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); // Fill the memory with 1s. 
memset(mem, 1, kMemSize); sleep(2); for (i = 0; i < kMemSize; i++) { int byte_good = mem[i] != 0; if (!byte_good && ((i % kPageSize) == 0)) { //printf("%d ", i / kPageSize); count++; } } munmap(mem, kMemSize); close(fd); unlink(fname); if (count > 0) { printf("Running %d bad page\n", count); return 1; } return 0; } Cc: Ying Han Cc: Nick Piggin Signed-off-by: Jan Kara Cc: Mingming Cao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/inode.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index b43b9556366..acf67883110 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -590,9 +590,8 @@ static int ext2_get_blocks(struct inode *inode, if (depth == 0) return (err); -reread: - partial = ext2_get_branch(inode, depth, offsets, chain, &err); + partial = ext2_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); @@ -602,15 +601,16 @@ reread: while (count < maxblocks && count <= blocks_to_boundary) { ext2_fsblk_t blk; - if (!verify_chain(chain, partial)) { + if (!verify_chain(chain, chain + depth - 1)) { /* * Indirect block might be removed by * truncate while we were reading it. * Handling of that case: forget what we've * got now, go to reread. 
*/ + err = -EAGAIN; count = 0; - goto changed; + break; } blk = le32_to_cpu(*(chain[depth-1].p + count)); if (blk == first_block + count) @@ -618,7 +618,8 @@ reread: else break; } - goto got_it; + if (err != -EAGAIN) + goto got_it; } /* Next simple case - plain lookup or failed read of indirect block */ @@ -626,6 +627,33 @@ reread: goto cleanup; mutex_lock(&ei->truncate_mutex); + /* + * If the indirect block is missing while we are reading + * the chain (ext2_get_branch() returns -EAGAIN err), or + * if the chain has been changed after we grab the semaphore, + * (either because another process truncated this branch, or + * another get_block allocated this branch) re-grab the chain to see if + * the request block has been allocated or not. + * + * Since we already block the truncate/other get_block + * at this point, we will have the current copy of the chain when we + * splice the branch into the tree. + */ + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); + partial--; + } + partial = ext2_get_branch(inode, depth, offsets, chain, &err); + if (!partial) { + count++; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + clear_buffer_new(bh_result); + goto got_it; + } + } /* * Okay, we need to do block allocation. Lazily initialize the block @@ -683,12 +711,6 @@ cleanup: partial--; } return err; -changed: - while (partial > chain) { - brelse(partial->bh); - partial--; - } - goto reread; } int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) -- cgit v1.2.3 From 053c525fcf976810f023d96472f414c0d5e6339b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Apr 2009 13:44:08 +0200 Subject: buffer: switch do_emergency_thaw() away from pdflush_operation() This is (again) a preparatory patch similar to commit a2a9537ac0b37a5da6fbe7e1e9cb06c524d2a9c4. It open codes a simple async way of executing do_thaw_all() out of context, so we can get rid of pdflush.
Signed-off-by: Jens Axboe --- fs/buffer.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 13edf7ad3ff..ff8bb1f2333 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -547,7 +547,7 @@ repeat: return err; } -void do_thaw_all(unsigned long unused) +void do_thaw_all(struct work_struct *work) { struct super_block *sb; char b[BDEVNAME_SIZE]; @@ -567,6 +567,7 @@ restart: goto restart; } spin_unlock(&sb_lock); + kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } @@ -577,7 +578,13 @@ restart: */ void emergency_thaw_all(void) { - pdflush_operation(do_thaw_all, 0); + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_thaw_all); + schedule_work(work); + } } /** -- cgit v1.2.3 From 7fa5d20d1a5e60ef7e453993b67b26c87dc09f07 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 31 Mar 2009 15:49:08 +0100 Subject: GFS2: Make quotad's waiting interruptible So we don't count its D state in the loadavg. 
Reported-by: Nathan Straz Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 8d53f66b5bc..47bc5cbba48 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1364,7 +1364,7 @@ int gfs2_quotad(void *data) refrigerator(); t = min(quotad_timeo, statfs_timeo); - prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE); spin_lock(&sdp->sd_trunc_lock); empty = list_empty(&sdp->sd_trunc_list); spin_unlock(&sdp->sd_trunc_lock); -- cgit v1.2.3 From 5cf32524de745c56e1411d63eccf23fef1709d73 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 31 Mar 2009 16:06:27 +0100 Subject: GFS2: Fix symlink creation race In certain cases symlinks can appear to have zero size if a lookup on the inode occurs within a certain (very short) time after the symlink has been created. The symlink is correctly created on disk but appears to have zero size when stat()ed. This patch closes the race and prevents incorrect sizes appearing. Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index abd5429ae28..1c70fa5168d 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -371,6 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, ip = ghs[1].gh_gl->gl_object; ip->i_disksize = size; + i_size_write(inode, size); error = gfs2_meta_inode_buffer(ip, &dibh); -- cgit v1.2.3 From a228df6339e0d385b8149c860d81b6007f5e9c81 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 7 Apr 2009 14:01:34 +0100 Subject: GFS2: Move umount flush rwsem The rwsem, used only on umount, is in the wrong place in glock.c. This patch moves it up a bit so that it does not get called under a spinlock. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 3984e47d1d3..1afd9f26bcb 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -597,7 +597,6 @@ __acquires(&gl->gl_spin) GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); - down_read(&gfs2_umount_flush_sem); if (test_bit(GLF_DEMOTE, &gl->gl_flags) && gl->gl_demote_state != gl->gl_state) { if (find_first_holder(gl)) @@ -614,15 +613,14 @@ __acquires(&gl->gl_spin) if (ret == 0) goto out_unlock; if (ret == 2) - goto out_sem; + goto out; gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ } do_xmote(gl, gh, gl->gl_target); -out_sem: - up_read(&gfs2_umount_flush_sem); +out: return; out_sched: @@ -631,7 +629,7 @@ out_sched: gfs2_glock_put(gl); out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); - goto out_sem; + goto out; } static void glock_work_func(struct work_struct *work) @@ -641,6 +639,7 @@ static void glock_work_func(struct work_struct *work) if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) finish_xmote(gl, gl->gl_reply); + down_read(&gfs2_umount_flush_sem); spin_lock(&gl->gl_spin); if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && @@ -653,6 +652,7 @@ static void glock_work_func(struct work_struct *work) } run_queue(gl, 0); spin_unlock(&gl->gl_spin); + up_read(&gfs2_umount_flush_sem); if (!delay || queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); -- cgit v1.2.3 From 10d2198805d7faa2b193485446ff6b1de42c9b78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Apr 2009 19:42:17 +0200 Subject: GFS2: cleanup file_operations mess Remove the weird pointer to file_operations mess and replace it with straight-forward defining of the lockinginstance names to the _nolock 
variants. Signed-off-by: Christoph Hellwig Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 8 ++++---- fs/gfs2/inode.h | 14 ++++++++------ fs/gfs2/ops_file.c | 8 ++++---- 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 7b277d44915..5a31d426116 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -137,15 +137,15 @@ void gfs2_set_iop(struct inode *inode) if (S_ISREG(mode)) { inode->i_op = &gfs2_file_iops; if (gfs2_localflocks(sdp)) - inode->i_fop = gfs2_file_fops_nolock; + inode->i_fop = &gfs2_file_fops_nolock; else - inode->i_fop = gfs2_file_fops; + inode->i_fop = &gfs2_file_fops; } else if (S_ISDIR(mode)) { inode->i_op = &gfs2_dir_iops; if (gfs2_localflocks(sdp)) - inode->i_fop = gfs2_dir_fops_nolock; + inode->i_fop = &gfs2_dir_fops_nolock; else - inode->i_fop = gfs2_dir_fops; + inode->i_fop = &gfs2_dir_fops; } else if (S_ISLNK(mode)) { inode->i_op = &gfs2_symlink_iops; } else { diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index dca4fee3078..c30be2b6658 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -101,21 +101,23 @@ void gfs2_dinode_print(const struct gfs2_inode *ip); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; extern const struct inode_operations gfs2_symlink_iops; -extern const struct file_operations *gfs2_file_fops_nolock; -extern const struct file_operations *gfs2_dir_fops_nolock; +extern const struct file_operations gfs2_file_fops_nolock; +extern const struct file_operations gfs2_dir_fops_nolock; extern void gfs2_set_inode_flags(struct inode *inode); #ifdef CONFIG_GFS2_FS_LOCKING_DLM -extern const struct file_operations *gfs2_file_fops; -extern const struct file_operations *gfs2_dir_fops; +extern const struct file_operations gfs2_file_fops; +extern const struct file_operations gfs2_dir_fops; + static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) { return sdp->sd_args.ar_localflocks; } #else /* Single 
node only */ -#define gfs2_file_fops NULL -#define gfs2_dir_fops NULL +#define gfs2_file_fops gfs2_file_fops_nolock +#define gfs2_dir_fops gfs2_dir_fops_nolock + static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) { return 1; diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 70b9b854894..101caf3ee86 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -705,7 +705,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) } } -const struct file_operations *gfs2_file_fops = &(const struct file_operations){ +const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, @@ -723,7 +723,7 @@ const struct file_operations *gfs2_file_fops = &(const struct file_operations){ .setlease = gfs2_setlease, }; -const struct file_operations *gfs2_dir_fops = &(const struct file_operations){ +const struct file_operations gfs2_dir_fops = { .readdir = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, @@ -735,7 +735,7 @@ const struct file_operations *gfs2_dir_fops = &(const struct file_operations){ #endif /* CONFIG_GFS2_FS_LOCKING_DLM */ -const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operations){ +const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, @@ -751,7 +751,7 @@ const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operat .setlease = generic_setlease, }; -const struct file_operations *gfs2_dir_fops_nolock = &(const struct file_operations){ +const struct file_operations gfs2_dir_fops_nolock = { .readdir = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, -- cgit v1.2.3 From 1328df725239804ae30fc7257c1a3185e679b517 Mon Sep 17 00:00:00 2001 From: Xu Gang Date: Tue, 14 Apr 2009 14:54:14 +0800 Subject: GFS2: Use DEFINE_SPINLOCK SPIN_LOCK_UNLOCKED is deprecated, use DEFINE_SPINLOCK instead. 
(as suggested in Documentation/spinlocks.txt) Signed-off-by: Xu Gang Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 47bc5cbba48..152e6c4a0dc 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -81,7 +81,7 @@ struct gfs2_quota_change_host { static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); -static spinlock_t qd_lru_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(qd_lru_lock); int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) { -- cgit v1.2.3 From b3c2d2ddd63944ef2a1e4a43077b602288107e01 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Apr 2009 19:48:36 +0200 Subject: splice: split up __splice_from_pipe() Split up __splice_from_pipe() into four helper functions: splice_from_pipe_begin() splice_from_pipe_next() splice_from_pipe_feed() splice_from_pipe_end() splice_from_pipe_next() will wait (if necessary) for more buffers to be added to the pipe. splice_from_pipe_feed() will feed the buffers to the supplied actor and return when there's no more data available (or if all of the requested data has been copied). This is necessary so that implementations can do locking around the non-waiting splice_from_pipe_feed(). This patch should not cause any change in behavior. 
Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 217 +++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 143 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index c18aa7e03e2..fd6b278d447 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -601,107 +601,176 @@ out: return ret; } +static void wakeup_pipe_writers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); +} + /** - * __splice_from_pipe - splice data from a pipe to given actor + * splice_from_pipe_feed - feed available data from a pipe to a file * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: - * This function does little more than loop over the pipe and call - * @actor to do the actual moving of a single struct pipe_buffer to - * the desired destination. See pipe_to_file, pipe_to_sendpage, or - * pipe_to_user. + + * This function loops over the pipe and calls @actor to do the + * actual moving of a single struct pipe_buffer to the desired + * destination. It returns when there's no more buffers left in + * the pipe or if the requested number of bytes (@sd->total_len) + * have been copied. It returns a positive number (one) if the + * pipe needs to be filled with more data, zero if the required + * number of bytes have been copied and -errno on error. * + * This, together with splice_from_pipe_{begin,end,next}, may be + * used to implement the functionality of __splice_from_pipe() when + * locking is required around copying the pipe buffers to the + * destination. 
*/ -ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, - splice_actor *actor) +int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) { - int ret, do_wakeup, err; - - ret = 0; - do_wakeup = 0; - - for (;;) { - if (pipe->nrbufs) { - struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; - const struct pipe_buf_operations *ops = buf->ops; + int ret; - sd->len = buf->len; - if (sd->len > sd->total_len) - sd->len = sd->total_len; + while (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + const struct pipe_buf_operations *ops = buf->ops; - err = actor(pipe, buf, sd); - if (err <= 0) { - if (!ret && err != -ENODATA) - ret = err; + sd->len = buf->len; + if (sd->len > sd->total_len) + sd->len = sd->total_len; - break; - } + ret = actor(pipe, buf, sd); + if (ret <= 0) { + if (ret == -ENODATA) + ret = 0; + return ret; + } + buf->offset += ret; + buf->len -= ret; - ret += err; - buf->offset += err; - buf->len -= err; + sd->num_spliced += ret; + sd->len -= ret; + sd->pos += ret; + sd->total_len -= ret; - sd->len -= err; - sd->pos += err; - sd->total_len -= err; - if (sd->len) - continue; + if (!buf->len) { + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->nrbufs--; + if (pipe->inode) + sd->need_wakeup = true; + } - if (!buf->len) { - buf->ops = NULL; - ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); - pipe->nrbufs--; - if (pipe->inode) - do_wakeup = 1; - } + if (!sd->total_len) + return 0; + } - if (!sd->total_len) - break; - } + return 1; +} +EXPORT_SYMBOL(splice_from_pipe_feed); - if (pipe->nrbufs) - continue; +/** + * splice_from_pipe_next - wait for some data to splice from + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wait for some data and return a positive + * value (one) if pipe buffers are available. 
It will return zero + * or -errno if no more data needs to be spliced. + */ +int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + while (!pipe->nrbufs) { if (!pipe->writers) - break; - if (!pipe->waiting_writers) { - if (ret) - break; - } + return 0; - if (sd->flags & SPLICE_F_NONBLOCK) { - if (!ret) - ret = -EAGAIN; - break; - } + if (!pipe->waiting_writers && sd->num_spliced) + return 0; - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } + if (sd->flags & SPLICE_F_NONBLOCK) + return -EAGAIN; - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - do_wakeup = 0; + if (signal_pending(current)) + return -ERESTARTSYS; + + if (sd->need_wakeup) { + wakeup_pipe_writers(pipe); + sd->need_wakeup = false; } pipe_wait(pipe); } - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - } + return 1; +} +EXPORT_SYMBOL(splice_from_pipe_next); - return ret; +/** + * splice_from_pipe_begin - start splicing from pipe + * @sd: information about the splice operation + * + * Description: + * This function should be called before a loop containing + * splice_from_pipe_next() and splice_from_pipe_feed() to + * initialize the necessary fields of @sd. + */ +void splice_from_pipe_begin(struct splice_desc *sd) +{ + sd->num_spliced = 0; + sd->need_wakeup = false; +} +EXPORT_SYMBOL(splice_from_pipe_begin); + +/** + * splice_from_pipe_end - finish splicing from pipe + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wake up pipe writers if necessary. It should + * be called after a loop containing splice_from_pipe_next() and + * splice_from_pipe_feed().
+ */ +void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + if (sd->need_wakeup) + wakeup_pipe_writers(pipe); +} +EXPORT_SYMBOL(splice_from_pipe_end); + +/** + * __splice_from_pipe - splice data from a pipe to given actor + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function does little more than loop over the pipe and call + * @actor to do the actual moving of a single struct pipe_buffer to + * the desired destination. See pipe_to_file, pipe_to_sendpage, or + * pipe_to_user. + * + */ +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) +{ + int ret; + + splice_from_pipe_begin(sd); + do { + ret = splice_from_pipe_next(pipe, sd); + if (ret > 0) + ret = splice_from_pipe_feed(pipe, sd, actor); + } while (ret > 0); + splice_from_pipe_end(pipe, sd); + + return sd->num_spliced ? sd->num_spliced : ret; } EXPORT_SYMBOL(__splice_from_pipe); -- cgit v1.2.3 From 2933970b960223076d6affcf7a77e2bc546b8102 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Apr 2009 19:48:37 +0200 Subject: splice: remove i_mutex locking in splice_from_pipe() splice_from_pipe() is only called from two places: - generic_splice_sendpage() - splice_write_null() Neither of these require i_mutex to be taken on the destination inode. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index fd6b278d447..349576b2c75 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -784,7 +784,7 @@ EXPORT_SYMBOL(__splice_from_pipe); * @actor: handler that splices the data * * Description: - * See __splice_from_pipe. This function locks the input and output inodes, + * See __splice_from_pipe. This function locks the pipe inode, * otherwise it's identical to __splice_from_pipe(). 
* */ @@ -793,7 +793,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, splice_actor *actor) { ssize_t ret; - struct inode *inode = out->f_mapping->host; struct splice_desc sd = { .total_len = len, .flags = flags, @@ -801,24 +800,11 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; - /* - * The actor worker might be calling ->write_begin and - * ->write_end. Most of the time, these expect i_mutex to - * be held. Since this may result in an ABBA deadlock with - * pipe->inode, we have to order lock acquiry here. - * - * Outer lock must be inode->i_mutex, as pipe_wait() will - * release and reacquire pipe->inode->i_mutex, AND inode must - * never be a pipe. - */ - WARN_ON(S_ISFIFO(inode->i_mode)); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); + mutex_lock(&pipe->inode->i_mutex); ret = __splice_from_pipe(pipe, &sd, actor); if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); - mutex_unlock(&inode->i_mutex); return ret; } -- cgit v1.2.3 From eb443e5a25d43996deb62b9bcee1a4ce5dea2ead Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Apr 2009 19:48:38 +0200 Subject: splice: fix i_mutex locking in generic_splice_write() Rearrange locking of i_mutex on destination so it's only held while buffers are copied with the pipe_to_file() actor, and not while waiting for more data on the pipe. 
Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 349576b2c75..a1f595b9db4 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -895,17 +895,29 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; - WARN_ON(S_ISFIFO(inode->i_mode)); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - ret = file_remove_suid(out); - if (likely(!ret)) { - if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); - ret = __splice_from_pipe(pipe, &sd, pipe_to_file); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); - } - mutex_unlock(&inode->i_mutex); + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); + + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = file_remove_suid(out); + if (!ret) + ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); + + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + + if (sd.num_spliced) + ret = sd.num_spliced; + if (ret > 0) { unsigned long nr_pages; -- cgit v1.2.3 From 328eaaba4e41a04c1dc4679d65bea3fee4349d86 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Apr 2009 19:48:39 +0200 Subject: ocfs2: fix i_mutex locking in ocfs2_splice_to_file() Rearrange locking of i_mutex on destination and call to ocfs2_rw_lock() so locks are only held while buffers are copied with the pipe_to_file() actor, and not while waiting for more data on the pipe. 
Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/ocfs2/file.c | 94 +++++++++++++++++++++++++++++++++++++++++++++------------ fs/splice.c | 5 +-- 2 files changed, 77 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8672b953603..c2a87c885b7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1912,6 +1912,22 @@ out_sems: return written ? written : ret; } +static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, + struct file *out, + struct splice_desc *sd) +{ + int ret; + + ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, + sd->total_len, 0, NULL); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return splice_from_pipe_feed(pipe, sd, pipe_to_file); +} + static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, @@ -1919,38 +1935,76 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, unsigned int flags) { int ret; - struct inode *inode = out->f_path.dentry->d_inode; + struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, (unsigned int)len, out->f_path.dentry->d_name.len, out->f_path.dentry->d_name.name); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); - ret = ocfs2_rw_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto out; - } + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, - NULL); - if (ret < 0) { - mlog_errno(ret); - goto out_unlock; - } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) + mlog_errno(ret); + else { + ret = ocfs2_splice_to_file(pipe, 
out, &sd); + ocfs2_rw_unlock(inode, 1); + } + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); - if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); - ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); -out_unlock: - ocfs2_rw_unlock(inode, 1); -out: - mutex_unlock(&inode->i_mutex); + if (sd.num_spliced) + ret = sd.num_spliced; + + if (ret > 0) { + unsigned long nr_pages; + + *ppos += ret; + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + mutex_lock(&inode->i_mutex); + err = ocfs2_rw_lock(inode, 1); + if (err < 0) { + mlog_errno(err); + } else { + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + ocfs2_rw_unlock(inode, 1); + } + mutex_unlock(&inode->i_mutex); + + if (err) + ret = err; + } + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + } mlog_exit(ret); return ret; diff --git a/fs/splice.c b/fs/splice.c index a1f595b9db4..584b2b7a1db 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -555,8 +555,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create * a new page in the output file page cache and fill/dirty that. 
*/ -static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, - struct splice_desc *sd) +int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) { struct file *file = sd->u.file; struct address_space *mapping = file->f_mapping; @@ -600,6 +600,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, out: return ret; } +EXPORT_SYMBOL(pipe_to_file); static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { -- cgit v1.2.3 From f8cc774ce4844811a55e2352f1443055e3994e28 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Apr 2009 19:48:40 +0200 Subject: splice: remove generic_file_splice_write_nolock() Remove the now unused generic_file_splice_write_nolock() function. It's conceptually broken anyway, because splice may need to wait for pipe events so holding locks across the whole operation is wrong. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 59 ----------------------------------------------------------- 1 file changed, 59 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 584b2b7a1db..128ee36a719 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -810,65 +810,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, return ret; } -/** - * generic_file_splice_write_nolock - generic_file_splice_write without mutexes - * @pipe: pipe info - * @out: file to write to - * @ppos: position in @out - * @len: number of bytes to splice - * @flags: splice modifier flags - * - * Description: - * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. The caller is responsible - * for acquiring i_mutex on both inodes. 
- * - */ -ssize_t -generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) -{ - struct address_space *mapping = out->f_mapping; - struct inode *inode = mapping->host; - struct splice_desc sd = { - .total_len = len, - .flags = flags, - .pos = *ppos, - .u.file = out, - }; - ssize_t ret; - int err; - - err = file_remove_suid(out); - if (unlikely(err)) - return err; - - ret = __splice_from_pipe(pipe, &sd, pipe_to_file); - if (ret > 0) { - unsigned long nr_pages; - - *ppos += ret; - nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - - if (err) - ret = err; - } - balance_dirty_pages_ratelimited_nr(mapping, nr_pages); - } - - return ret; -} - -EXPORT_SYMBOL(generic_file_splice_write_nolock); - /** * generic_file_splice_write - splice data from a pipe to a file * @pipe: pipe info -- cgit v1.2.3 From 61e0d47c33cc371f725bcda4a47ae0efe652dba8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Apr 2009 19:48:41 +0200 Subject: splice: add helpers for locking pipe inode There are lots of sequences like this, especially in splice code: if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); /* do something */ if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); so introduce helpers which do the conditional locking and unlocking. Also replace the inode_double_lock() call with a pipe_double_lock() helper to avoid spreading the use of this functionality beyond the pipe code. This patch is just a cleanup, and should cause no behavioral changes. 
Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/inode.c | 36 ------------------------------------ fs/pipe.c | 42 ++++++++++++++++++++++++++++++++++++++---- fs/splice.c | 50 +++++++++++++++++++++----------------------------- 3 files changed, 59 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index d06d6d268de..6ad14a1cd8c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1470,42 +1470,6 @@ static void __wait_on_freeing_inode(struct inode *inode) spin_lock(&inode_lock); } -/* - * We rarely want to lock two inodes that do not have a parent/child - * relationship (such as directory, child inode) simultaneously. The - * vast majority of file systems should be able to get along fine - * without this. Do not use these functions except as a last resort. - */ -void inode_double_lock(struct inode *inode1, struct inode *inode2) -{ - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { - if (inode1) - mutex_lock(&inode1->i_mutex); - else if (inode2) - mutex_lock(&inode2->i_mutex); - return; - } - - if (inode1 < inode2) { - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); - } -} -EXPORT_SYMBOL(inode_double_lock); - -void inode_double_unlock(struct inode *inode1, struct inode *inode2) -{ - if (inode1) - mutex_unlock(&inode1->i_mutex); - - if (inode2 && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); -} -EXPORT_SYMBOL(inode_double_unlock); - static __initdata unsigned long ihash_entries; static int __init set_ihash_entries(char *str) { diff --git a/fs/pipe.c b/fs/pipe.c index 4af7aa52181..13414ec45b8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -37,6 +37,42 @@ * -- Manfred Spraul 2002-05-09 */ +static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) +{ + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, subclass); +} + 
+void pipe_lock(struct pipe_inode_info *pipe) +{ + /* + * pipe_lock() nests non-pipe inode locks (for writing to a file) + */ + pipe_lock_nested(pipe, I_MUTEX_PARENT); +} +EXPORT_SYMBOL(pipe_lock); + +void pipe_unlock(struct pipe_inode_info *pipe) +{ + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); +} +EXPORT_SYMBOL(pipe_unlock); + +void pipe_double_lock(struct pipe_inode_info *pipe1, + struct pipe_inode_info *pipe2) +{ + BUG_ON(pipe1 == pipe2); + + if (pipe1 < pipe2) { + pipe_lock_nested(pipe1, I_MUTEX_PARENT); + pipe_lock_nested(pipe2, I_MUTEX_CHILD); + } else { + pipe_lock_nested(pipe2, I_MUTEX_CHILD); + pipe_lock_nested(pipe1, I_MUTEX_PARENT); + } +} + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe) { @@ -47,12 +83,10 @@ void pipe_wait(struct pipe_inode_info *pipe) * is considered a noninteractive wait: */ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); schedule(); finish_wait(&pipe->wait, &wait); - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); } static int diff --git a/fs/splice.c b/fs/splice.c index 128ee36a719..5384a90665d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -182,8 +182,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, do_wakeup = 0; page_nr = 0; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); for (;;) { if (!pipe->readers) { @@ -245,15 +244,13 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, pipe->waiting_writers--; } - if (pipe->inode) { - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } 
while (page_nr < spd_pages) @@ -801,11 +798,9 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, actor); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); return ret; } @@ -837,8 +832,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; - if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); + pipe_lock(pipe); splice_from_pipe_begin(&sd); do { @@ -854,8 +848,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, } while (ret > 0); splice_from_pipe_end(pipe, &sd); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); if (sd.num_spliced) ret = sd.num_spliced; @@ -1348,8 +1341,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, if (!pipe) return -EBADF; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); error = ret = 0; while (nr_segs) { @@ -1404,8 +1396,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, iov++; } - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); if (!ret) ret = error; @@ -1533,7 +1524,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return 0; ret = 0; - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); while (!pipe->nrbufs) { if (signal_pending(current)) { @@ -1551,7 +1542,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) pipe_wait(pipe); } - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); return ret; } @@ -1571,7 +1562,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return 0; ret = 0; - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); while (pipe->nrbufs >= PIPE_BUFFERS) { if (!pipe->readers) { @@ -1592,7 +1583,7 @@ static 
int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) pipe->waiting_writers--; } - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); return ret; } @@ -1608,10 +1599,10 @@ static int link_pipe(struct pipe_inode_info *ipipe, /* * Potential ABBA deadlock, work around it by ordering lock - * grabbing by inode address. Otherwise two different processes + * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ - inode_double_lock(ipipe->inode, opipe->inode); + pipe_double_lock(ipipe, opipe); do { if (!opipe->readers) { @@ -1662,7 +1653,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) ret = -EAGAIN; - inode_double_unlock(ipipe->inode, opipe->inode); + pipe_unlock(ipipe); + pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. -- cgit v1.2.3 From 86c824b9434e764d01489688e4e38aee43b93fcf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 15 Apr 2009 09:00:07 +0200 Subject: bio: add documentation to bio_alloc() Explain that with __GFP_WAIT set it will not fail, and that the caller must never allocate more than 1 bio at the time. Signed-off-by: Jens Axboe --- fs/bio.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index e0c9e545bbf..cd42bb882f3 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -348,6 +348,24 @@ err: return NULL; } +/** + * bio_alloc - allocate a bio for I/O + * @gfp_mask: the GFP_ mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * + * Description: + * bio_alloc will allocate a bio and associated bio_vec array that can hold + * at least @nr_iovecs entries. Allocations will be done from the + * fs_bio_set. Also see @bio_alloc_bioset. + * + * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate + * a bio. This is due to the mempool guarantees. 
To make this work, callers + * must never allocate more than 1 bio at a time from this pool. Callers + * that need to allocate more than 1 bio must always submit the previously + * allocated bio for IO before attempting to allocate a new one. Failure to + * do so can cause livelocks under memory pressure. + * + **/ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) { struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); -- cgit v1.2.3 From 4d1f9fdb6177a9bdecf26976337dd39abcc8edbc Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Wed, 15 Apr 2009 10:35:52 +0530 Subject: dio: Remove code handling bio_alloc failure with __GFP_WAIT Remove code handling bio_alloc failure with __GFP_WAIT. GFP_KERNEL implies __GFP_WAIT. Signed-off-by: Nikanth Karthikesan Signed-off-by: Jens Axboe --- fs/direct-io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index da258e7249c..05763bbc205 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -307,8 +307,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, struct bio *bio; bio = bio_alloc(GFP_KERNEL, nr_vecs); - if (bio == NULL) - return -ENOMEM; bio->bi_bdev = bdev; bio->bi_sector = first_sector; -- cgit v1.2.3 From 226e7dabf5534722944adefbad01970bd38bb7ae Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Wed, 15 Apr 2009 10:36:16 +0530 Subject: ext4: Remove code handling bio_alloc failure with __GFP_WAIT Remove code handling bio_alloc failure with __GFP_WAIT. GFP_NOIO implies __GFP_WAIT.
Signed-off-by: Nikanth Karthikesan Signed-off-by: Jens Axboe --- fs/ext4/extents.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6132353dcf6..2a1cb097976 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2416,8 +2416,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) len = ee_len; bio = bio_alloc(GFP_NOIO, len); - if (!bio) - return -ENOMEM; bio->bi_sector = ee_pblock; bio->bi_bdev = inode->i_sb->s_bdev; -- cgit v1.2.3 From b1fffc9ca6ca4c3feef3a7e8405252d528d312dc Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Wed, 15 Apr 2009 10:36:35 +0530 Subject: gfs2: Remove code handling bio_alloc failure with __GFP_WAIT Remove code handling bio_alloc failure with __GFP_WAIT. GFP_NOFS implies __GFP_WAIT. Signed-off-by: Nikanth Karthikesan Acked-by: Steven Whitehouse Signed-off-by: Jens Axboe --- fs/gfs2/ops_fstype.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 51883b3ad89..650a730707b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -272,11 +272,6 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) lock_page(page); bio = bio_alloc(GFP_NOFS, 1); - if (unlikely(!bio)) { - __free_page(page); - return -ENOBUFS; - } - bio->bi_sector = sector * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio_add_page(bio, page, PAGE_SIZE, 0); -- cgit v1.2.3 From 35c80d5f400f68f2eccf3069d1c068e154bde9c9 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 15 Apr 2009 13:22:38 -0400 Subject: Add block_write_full_page_endio for passing endio handler block_write_full_page doesn't allow the caller to control what happens when the IO is over. This adds a new call named block_write_full_page_endio so the buffer head end_io handler can be provided by the caller. This will be used by the ext3 data=guarded mode to do i_size updates in a workqueue based end_io handler. 
end_buffer_async_write is also exported so it can be called to do the dirty work of managing page writeback for the higher level end_io handler. Signed-off-by: Chris Mason Acked-by: Theodore Tso Acked-by: Jan Kara Signed-off-by: Linus Torvalds --- fs/buffer.c | 45 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index ff8bb1f2333..b3e5be7514f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -360,7 +360,7 @@ still_busy: * Completion handler for block_write_full_page() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. */ -static void end_buffer_async_write(struct buffer_head *bh, int uptodate) +void end_buffer_async_write(struct buffer_head *bh, int uptodate) { char b[BDEVNAME_SIZE]; unsigned long flags; @@ -438,11 +438,17 @@ static void mark_buffer_async_read(struct buffer_head *bh) set_buffer_async_read(bh); } -void mark_buffer_async_write(struct buffer_head *bh) +void mark_buffer_async_write_endio(struct buffer_head *bh, + bh_end_io_t *handler) { - bh->b_end_io = end_buffer_async_write; + bh->b_end_io = handler; set_buffer_async_write(bh); } + +void mark_buffer_async_write(struct buffer_head *bh) +{ + mark_buffer_async_write_endio(bh, end_buffer_async_write); +} EXPORT_SYMBOL(mark_buffer_async_write); @@ -1615,7 +1621,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata); * unplugging the device queue. 
*/ static int __block_write_full_page(struct inode *inode, struct page *page, - get_block_t *get_block, struct writeback_control *wbc) + get_block_t *get_block, struct writeback_control *wbc, + bh_end_io_t *handler) { int err; sector_t block; @@ -1700,7 +1707,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, continue; } if (test_clear_buffer_dirty(bh)) { - mark_buffer_async_write(bh); + mark_buffer_async_write_endio(bh, handler); } else { unlock_buffer(bh); } @@ -1753,7 +1760,7 @@ recover: if (buffer_mapped(bh) && buffer_dirty(bh) && !buffer_delay(bh)) { lock_buffer(bh); - mark_buffer_async_write(bh); + mark_buffer_async_write_endio(bh, handler); } else { /* * The buffer may have been set dirty during @@ -2679,7 +2686,8 @@ int nobh_writepage(struct page *page, get_block_t *get_block, out: ret = mpage_writepage(page, get_block, wbc); if (ret == -EAGAIN) - ret = __block_write_full_page(inode, page, get_block, wbc); + ret = __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); return ret; } EXPORT_SYMBOL(nobh_writepage); @@ -2837,9 +2845,10 @@ out: /* * The generic ->writepage function for buffer-backed address_spaces + * this form passes in the end_io handler used to finish the IO. */ -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) +int block_write_full_page_endio(struct page *page, get_block_t *get_block, + struct writeback_control *wbc, bh_end_io_t *handler) { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); @@ -2848,7 +2857,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block, /* Is the page fully inside i_size? */ if (page->index < end_index) - return __block_write_full_page(inode, page, get_block, wbc); + return __block_write_full_page(inode, page, get_block, wbc, + handler); /* Is the page fully outside i_size? 
(truncate in progress) */ offset = i_size & (PAGE_CACHE_SIZE-1); @@ -2871,9 +2881,20 @@ int block_write_full_page(struct page *page, get_block_t *get_block, * writes to that region are not written out to the file." */ zero_user_segment(page, offset, PAGE_CACHE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc); + return __block_write_full_page(inode, page, get_block, wbc, handler); } +/* + * The generic ->writepage function for buffer-backed address_spaces + */ +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + return block_write_full_page_endio(page, get_block, wbc, + end_buffer_async_write); +} + + sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) { @@ -3342,9 +3363,11 @@ EXPORT_SYMBOL(block_read_full_page); EXPORT_SYMBOL(block_sync_page); EXPORT_SYMBOL(block_truncate_page); EXPORT_SYMBOL(block_write_full_page); +EXPORT_SYMBOL(block_write_full_page_endio); EXPORT_SYMBOL(cont_write_begin); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); +EXPORT_SYMBOL(end_buffer_async_write); EXPORT_SYMBOL(file_fsync); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_cont_expand_simple); -- cgit v1.2.3 From d110271e1f4140a9fb06d968b1afe9ca56a6064e Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Wed, 25 Mar 2009 15:11:36 -0600 Subject: sysfs: don't use global workqueue in sysfs_schedule_callback() A sysfs attribute using sysfs_schedule_callback() to commit suicide may end up calling device_unregister(), which will eventually call a driver's ->remove function. 
Drivers may call flush_scheduled_work() in their shutdown routines, in which case lockdep will complain with something like the following: ============================================= [ INFO: possible recursive locking detected ] 2.6.29-rc8-kk #1 --------------------------------------------- events/4/56 is trying to acquire lock: (events){--..}, at: [] flush_workqueue+0x0/0xa0 but task is already holding lock: (events){--..}, at: [] run_workqueue+0x108/0x230 other info that might help us debug this: 3 locks held by events/4/56: #0: (events){--..}, at: [] run_workqueue+0x108/0x230 #1: (&ss->work){--..}, at: [] run_workqueue+0x108/0x230 #2: (pci_remove_rescan_mutex){--..}, at: [] remove_callback+0x21/0x40 stack backtrace: Pid: 56, comm: events/4 Not tainted 2.6.29-rc8-kk #1 Call Trace: [] validate_chain+0xb7d/0x1260 [] __lock_acquire+0x42e/0xa40 [] lock_acquire+0x58/0x80 [] ? flush_workqueue+0x0/0xa0 [] flush_workqueue+0x4d/0xa0 [] ? flush_workqueue+0x0/0xa0 [] flush_scheduled_work+0x10/0x20 [] e1000_remove+0x55/0xfe [e1000e] [] ? sysfs_schedule_callback_work+0x0/0x50 [] pci_device_remove+0x32/0x70 [] __device_release_driver+0x59/0x90 [] device_release_driver+0x2b/0x40 [] bus_remove_device+0xa6/0x120 [] device_del+0x12b/0x190 [] device_unregister+0x26/0x70 [] pci_stop_dev+0x49/0x60 [] pci_remove_bus_device+0x40/0xc0 [] remove_callback+0x29/0x40 [] sysfs_schedule_callback_work+0x1f/0x50 [] run_workqueue+0x15a/0x230 [] ? run_workqueue+0x108/0x230 [] worker_thread+0x9f/0x100 [] ? autoremove_wake_function+0x0/0x40 [] ? worker_thread+0x0/0x100 [] kthread+0x4d/0x80 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? kthread+0x0/0x80 [] ? child_rip+0x0/0x20 Although we know that the device_unregister path will never acquire a lock that a driver might try to acquire in its ->remove, in general we should never attempt to flush a workqueue from within the same workqueue, and lockdep rightly complains. 
So as long as sysfs attributes cannot commit suicide directly and we are stuck with this callback mechanism, put the sysfs callbacks on their own workqueue instead of the global one. This has the side benefit that if a suicidal sysfs attribute kicks off a long chain of ->remove callbacks, we no longer induce a long delay on the global queue. This also fixes a missing module_put in the error path introduced by sysfs-only-allow-one-scheduled-removal-callback-per-kobj.patch. We never destroy the workqueue, but I'm not sure that's a problem. Reported-by: Kenji Kaneshige Tested-by: Kenji Kaneshige Signed-off-by: Alex Chiang Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 289c43a4726..979e9379fb5 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -667,6 +667,7 @@ struct sysfs_schedule_callback_struct { struct work_struct work; }; +static struct workqueue_struct *sysfs_workqueue; static DEFINE_MUTEX(sysfs_workq_mutex); static LIST_HEAD(sysfs_workq); static void sysfs_schedule_callback_work(struct work_struct *work) @@ -715,11 +716,20 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), mutex_lock(&sysfs_workq_mutex); list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) if (ss->kobj == kobj) { + module_put(owner); mutex_unlock(&sysfs_workq_mutex); return -EAGAIN; } mutex_unlock(&sysfs_workq_mutex); + if (sysfs_workqueue == NULL) { + sysfs_workqueue = create_workqueue("sysfsd"); + if (sysfs_workqueue == NULL) { + module_put(owner); + return -ENOMEM; + } + } + ss = kmalloc(sizeof(*ss), GFP_KERNEL); if (!ss) { module_put(owner); @@ -735,7 +745,7 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), mutex_lock(&sysfs_workq_mutex); list_add_tail(&ss->workq_list, &sysfs_workq); mutex_unlock(&sysfs_workq_mutex); - schedule_work(&ss->work); + queue_work(sysfs_workqueue, &ss->work); return 
0; } EXPORT_SYMBOL_GPL(sysfs_schedule_callback); -- cgit v1.2.3 From 1af3557abdef34ee036a6de4cb79e24468544b8d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 9 Apr 2009 13:53:22 +0900 Subject: sysfs: sysfs poll keep the poll rule of regular file. Currently, the following test program never finishes. % ruby -e ' Thread.new { sleep } File.read("/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies") ' strace exposes the reason. ... open("/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies", O_RDONLY|O_LARGEFILE) = 3 ioctl(3, SNDCTL_TMR_TIMEBASE or TCGETS, 0xbf9fa6b8) = -1 ENOTTY (Inappropriate ioctl for device) fstat64(3, {st_mode=S_IFREG|0444, st_size=4096, ...}) = 0 _llseek(3, 0, [0], SEEK_CUR) = 0 select(4, [3], NULL, NULL, NULL) = 1 (in [3]) read(3, "1400000 1300000 1200000 1100000 1"..., 4096) = 62 select(4, [3], NULL, NULL, NULL This happens because the Ruby (the scripting language) VM assumes that a select system call on a regular file does not block, since SUSv3 says "Regular files shall always poll TRUE for reading and writing". See http://www.opengroup.org/onlinepubs/009695399/functions/poll.html This seems a valid assumption, but sysfs_poll() does not keep this rule even though a sysfs file can always be read and written. This patch restores proper poll behavior to sysfs. Applications that poll /sys/block/md*/md/sync_action, and other applications sensitive to sysfs updates, can still use POLLERR and POLLPRI.
Cc: Neil Brown Signed-off-by: KOSAKI Motohiro Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 979e9379fb5..b1606e07b7a 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -446,11 +446,11 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) if (buffer->event != atomic_read(&od->event)) goto trigger; - return 0; + return DEFAULT_POLLMASK; trigger: buffer->needs_read_fill = 1; - return POLLERR|POLLPRI; + return DEFAULT_POLLMASK|POLLERR|POLLPRI; } void sysfs_notify_dirent(struct sysfs_dirent *sd) -- cgit v1.2.3 From 31b07093c44a7a442394d44423e21d783f5523b8 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 9 Apr 2009 13:57:59 +0900 Subject: proc: mounts_poll() make consistent to mdstat_poll In the recent sysfs_poll discussion, Neil Brown pointed out that /proc/mounts should also be fixed. SUSv3 says "Regular files shall always poll TRUE for reading and writing". See http://www.opengroup.org/onlinepubs/009695399/functions/poll.html Therefore mounts_poll()'s default should be "POLLIN | POLLRDNORM", which means always readable. In addition, the event trigger should use "POLLERR | POLLPRI" instead of POLLERR alone. This makes it consistent with mdstat_poll() and sysfs_poll(), and select(2) can handle POLLPRI easily.
Reported-by: Neil Brown Signed-off-by: KOSAKI Motohiro Cc: Ram Pai Cc: Miklos Szeredi Cc: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index f71559784bf..aa763ab0077 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -648,14 +648,14 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) { struct proc_mounts *p = file->private_data; struct mnt_namespace *ns = p->ns; - unsigned res = 0; + unsigned res = POLLIN | POLLRDNORM; poll_wait(file, &ns->poll, wait); spin_lock(&vfsmount_lock); if (p->event != ns->event) { p->event = ns->event; - res = POLLERR; + res |= POLLERR | POLLPRI; } spin_unlock(&vfsmount_lock); -- cgit v1.2.3 From 0f4d634c59a4e062bef81c00d9e63333f2a83b46 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 26 Mar 2009 13:35:37 -0400 Subject: cifs: flush data on any setattr We already flush all the dirty pages for an inode before doing ATTR_SIZE and ATTR_MTIME changes. There's another problem though -- if we change the mode so that the file becomes read-only then we may not be able to write data to it after a reconnect. Fix this by just going back to flushing all the dirty data on any setattr call. There are probably some cases that can be optimized out, but I'm not sure they're worthwhile and we need to consider them more carefully to make sure that we don't cause regressions if we have to reconnect before writeback occurs. 
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 58 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f121a80fdd6..89063f1eb55 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1792,20 +1792,21 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) goto out; } - if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { - /* - Flush data before changing file size or changing the last - write time of the file on the server. If the - flush returns error, store it to report later and continue. - BB: This should be smarter. Why bother flushing pages that - will be truncated anyway? Also, should we error out here if - the flush returns error? - */ - rc = filemap_write_and_wait(inode->i_mapping); - if (rc != 0) { - cifsInode->write_behind_rc = rc; - rc = 0; - } + /* + * Attempt to flush data before changing attributes. We need to do + * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the + * ownership or mode then we may also need to do this. Here, we take + * the safe way out and just do the flush on all setattr requests. If + * the flush returns error, store it to report later and continue. + * + * BB: This should be smarter. Why bother flushing pages that + * will be truncated anyway? Also, should we error out here if + * the flush returns error? + */ + rc = filemap_write_and_wait(inode->i_mapping); + if (rc != 0) { + cifsInode->write_behind_rc = rc; + rc = 0; } if (attrs->ia_valid & ATTR_SIZE) { @@ -1903,20 +1904,21 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) return -ENOMEM; } - if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { - /* - Flush data before changing file size or changing the last - write time of the file on the server. If the - flush returns error, store it to report later and continue. 
- BB: This should be smarter. Why bother flushing pages that - will be truncated anyway? Also, should we error out here if - the flush returns error? - */ - rc = filemap_write_and_wait(inode->i_mapping); - if (rc != 0) { - cifsInode->write_behind_rc = rc; - rc = 0; - } + /* + * Attempt to flush data before changing attributes. We need to do + * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the + * ownership or mode then we may also need to do this. Here, we take + * the safe way out and just do the flush on all setattr requests. If + * the flush returns error, store it to report later and continue. + * + * BB: This should be smarter. Why bother flushing pages that + * will be truncated anyway? Also, should we error out here if + * the flush returns error? + */ + rc = filemap_write_and_wait(inode->i_mapping); + if (rc != 0) { + cifsInode->write_behind_rc = rc; + rc = 0; } if (attrs->ia_valid & ATTR_SIZE) { -- cgit v1.2.3 From 74496d365ad171d11f21da4a8be71c945f6ec825 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 31 Mar 2009 16:28:36 +0800 Subject: cifs: remove some pointless conditionals before kfree() Remove some pointless conditionals before kfree(). 
Signed-off-by: Wei Yongjun Signed-off-by: Steve French --- fs/cifs/connect.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0de3b5615a2..b173b017171 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2673,8 +2673,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, /* We look for obvious messed up bcc or strings in response so we do not go off the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ - if (ses->serverOS) - kfree(ses->serverOS); + kfree(ses->serverOS); ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL); if (ses->serverOS == NULL) @@ -2710,8 +2709,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string is not always null terminated (for e.g. for Windows XP & 2000) */ - if (ses->serverDomain) - kfree(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2*(len+1), GFP_KERNEL); @@ -2725,8 +2723,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, ses->serverDomain[1+(2*len)] = 0; } else { /* else no more room so create dummy domain string */ - if (ses->serverDomain) - kfree(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); } @@ -2772,8 +2769,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); - if (ses->serverDomain) - kfree(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(len + 1, GFP_KERNEL); if (ses->serverDomain == NULL) @@ -3013,8 +3009,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, /* We look for obvious messed up bcc or strings in response so we do not go off the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ - if (ses->serverOS) - 
kfree(ses->serverOS); + kfree(ses->serverOS); ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverOS, @@ -3086,8 +3081,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { - if (ses->serverOS) - kfree(ses->serverOS); + kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1, GFP_KERNEL); @@ -3414,8 +3408,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, /* We look for obvious messed up bcc or strings in response so we do not go off the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ - if (ses->serverOS) - kfree(ses->serverOS); + kfree(ses->serverOS); ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverOS, @@ -3448,8 +3441,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (remaining_words > 0) { len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string not always null terminated (e.g. 
for Windows XP & 2000) */ - if (ses->serverDomain) - kfree(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2 * (len + @@ -3476,13 +3468,11 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, = 0; } /* else no more room so create dummy domain string */ else { - if (ses->serverDomain) - kfree(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2,GFP_KERNEL); } } else { /* no room so create dummy domain and NOS string */ - if (ses->serverDomain) - kfree(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); kfree(ses->serverNOS); ses->serverNOS = kzalloc(2, GFP_KERNEL); @@ -3492,8 +3482,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { - if (ses->serverOS) - kfree(ses->serverOS); + kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1, GFP_KERNEL); strncpy(ses->serverOS,bcc_ptr, len); @@ -3512,8 +3501,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); - if (ses->serverDomain) - kfree(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(len+1, GFP_KERNEL); -- cgit v1.2.3 From 85a6dac54a7e28112488b02523202985edc7e639 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 1 Apr 2009 05:22:00 +0000 Subject: [CIFS] Endian convert UniqueId when reporting inode numbers from server files Jeff made a good point that we should endian convert the UniqueId when we use it to set i_ino Even though this value is opaque to the client, when comparing the inode numbers of the same server file from two different clients (one big endian, one little endian) or when we compare a big endian client's view of i_ino with what the server thinks - we should get the same value Signed-off-by: Steve French --- fs/cifs/CHANGES | 3 ++- fs/cifs/cifspdu.h | 8 ++++---- fs/cifs/cifssmb.c | 2 +- 
fs/cifs/dir.c | 6 ++++-- fs/cifs/inode.c | 8 +++++--- fs/cifs/readdir.c | 4 ++-- 6 files changed, 18 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 65984006192..9d1fb6ec8a5 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -15,7 +15,8 @@ Posix file open support added (turned off after one attempt if server fails to support it properly, as with Samba server versions prior to 3.3.2) Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too little memory for the "nativeFileSystem" field returned by the server -during mount). +during mount). Endian convert inode numbers if necessary (makes it easier +to compare inode numbers on network files from big endian systems). Version 1.56 ------------ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index b370489c8da..a785f69dbc9 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2163,7 +2163,7 @@ typedef struct { __le32 Type; __le64 DevMajor; __le64 DevMinor; - __u64 UniqueId; + __le64 UniqueId; __le64 Permissions; __le64 Nlinks; } __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ @@ -2308,7 +2308,7 @@ struct unlink_psx_rq { /* level 0x20a SetPathInfo */ } __attribute__((packed)); struct file_internal_info { - __u64 UniqueId; /* inode number */ + __le64 UniqueId; /* inode number */ } __attribute__((packed)); /* level 0x3ee */ struct file_mode_info { @@ -2338,7 +2338,7 @@ typedef struct { __le32 Type; __le64 DevMajor; __le64 DevMinor; - __u64 UniqueId; + __le64 UniqueId; __le64 Permissions; __le64 Nlinks; char FileName[1]; @@ -2386,7 +2386,7 @@ typedef struct { __le32 FileNameLength; __le32 EaSize; /* EA size */ __le32 Reserved; - __u64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ + __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ char FileName[1]; } __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c 
index bc09c998631..3f36b1ea03c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3918,7 +3918,7 @@ GetInodeNumberRetry: } pfinfo = (struct file_internal_info *) (data_offset + (char *) &pSMBr->hdr.Protocol); - *inode_number = pfinfo->UniqueId; + *inode_number = le64_to_cpu(pfinfo->UniqueId); } } GetInodeNumOut: diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 54dce78fbb7..e457e143434 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -187,8 +187,10 @@ int cifs_posix_open(char *full_path, struct inode **pinode, if (!pinode) goto posix_open_ret; /* caller does not need info */ - if (*pinode == NULL) - *pinode = cifs_new_inode(sb, &presp_data->UniqueId); + if (*pinode == NULL) { + __u64 unique_id = le64_to_cpu(presp_data->UniqueId); + *pinode = cifs_new_inode(sb, &unique_id); + } /* else an inode was passed in. Update its info, don't create one */ /* We do not need to close the file if new_inode fails since diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 89063f1eb55..fceebee39f2 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -276,7 +276,8 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* get new inode */ if (*pinode == NULL) { - *pinode = cifs_new_inode(sb, &find_data.UniqueId); + __u64 unique_id = le64_to_cpu(find_data.UniqueId); + *pinode = cifs_new_inode(sb, &unique_id); if (*pinode == NULL) { rc = -ENOMEM; goto cgiiu_exit; @@ -1138,6 +1139,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) cFYI(1, ("posix mkdir returned 0x%x", rc)); d_drop(direntry); } else { + __u64 unique_id; if (pInfo->Type == cpu_to_le32(-1)) { /* no return info, go query for it */ kfree(pInfo); @@ -1151,8 +1153,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) else direntry->d_op = &cifs_dentry_ops; - newinode = cifs_new_inode(inode->i_sb, - &pInfo->UniqueId); + unique_id = le64_to_cpu(pInfo->UniqueId); + newinode = cifs_new_inode(inode->i_sb, &unique_id); if (newinode == NULL) { kfree(pInfo); goto mkdir_get_info; 
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index c2c01ff4c32..c3c3e6286af 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -840,7 +840,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, len = strnlen(filename, PATH_MAX); } - *pinum = pFindData->UniqueId; + *pinum = le64_to_cpu(pFindData->UniqueId); } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { FILE_DIRECTORY_INFO *pFindData = (FILE_DIRECTORY_INFO *)current_entry; @@ -856,7 +856,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, (SEARCH_ID_FULL_DIR_INFO *)current_entry; filename = &pFindData->FileName[0]; len = le32_to_cpu(pFindData->FileNameLength); - *pinum = pFindData->UniqueId; + *pinum = le64_to_cpu(pFindData->UniqueId); } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { FILE_BOTH_DIRECTORY_INFO *pFindData = (FILE_BOTH_DIRECTORY_INFO *)current_entry; -- cgit v1.2.3 From 1bfe73c258addc388b90fe8c2c6bbc0f0c9c10dd Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 1 Apr 2009 17:54:42 +0400 Subject: Remote DFS root support. Allows to mount share on a server that returns -EREMOTE at the tree connect stage or at the check on a full path accessibility. 
Signed-off-by: Igor Mammedov Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 152 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 120 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b173b017171..2e7a4ea26ab 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2214,9 +2214,56 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon, return rc; } +static void +cleanup_volume_info(struct smb_vol **pvolume_info) +{ + struct smb_vol *volume_info; + + if (!pvolume_info && !*pvolume_info) + return; + + volume_info = *pvolume_info; + kzfree(volume_info->password); + kfree(volume_info->UNC); + kfree(volume_info->prepath); + kfree(volume_info); + *pvolume_info = NULL; + return; +} + +/* build_path_to_root returns full path to root when + * we do not have an exiting connection (tcon) */ +static char * +build_unc_path_to_root(const struct smb_vol *volume_info, + const struct cifs_sb_info *cifs_sb) +{ + char *full_path; + + int unc_len = strnlen(volume_info->UNC, MAX_TREE_SIZE + 1); + full_path = kmalloc(unc_len + cifs_sb->prepathlen + 1, GFP_KERNEL); + if (full_path == NULL) + return ERR_PTR(-ENOMEM); + + strncpy(full_path, volume_info->UNC, unc_len); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { + int i; + for (i = 0; i < unc_len; i++) { + if (full_path[i] == '\\') + full_path[i] = '/'; + } + } + + if (cifs_sb->prepathlen) + strncpy(full_path + unc_len, cifs_sb->prepath, + cifs_sb->prepathlen); + + full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */ + return full_path; +} + int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, - char *mount_data, const char *devname) + char *mount_data_global, const char *devname) { int rc = 0; int xid; @@ -2225,6 +2272,13 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, struct cifsTconInfo *tcon = NULL; struct TCP_Server_Info *srvTcp = NULL; char *full_path; + struct 
dfs_info3_param *referrals = NULL; + unsigned int num_referrals = 0; + + char *mount_data = mount_data_global; + +try_mount_again: + full_path = NULL; xid = GetXid(); @@ -2371,11 +2425,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, } } - /* check for null share name ie connect to dfs root */ if ((strchr(volume_info->UNC + 3, '\\') == NULL) && (strchr(volume_info->UNC + 3, '/') == NULL)) { - /* rc = connect_to_dfs_path(...) */ - cFYI(1, ("DFS root not supported")); + cERROR(1, ("Missing share name")); rc = -ENODEV; goto mount_fail_check; } else { @@ -2392,7 +2444,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, } } if (rc) - goto mount_fail_check; + goto remote_path_check; tcon->seal = volume_info->seal; write_lock(&cifs_tcp_ses_lock); list_add(&tcon->tcon_list, &pSesInfo->tcon_list); @@ -2417,19 +2469,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, /* BB FIXME fix time_gran to be larger for LANMAN sessions */ sb->s_time_gran = 100; -mount_fail_check: - /* on error free sesinfo and tcon struct if needed */ - if (rc) { - /* If find_unc succeeded then rc == 0 so we can not end */ - /* up accidently freeing someone elses tcon struct */ - if (tcon) - cifs_put_tcon(tcon); - else if (pSesInfo) - cifs_put_smb_ses(pSesInfo); - else - cifs_put_tcp_session(srvTcp); - goto out; - } + if (rc) + goto remote_path_check; + cifs_sb->tcon = tcon; /* do not care if following two calls succeed - informational */ @@ -2461,7 +2503,9 @@ mount_fail_check: cifs_sb->rsize = min(cifs_sb->rsize, (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); - if (!rc && cifs_sb->prepathlen) { +remote_path_check: + /* check if a whole path (including prepath) is not remote */ + if (!rc && cifs_sb->prepathlen && tcon) { /* build_path_to_root works only when we have a valid tcon */ full_path = cifs_build_path_to_root(cifs_sb); if (full_path == NULL) { @@ -2469,31 +2513,75 @@ mount_fail_check: goto mount_fail_check; } rc = 
is_path_accessible(xid, tcon, cifs_sb, full_path); - if (rc) { - cERROR(1, ("Path %s in not accessible: %d", - full_path, rc)); + if (rc != -EREMOTE) { kfree(full_path); goto mount_fail_check; } kfree(full_path); } + /* get referral if needed */ + if (rc == -EREMOTE) { + /* convert forward to back slashes in prepath here if needed */ + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0) + convert_delimiter(cifs_sb->prepath, + CIFS_DIR_SEP(cifs_sb)); + full_path = build_unc_path_to_root(volume_info, cifs_sb); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); + goto mount_fail_check; + } + + cFYI(1, ("Getting referral for: %s", full_path)); + rc = get_dfs_path(xid, pSesInfo , full_path + 1, + cifs_sb->local_nls, &num_referrals, &referrals, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (!rc && num_referrals > 0) { + char *fake_devname = NULL; + + if (mount_data != mount_data_global) + kfree(mount_data); + mount_data = cifs_compose_mount_options( + cifs_sb->mountdata, full_path + 1, + referrals, &fake_devname); + kfree(fake_devname); + free_dfs_info_array(referrals, num_referrals); + + if (tcon) + cifs_put_tcon(tcon); + else if (pSesInfo) + cifs_put_smb_ses(pSesInfo); + + cleanup_volume_info(&volume_info); + FreeXid(xid); + kfree(full_path); + goto try_mount_again; + } + } + +mount_fail_check: + /* on error free sesinfo and tcon struct if needed */ + if (rc) { + if (mount_data != mount_data_global) + kfree(mount_data); + /* If find_unc succeeded then rc == 0 so we can not end */ + /* up accidently freeing someone elses tcon struct */ + if (tcon) + cifs_put_tcon(tcon); + else if (pSesInfo) + cifs_put_smb_ses(pSesInfo); + else + cifs_put_tcp_session(srvTcp); + goto out; + } + /* volume_info->password is freed above when existing session found (in which case it is not needed anymore) but when new sesion is created the password ptr is put in the new session structure (in which case the password will be freed at unmount time) */ out: /* zero out 
password before freeing */ - if (volume_info) { - if (volume_info->password != NULL) { - memset(volume_info->password, 0, - strlen(volume_info->password)); - kfree(volume_info->password); - } - kfree(volume_info->UNC); - kfree(volume_info->prepath); - kfree(volume_info); - } + cleanup_volume_info(&volume_info); FreeXid(xid); return rc; } -- cgit v1.2.3 From d036f50fc202e1a851a25dc5edc215ebd0086201 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 3 Apr 2009 03:12:08 +0000 Subject: [CIFS] Fix build break from recent DFS patch when DFS support not enabled Signed-off-by: Steve French --- fs/cifs/connect.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2e7a4ea26ab..6926023af87 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2522,6 +2522,7 @@ remote_path_check: /* get referral if needed */ if (rc == -EREMOTE) { +#ifdef CONFIG_CIFS_DFS_UPCALL /* convert forward to back slashes in prepath here if needed */ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0) convert_delimiter(cifs_sb->prepath, @@ -2557,6 +2558,9 @@ remote_path_check: kfree(full_path); goto try_mount_again; } +#else /* No DFS support, return error on mount */ + rc = -EOPNOTSUPP; +#endif } mount_fail_check: -- cgit v1.2.3 From fbec9ab952d4810960e620035c8e95f0fbbae4be Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Apr 2009 13:44:00 -0400 Subject: cifs: vary timeout on writes past EOF based on offset (try #5) This is the fourth version of this patch: The first three generated a compiler warning asking for explicit curly braces. The first two didn't update the size correctly when writes that didn't start at the eof were done. The first patch also didn't update the size correctly when it was explicitly set via truncate().
This patch adds code to track the client's current understanding of the size of the file on the server separate from the i_size, and then to use this info to semi-intelligently set the timeout for writes past the EOF. This helps prevent timeouts when trying to write large, sparse files on windows servers. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 1 + fs/cifs/cifsglob.h | 1 + fs/cifs/cifssmb.c | 4 ++-- fs/cifs/file.c | 64 ++++++++++++++++++++++++++++++++++++++++++------------ fs/cifs/inode.c | 8 ++++--- fs/cifs/readdir.c | 2 ++ 6 files changed, 61 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 38491fd3871..34f5701d955 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -316,6 +316,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->clientCanCacheAll = false; cifs_inode->delete_pending = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ + cifs_inode->server_eof = 0; /* Can not set i_flags here - they get immediately overwritten to zero by the VFS */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9fbf4dff5da..7ae19868fdc 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -370,6 +370,7 @@ struct cifsInodeInfo { bool clientCanCacheAll:1; /* read and writebehind oplock */ bool oplockPending:1; bool delete_pending:1; /* DELETE_ON_CLOSE is set */ + u64 server_eof; /* current file size on server */ struct inode vfs_inode; }; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 3f36b1ea03c..a0845dc7b8a 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1626,6 +1626,8 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon, int smb_hdr_len; int resp_buf_type = 0; + *nbytes = 0; + cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count)); if (tcon->ses->capabilities & CAP_LARGE_FILES) { @@ -1682,11 +1684,9 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon, cifs_stats_inc(&tcon->num_writes); if (rc) { 
cFYI(1, ("Send error Write2 = %d", rc)); - *nbytes = 0; } else if (resp_buf_type == 0) { /* presumably this can not happen, but best to be safe */ rc = -EIO; - *nbytes = 0; } else { WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base; *nbytes = le16_to_cpu(pSMBr->CountHigh); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 81747acca4c..dfd3e6c52a1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -971,6 +971,40 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) return rc; } +/* + * Set the timeout on write requests past EOF. For some servers (Windows) + * these calls can be very long. + * + * If we're writing >10M past the EOF we give a 180s timeout. Anything less + * than that gets a 45s timeout. Writes not past EOF get 15s timeouts. + * The 10M cutoff is totally arbitrary. A better scheme for this would be + * welcome if someone wants to suggest one. + * + * We may be able to do a better job with this if there were some way to + * declare that a file should be sparse. 
+ */ +static int +cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset) +{ + if (offset <= cifsi->server_eof) + return CIFS_STD_OP; + else if (offset > (cifsi->server_eof + (10 * 1024 * 1024))) + return CIFS_VLONG_OP; + else + return CIFS_LONG_OP; +} + +/* update the file size (if needed) after a write */ +static void +cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, + unsigned int bytes_written) +{ + loff_t end_of_write = offset + bytes_written; + + if (end_of_write > cifsi->server_eof) + cifsi->server_eof = end_of_write; +} + ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t *poffset) { @@ -981,6 +1015,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, struct cifsTconInfo *pTcon; int xid, long_op; struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); @@ -1000,11 +1035,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, xid = GetXid(); - if (*poffset > file->f_path.dentry->d_inode->i_size) - long_op = CIFS_VLONG_OP; /* writes past EOF take long time */ - else - long_op = CIFS_LONG_OP; - + long_op = cifs_write_timeout(cifsi, *poffset); for (total_written = 0; write_size > total_written; total_written += bytes_written) { rc = -EAGAIN; @@ -1048,8 +1079,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, FreeXid(xid); return rc; } - } else + } else { + cifs_update_eof(cifsi, *poffset, bytes_written); *poffset += bytes_written; + } long_op = CIFS_STD_OP; /* subsequent writes fast - 15 seconds is plenty */ } @@ -1085,6 +1118,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data, struct cifsTconInfo *pTcon; int xid, long_op; struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); @@ -1099,11 +1133,7 @@ static ssize_t 
cifs_write(struct file *file, const char *write_data, xid = GetXid(); - if (*poffset > file->f_path.dentry->d_inode->i_size) - long_op = CIFS_VLONG_OP; /* writes past EOF can be slow */ - else - long_op = CIFS_LONG_OP; - + long_op = cifs_write_timeout(cifsi, *poffset); for (total_written = 0; write_size > total_written; total_written += bytes_written) { rc = -EAGAIN; @@ -1166,8 +1196,10 @@ static ssize_t cifs_write(struct file *file, const char *write_data, FreeXid(xid); return rc; } - } else + } else { + cifs_update_eof(cifsi, *poffset, bytes_written); *poffset += bytes_written; + } long_op = CIFS_STD_OP; /* subsequent writes fast - 15 seconds is plenty */ } @@ -1380,11 +1412,12 @@ static int cifs_writepages(struct address_space *mapping, int nr_pages; __u64 offset = 0; struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsi = CIFS_I(mapping->host); struct page *page; struct pagevec pvec; int rc = 0; int scanned = 0; - int xid; + int xid, long_op; cifs_sb = CIFS_SB(mapping->host->i_sb); @@ -1528,12 +1561,15 @@ retry: cERROR(1, ("No writable handles for inode")); rc = -EBADF; } else { + long_op = cifs_write_timeout(cifsi, offset); rc = CIFSSMBWrite2(xid, cifs_sb->tcon, open_file->netfid, bytes_to_write, offset, &bytes_written, iov, n_iov, - CIFS_LONG_OP); + long_op); atomic_dec(&open_file->wrtPending); + cifs_update_eof(cifsi, offset, bytes_written); + if (rc || bytes_written < bytes_to_write) { cERROR(1, ("Write2 ret %d, wrote %d", rc, bytes_written)); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index fceebee39f2..09082ac8518 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -143,6 +143,7 @@ static void cifs_unix_info_to_inode(struct inode *inode, inode->i_nlink = le64_to_cpu(info->Nlinks); + cifsInfo->server_eof = end_of_file; spin_lock(&inode->i_lock); if (is_size_safe_to_change(cifsInfo, end_of_file)) { /* @@ -606,12 +607,12 @@ int cifs_get_inode_info(struct inode **pinode, inode->i_mode |= S_IFREG; } + cifsInfo->server_eof = 
le64_to_cpu(pfindData->EndOfFile); spin_lock(&inode->i_lock); - if (is_size_safe_to_change(cifsInfo, - le64_to_cpu(pfindData->EndOfFile))) { + if (is_size_safe_to_change(cifsInfo, cifsInfo->server_eof)) { /* can not safely shrink the file size here if the client is writing to it due to potential races */ - i_size_write(inode, le64_to_cpu(pfindData->EndOfFile)); + i_size_write(inode, cifsInfo->server_eof); /* 512 bytes (2**9) is the fake blocksize that must be used for this calculation */ @@ -1755,6 +1756,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, } if (rc == 0) { + cifsInode->server_eof = attrs->ia_size; rc = cifs_vmtruncate(inode, attrs->ia_size); cifs_truncate_page(inode->i_mapping, inode->i_size); } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index c3c3e6286af..1a8be622833 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -239,6 +239,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, if (atomic_read(&cifsInfo->inUse) == 0) atomic_set(&cifsInfo->inUse, 1); + cifsInfo->server_eof = end_of_file; spin_lock(&tmp_inode->i_lock); if (is_size_safe_to_change(cifsInfo, end_of_file)) { /* can not safely change the file size here if the @@ -375,6 +376,7 @@ static void unix_fill_in_inode(struct inode *tmp_inode, tmp_inode->i_gid = le64_to_cpu(pfindData->Gid); tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks); + cifsInfo->server_eof = end_of_file; spin_lock(&tmp_inode->i_lock); if (is_size_safe_to_change(cifsInfo, end_of_file)) { /* can not safely change the file size here if the -- cgit v1.2.3 From 2d6d589d8009b37ae03244059c93e0e8cf46910e Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 9 Apr 2009 00:36:44 +0000 Subject: [CIFS] remove some build warnings Signed-off-by: Steve French --- fs/cifs/connect.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6926023af87..01e280cab06 100644 --- a/fs/cifs/connect.c +++ 
b/fs/cifs/connect.c @@ -2231,6 +2231,7 @@ cleanup_volume_info(struct smb_vol **pvolume_info) return; } +#ifdef CONFIG_CIFS_DFS_UPCALL /* build_path_to_root returns full path to root when * we do not have an exiting connection (tcon) */ static char * @@ -2260,6 +2261,7 @@ build_unc_path_to_root(const struct smb_vol *volume_info, full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */ return full_path; } +#endif int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, @@ -2272,12 +2274,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, struct cifsTconInfo *tcon = NULL; struct TCP_Server_Info *srvTcp = NULL; char *full_path; + char *mount_data = mount_data_global; +#ifdef CONFIG_CIFS_DFS_UPCALL struct dfs_info3_param *referrals = NULL; unsigned int num_referrals = 0; - - char *mount_data = mount_data_global; - try_mount_again: +#endif full_path = NULL; xid = GetXid(); -- cgit v1.2.3 From 5144ebf408ed08380917126493450ee22e63fe3f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Mar 2009 01:47:12 -0400 Subject: cifs: remove dnotify thread code cifs: remove dnotify thread code Al Viro recently removed the dir_notify code from the kernel along with the CIFS code that used it. We can also get rid of the dnotify thread as well. In actuality, it never had anything to do with dir_notify anyway. All it did was unnecessarily wake up all the tasks waiting on the response queues every 15s. Previously that happened to prevent tasks from hanging indefinitely when the server went unresponsive, but we put those to sleep with proper timeouts now so there's no reason to keep this around. 
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 47 ----------------------------------------------- 1 file changed, 47 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 34f5701d955..0d6d8b57365 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -66,9 +66,6 @@ unsigned int sign_CIFS_PDUs = 1; extern struct task_struct *oplockThread; /* remove sparse warning */ struct task_struct *oplockThread = NULL; /* extern struct task_struct * dnotifyThread; remove sparse warning */ -#ifdef CONFIG_CIFS_EXPERIMENTAL -static struct task_struct *dnotifyThread = NULL; -#endif static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; module_param(CIFSMaxBufSize, int, 0); @@ -1041,34 +1038,6 @@ static int cifs_oplock_thread(void *dummyarg) return 0; } -#ifdef CONFIG_CIFS_EXPERIMENTAL -static int cifs_dnotify_thread(void *dummyarg) -{ - struct list_head *tmp; - struct TCP_Server_Info *server; - - do { - if (try_to_freeze()) - continue; - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(15*HZ); - /* check if any stuck requests that need - to be woken up and wakeq so the - thread can wake up and error out */ - read_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &cifs_tcp_ses_list) { - server = list_entry(tmp, struct TCP_Server_Info, - tcp_ses_list); - if (atomic_read(&server->inFlight)) - wake_up_all(&server->response_q); - } - read_unlock(&cifs_tcp_ses_lock); - } while (!kthread_should_stop()); - - return 0; -} -#endif - static int __init init_cifs(void) { @@ -1145,21 +1114,8 @@ init_cifs(void) goto out_unregister_dfs_key_type; } -#ifdef CONFIG_CIFS_EXPERIMENTAL - dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd"); - if (IS_ERR(dnotifyThread)) { - rc = PTR_ERR(dnotifyThread); - cERROR(1, ("error %d create dnotify thread", rc)); - goto out_stop_oplock_thread; - } -#endif - return 0; -#ifdef CONFIG_CIFS_EXPERIMENTAL - out_stop_oplock_thread: -#endif 
- kthread_stop(oplockThread); out_unregister_dfs_key_type: #ifdef CONFIG_CIFS_DFS_UPCALL unregister_key_type(&key_type_dns_resolver); @@ -1197,9 +1153,6 @@ exit_cifs(void) cifs_destroy_inodecache(); cifs_destroy_mids(); cifs_destroy_request_bufs(); -#ifdef CONFIG_CIFS_EXPERIMENTAL - kthread_stop(dnotifyThread); -#endif kthread_stop(oplockThread); } -- cgit v1.2.3 From d9fb5c091b419e0495c50c1cce9e4cf9f7105072 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Mar 2009 01:47:11 -0400 Subject: cifs: no need to use rcu_assign_pointer on immutable keys cifs: no need to use rcu_assign_pointer on immutable keys Neither keytype in use by CIFS has an "update" method. This means that the keys are immutable once instantiated. We don't need to use RCU to set the payload data pointers. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_spnego.c | 2 +- fs/cifs/dns_resolve.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 3fd3a9df043..67bf93a40d2 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -41,7 +41,7 @@ cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen) /* attach the data */ memcpy(payload, data, datalen); - rcu_assign_pointer(key->payload.data, payload); + key->payload.data = payload; ret = 0; error: diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 1e0c1bd8f2e..df4a306f697 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -78,7 +78,7 @@ dns_resolver_instantiate(struct key *key, const void *data, } key->type_data.x[0] = datalen; - rcu_assign_pointer(key->payload.data, ip); + key->payload.data = ip; return rc; } -- cgit v1.2.3 From a6ce4932fbdbcd8f8e8c6df76812014351c32892 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 9 Apr 2009 01:14:32 +0000 Subject: [CIFS] Add support for posix open during lookup This patch by utilizing lookup intents, and thus removing a network roundtrip 
in the open path, improves performance dramatically on open (30% or more) to Samba and other servers which support the cifs posix extensions Signed-off-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 +- fs/cifs/dir.c | 131 ++++++++++++++++++++++++++++++++++------------------- fs/cifs/file.c | 65 +++++++++++++------------- 3 files changed, 118 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 7ae19868fdc..df40ab64cd9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -350,7 +350,7 @@ struct cifsFileInfo { bool invalidHandle:1; /* file closed via session abend */ bool messageMode:1; /* for pipes: message vs byte mode */ atomic_t wrtPending; /* handle in use - defer close */ - struct semaphore fh_sem; /* prevents reopen race after dead ses*/ + struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; }; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index e457e143434..d9006b04324 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -129,12 +129,64 @@ cifs_bp_rename_retry: return full_path; } +static void +cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle, + struct cifsTconInfo *tcon, bool write_only) +{ + int oplock = 0; + struct cifsFileInfo *pCifsFile; + struct cifsInodeInfo *pCifsInode; + + pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + + if (pCifsFile == NULL) + return; + + if (oplockEnabled) + oplock = REQ_OPLOCK; + + pCifsFile->netfid = fileHandle; + pCifsFile->pid = current->tgid; + pCifsFile->pInode = newinode; + pCifsFile->invalidHandle = false; + pCifsFile->closePend = false; + mutex_init(&pCifsFile->fh_mutex); + mutex_init(&pCifsFile->lock_mutex); + INIT_LIST_HEAD(&pCifsFile->llist); + atomic_set(&pCifsFile->wrtPending, 0); + + /* set the following in open now + pCifsFile->pfile = file; */ + write_lock(&GlobalSMBSeslock); + list_add(&pCifsFile->tlist, &tcon->openFileList); + pCifsInode = 
CIFS_I(newinode); + if (pCifsInode) { + /* if readable file instance put first in list*/ + if (write_only) { + list_add_tail(&pCifsFile->flist, + &pCifsInode->openFileList); + } else { + list_add(&pCifsFile->flist, + &pCifsInode->openFileList); + } + if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { + pCifsInode->clientCanCacheAll = true; + pCifsInode->clientCanCacheRead = true; + cFYI(1, ("Exclusive Oplock inode %p", + newinode)); + } else if ((oplock & 0xF) == OPLOCK_READ) + pCifsInode->clientCanCacheRead = true; + } + write_unlock(&GlobalSMBSeslock); +} + int cifs_posix_open(char *full_path, struct inode **pinode, struct super_block *sb, int mode, int oflags, int *poplock, __u16 *pnetfid, int xid) { int rc; __u32 oplock; + bool write_only = false; FILE_UNIX_BASIC_INFO *presp_data; __u32 posix_flags = 0; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -172,6 +224,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode, if (oflags & O_DIRECT) posix_flags |= SMB_O_DIRECT; + if (!(oflags & FMODE_READ)) + write_only = true; rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode, pnetfid, presp_data, &oplock, full_path, @@ -200,6 +254,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode, posix_fill_in_inode(*pinode, presp_data, 1); + cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only); + posix_open_ret: kfree(presp_data); return rc; @@ -241,7 +297,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, char *full_path = NULL; FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; - struct cifsInodeInfo *pCifsInode; int disposition = FILE_OVERWRITE_IF; bool write_only = false; @@ -412,44 +467,8 @@ cifs_create_set_dentry: /* mknod case - do not leave file open */ CIFSSMBClose(xid, tcon, fileHandle); } else if (newinode) { - struct cifsFileInfo *pCifsFile = - kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); - - if (pCifsFile == NULL) - goto cifs_create_out; - pCifsFile->netfid = fileHandle; - pCifsFile->pid = current->tgid; 
- pCifsFile->pInode = newinode; - pCifsFile->invalidHandle = false; - pCifsFile->closePend = false; - init_MUTEX(&pCifsFile->fh_sem); - mutex_init(&pCifsFile->lock_mutex); - INIT_LIST_HEAD(&pCifsFile->llist); - atomic_set(&pCifsFile->wrtPending, 0); - - /* set the following in open now - pCifsFile->pfile = file; */ - write_lock(&GlobalSMBSeslock); - list_add(&pCifsFile->tlist, &tcon->openFileList); - pCifsInode = CIFS_I(newinode); - if (pCifsInode) { - /* if readable file instance put first in list*/ - if (write_only) { - list_add_tail(&pCifsFile->flist, - &pCifsInode->openFileList); - } else { - list_add(&pCifsFile->flist, - &pCifsInode->openFileList); - } - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, ("Exclusive Oplock inode %p", - newinode)); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; - } - write_unlock(&GlobalSMBSeslock); + cifs_fill_fileinfo(newinode, fileHandle, + cifs_sb->tcon, write_only); } cifs_create_out: kfree(buf); @@ -582,17 +601,21 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, return rc; } - struct dentry * cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct nameidata *nd) { int xid; int rc = 0; /* to get around spurious gcc warning, set to zero here */ + int oplock = 0; + int mode; + __u16 fileHandle = 0; + bool posix_open = false; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; struct inode *newInode = NULL; char *full_path = NULL; + struct file *filp; xid = GetXid(); @@ -634,12 +657,27 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); - if (pTcon->unix_ext) - rc = cifs_get_inode_info_unix(&newInode, full_path, - parent_dir_inode->i_sb, xid); - else + if (pTcon->unix_ext) { + if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && + (nd->flags & LOOKUP_OPEN)) { + 
if (!((nd->intent.open.flags & O_CREAT) && + (nd->intent.open.flags & O_EXCL))) { + mode = nd->intent.open.create_mode & + ~current->fs->umask; + rc = cifs_posix_open(full_path, &newInode, + parent_dir_inode->i_sb, mode, + nd->intent.open.flags, &oplock, + &fileHandle, xid); + if ((rc != -EINVAL) && (rc != -EOPNOTSUPP)) + posix_open = true; + } + } + if (!posix_open) + rc = cifs_get_inode_info_unix(&newInode, full_path, + parent_dir_inode->i_sb, xid); + } else rc = cifs_get_inode_info(&newInode, full_path, NULL, - parent_dir_inode->i_sb, xid, NULL); + parent_dir_inode->i_sb, xid, NULL); if ((rc == 0) && (newInode != NULL)) { if (pTcon->nocase) @@ -647,7 +685,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, else direntry->d_op = &cifs_dentry_ops; d_add(direntry, newInode); - + if (posix_open) + filp = lookup_instantiate_filp(nd, direntry, NULL); /* since paths are not looked up by component - the parent directories are presumed to be good here */ renew_parental_timestamps(direntry); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index dfd3e6c52a1..48c9ae09f3d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -46,7 +46,7 @@ static inline struct cifsFileInfo *cifs_init_private( memset(private_data, 0, sizeof(struct cifsFileInfo)); private_data->netfid = netfid; private_data->pid = current->tgid; - init_MUTEX(&private_data->fh_sem); + mutex_init(&private_data->fh_mutex); mutex_init(&private_data->lock_mutex); INIT_LIST_HEAD(&private_data->llist); private_data->pfile = file; /* needed for writepage */ @@ -284,35 +284,34 @@ int cifs_open(struct inode *inode, struct file *file) cifs_sb = CIFS_SB(inode->i_sb); tcon = cifs_sb->tcon; - if (file->f_flags & O_CREAT) { - /* search inode for this file and fill in file->private_data */ - pCifsInode = CIFS_I(file->f_path.dentry->d_inode); - read_lock(&GlobalSMBSeslock); - list_for_each(tmp, &pCifsInode->openFileList) { - pCifsFile = list_entry(tmp, struct cifsFileInfo, - flist); - if ((pCifsFile->pfile == 
NULL) && - (pCifsFile->pid == current->tgid)) { - /* mode set in cifs_create */ - - /* needed for writepage */ - pCifsFile->pfile = file; - - file->private_data = pCifsFile; - break; - } - } - read_unlock(&GlobalSMBSeslock); - if (file->private_data != NULL) { - rc = 0; - FreeXid(xid); - return rc; - } else { - if (file->f_flags & O_EXCL) - cERROR(1, ("could not find file instance for " - "new file %p", file)); + /* search inode for this file and fill in file->private_data */ + pCifsInode = CIFS_I(file->f_path.dentry->d_inode); + read_lock(&GlobalSMBSeslock); + list_for_each(tmp, &pCifsInode->openFileList) { + pCifsFile = list_entry(tmp, struct cifsFileInfo, + flist); + if ((pCifsFile->pfile == NULL) && + (pCifsFile->pid == current->tgid)) { + /* mode set in cifs_create */ + + /* needed for writepage */ + pCifsFile->pfile = file; + + file->private_data = pCifsFile; + break; } } + read_unlock(&GlobalSMBSeslock); + + if (file->private_data != NULL) { + rc = 0; + FreeXid(xid); + return rc; + } else { + if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) + cERROR(1, ("could not find file instance for " + "new file %p", file)); + } full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { @@ -500,9 +499,9 @@ static int cifs_reopen_file(struct file *file, bool can_flush) return -EBADF; xid = GetXid(); - down(&pCifsFile->fh_sem); + mutex_unlock(&pCifsFile->fh_mutex); if (!pCifsFile->invalidHandle) { - up(&pCifsFile->fh_sem); + mutex_lock(&pCifsFile->fh_mutex); FreeXid(xid); return 0; } @@ -533,7 +532,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush) if (full_path == NULL) { rc = -ENOMEM; reopen_error_exit: - up(&pCifsFile->fh_sem); + mutex_lock(&pCifsFile->fh_mutex); FreeXid(xid); return rc; } @@ -575,14 +574,14 @@ reopen_error_exit: cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { - up(&pCifsFile->fh_sem); + mutex_lock(&pCifsFile->fh_mutex); cFYI(1, ("cifs_open returned 0x%x", rc)); 
cFYI(1, ("oplock: %d", oplock)); } else { reopen_success: pCifsFile->netfid = netfid; pCifsFile->invalidHandle = false; - up(&pCifsFile->fh_sem); + mutex_lock(&pCifsFile->fh_mutex); pCifsInode = CIFS_I(inode); if (pCifsInode) { if (can_flush) { -- cgit v1.2.3 From bc8cd4390c9129fbd286b10fa99972dfb68cd069 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 12 Apr 2009 18:18:40 +0000 Subject: [CIFS] Fix sparse warnings Signed-off-by: Shirish Pargaonkar CC: Jeff Layton Signed-off-by: Steve French --- fs/cifs/dir.c | 26 +++++++++++++++++--------- fs/cifs/file.c | 4 +--- 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index d9006b04324..e937da7522e 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -149,7 +149,7 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle, pCifsFile->pid = current->tgid; pCifsFile->pInode = newinode; pCifsFile->invalidHandle = false; - pCifsFile->closePend = false; + pCifsFile->closePend = false; mutex_init(&pCifsFile->fh_mutex); mutex_init(&pCifsFile->lock_mutex); INIT_LIST_HEAD(&pCifsFile->llist); @@ -162,20 +162,18 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle, pCifsInode = CIFS_I(newinode); if (pCifsInode) { /* if readable file instance put first in list*/ - if (write_only) { + if (write_only) list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); - } else { - list_add(&pCifsFile->flist, - &pCifsInode->openFileList); - } + else + list_add(&pCifsFile->flist, &pCifsInode->openFileList); + if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { pCifsInode->clientCanCacheAll = true; pCifsInode->clientCanCacheRead = true; - cFYI(1, ("Exclusive Oplock inode %p", - newinode)); + cFYI(1, ("Exclusive Oplock inode %p", newinode)); } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + pCifsInode->clientCanCacheRead = true; } write_unlock(&GlobalSMBSeslock); } @@ -668,6 +666,16 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry 
*direntry, parent_dir_inode->i_sb, mode, nd->intent.open.flags, &oplock, &fileHandle, xid); + /* + * This code works around a bug in + * samba posix open in samba versions 3.3.1 + * and earlier where create works + * but open fails with invalid parameter. + * If either of these error codes are + * returned, follow the normal lookup. + * Otherwise, the error during posix open + * is handled. + */ if ((rc != -EINVAL) && (rc != -EOPNOTSUPP)) posix_open = true; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 48c9ae09f3d..50ca088d886 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -307,11 +307,9 @@ int cifs_open(struct inode *inode, struct file *file) rc = 0; FreeXid(xid); return rc; - } else { - if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) + } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) cERROR(1, ("could not find file instance for " "new file %p", file)); - } full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { -- cgit v1.2.3 From 88dd47fff4891545bfcfdf39146dde8380771766 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 15 Apr 2009 03:09:39 +0000 Subject: [CIFS] Fix build break caused by change to new current_umask helper function Signed-off-by: Steve French --- fs/cifs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index e937da7522e..461750e0136 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -661,7 +661,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, if (!((nd->intent.open.flags & O_CREAT) && (nd->intent.open.flags & O_EXCL))) { mode = nd->intent.open.create_mode & - ~current->fs->umask; + ~current_umask(); rc = cifs_posix_open(full_path, &newInode, parent_dir_inode->i_sb, mode, nd->intent.open.flags, &oplock, -- cgit v1.2.3 From 27b87fe52baba0a55e9723030e76fce94fabcea4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 14 Apr 2009 11:00:53 -0400 Subject: cifs: fix unicode string area word 
alignment in session setup The handling of unicode string area alignment is wrong. decode_unicode_ssetup improperly assumes that it will always be preceded by a pad byte. This isn't the case if the string area is already word-aligned. This problem, combined with the bad buffer sizing for the serverDomain string can cause memory corruption. The bad alignment can make it so that the alignment of the characters is off. This can make them translate to characters that are greater than 2 bytes each. If this happens we can overflow the allocation. Fix this by fixing the alignment in CIFS_SessSetup instead so we can verify it against the head of the response. Also, clean up the workaround for improperly terminated strings by checking for a odd-length unicode buffers and then forcibly terminating them. Finally, resize the buffer for serverDomain. Now that we've fixed the alignment, it's probably fine, but a malicious server could overflow it. A better solution for handling these strings is still needed, but this should be a suitable bandaid. Signed-off-by: Jeff Layton CC: Stable Signed-off-by: Steve French --- fs/cifs/sess.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 5c68b4282be..70d04d08293 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -285,27 +285,26 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft, int words_left, len; char *data = *pbcc_area; - - cFYI(1, ("bleft %d", bleft)); - - /* SMB header is unaligned, so cifs servers word align start of - Unicode strings */ - data++; - bleft--; /* Windows servers do not always double null terminate - their final Unicode string - in which case we - now will not attempt to decode the byte of junk - which follows it */ + /* + * Windows servers do not always double null terminate their final + * Unicode string. Check to see if there are an uneven number of bytes + * left. 
If so, then add an extra NULL pad byte to the end of the + * response. + * + * See section 2.7.2 in "Implementing CIFS" for details + */ + if (bleft % 2) { + data[bleft] = 0; + ++bleft; + } words_left = bleft / 2; /* save off server operating system */ len = UniStrnlen((wchar_t *) data, words_left); -/* We look for obvious messed up bcc or strings in response so we do not go off - the end since (at least) WIN2K and Windows XP have a major bug in not null - terminating last Unicode string in response */ if (len >= words_left) return rc; @@ -343,13 +342,10 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft, return rc; kfree(ses->serverDomain); - ses->serverDomain = kzalloc(2 * (len + 1), GFP_KERNEL); /* BB FIXME wrong length */ - if (ses->serverDomain != NULL) { + ses->serverDomain = kzalloc((4 * len) + 2, GFP_KERNEL); + if (ses->serverDomain != NULL) cifs_strfromUCS_le(ses->serverDomain, (__le16 *)data, len, nls_cp); - ses->serverDomain[2*len] = 0; - ses->serverDomain[(2*len) + 1] = 0; - } data += 2 * (len + 1); words_left -= len + 1; @@ -702,12 +698,18 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, } /* BB check if Unicode and decode strings */ - if (smb_buf->Flags2 & SMBFLG2_UNICODE) + if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } rc = decode_unicode_ssetup(&bcc_ptr, bytes_remaining, - ses, nls_cp); - else + ses, nls_cp); + } else { rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + } ssetup_exit: if (spnego_key) { -- cgit v1.2.3 From f083def68f84b04fe3f97312498911afce79609e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 16 Apr 2009 11:21:52 -0400 Subject: cifs: fix buffer size for tcon->nativeFileSystem field The buffer for this was resized recently to fix a bug. 
It's still possible however that a malicious server could overflow this field by sending characters in it that are >2 bytes in the local charset. Double the size of the buffer to account for this possibility. Also get rid of some really strange and seemingly pointless NULL termination. It's NULL terminating the string in the source buffer, but by the time that happens, we've already copied the string. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 01e280cab06..1a93604d98f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3756,16 +3756,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, BCC(smb_buffer_response)) { kfree(tcon->nativeFileSystem); tcon->nativeFileSystem = - kzalloc(2*(length + 1), GFP_KERNEL); + kzalloc((4 * length) + 2, GFP_KERNEL); if (tcon->nativeFileSystem) cifs_strfromUCS_le( tcon->nativeFileSystem, (__le16 *) bcc_ptr, length, nls_codepage); - bcc_ptr += 2 * length; - bcc_ptr[0] = 0; /* null terminate the string */ - bcc_ptr[1] = 0; - bcc_ptr += 2; + bcc_ptr += (2 * length) + 2; } /* else do not bother copying these information fields*/ } else { -- cgit v1.2.3 From 313fecfa69bbad0a10d3313a50a89d3064f47ce1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 16 Apr 2009 11:21:53 -0400 Subject: cifs: add cFYI messages with some of the saved strings from ssetup/tcon ...to make it easier to find problems in this area in the future. 
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 5 ++++- fs/cifs/sess.c | 9 +++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1a93604d98f..4a04ecdc1b7 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3757,11 +3757,14 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, kfree(tcon->nativeFileSystem); tcon->nativeFileSystem = kzalloc((4 * length) + 2, GFP_KERNEL); - if (tcon->nativeFileSystem) + if (tcon->nativeFileSystem) { cifs_strfromUCS_le( tcon->nativeFileSystem, (__le16 *) bcc_ptr, length, nls_codepage); + cFYI(1, ("nativeFileSystem=%s", + tcon->nativeFileSystem)); + } bcc_ptr += (2 * length) + 2; } /* else do not bother copying these information fields*/ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 70d04d08293..c652c73760d 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -311,8 +311,10 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft, kfree(ses->serverOS); /* UTF-8 string will not grow more than four times as big as UCS-16 */ ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); - if (ses->serverOS != NULL) + if (ses->serverOS != NULL) { cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp); + cFYI(1, ("serverOS=%s", ses->serverOS)); + } data += 2 * (len + 1); words_left -= len + 1; @@ -327,6 +329,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft, if (ses->serverNOS != NULL) { cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len, nls_cp); + cFYI(1, ("serverNOS=%s", ses->serverNOS)); if (strncmp(ses->serverNOS, "NT LAN Manager 4", 16) == 0) { cFYI(1, ("NT4 server")); ses->flags |= CIFS_SES_NT4; @@ -343,9 +346,11 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft, kfree(ses->serverDomain); ses->serverDomain = kzalloc((4 * len) + 2, GFP_KERNEL); - if (ses->serverDomain != NULL) + if (ses->serverDomain != NULL) { cifs_strfromUCS_le(ses->serverDomain, (__le16 
*)data, len, nls_cp); + cFYI(1, ("serverDomain=%s", ses->serverDomain)); + } data += 2 * (len + 1); words_left -= len + 1; -- cgit v1.2.3 From 22c9d52bc03b880045ab1081890a38f11b272ae7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 16 Apr 2009 13:48:49 -0400 Subject: cifs: remove unneeded bcc_ptr update in CIFSTCon This pointer isn't used again after this point. It's also not updated in the ascii case, so there's no need to update it here. Pointed-out-by: Dave Kleikamp Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4a04ecdc1b7..bacdef1546b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3765,7 +3765,6 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem)); } - bcc_ptr += (2 * length) + 2; } /* else do not bother copying these information fields*/ } else { -- cgit v1.2.3 From b80901bbf599553f483b9509f2dce416b938aae8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 16 Apr 2009 19:09:55 -0700 Subject: splice: fix new kernel-doc warnings splice: fix kernel-doc warnings Warning(fs/splice.c:617): bad line: Warning(fs/splice.c:722): No description found for parameter 'sd' Warning(fs/splice.c:722): Excess function parameter 'pipe' description in 'splice_from_pipe_begin' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- fs/splice.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 5384a90665d..666953d59a3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -614,7 +614,6 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe) * @actor: handler that splices the data * * Description: - * This function loops over the pipe and calls @actor to do the * actual moving of a single struct pipe_buffer to the desired * destination. 
It returns when there's no more buffers left in @@ -711,7 +710,7 @@ EXPORT_SYMBOL(splice_from_pipe_next); /** * splice_from_pipe_begin - start splicing from pipe - * @pipe: pipe to splice from + * @sd: information about the splice operation * * Description: * This function should be called before a loop containing -- cgit v1.2.3 From d29a2e943867bfa48f72ee6e99723a1b29fe6f7e Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 17 Apr 2009 12:22:35 +0100 Subject: vfat: Note the NLS requirement Close bug #4754. Stop people getting into a situation where they can't get their FAT filesystems to mount as they expect. Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- fs/fat/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index d0a69ff2537..182f9ffe2b5 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -95,3 +95,6 @@ config FAT_DEFAULT_IOCHARSET Note that "utf8" is not recommended for FAT filesystems. If unsure, you shouldn't set "utf8" here. See <file:Documentation/filesystems/vfat.txt> for more information. + + Enable any character sets you need in File Systems/Native Language + Support. -- cgit v1.2.3 From 6566abdbd0566fc1b5950c9f87ef57c7443d6fa8 Mon Sep 17 00:00:00 2001 From: Matt Kraai Date: Fri, 17 Apr 2009 12:56:38 +0100 Subject: AFS: Guard afs_file_readpage_read_complete() definition with CONFIG_AFS_FSCACHE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_AFS_FSCACHE is not defined, the following warning is displayed when fs/afs/file.c is compiled: fs/afs/file.c:111: warning: ‘afs_file_readpage_read_complete’ defined but not used This occurs because all calls to this function are guarded by CONFIG_AFS_FSCACHE. Thus, guard its definition as well. 
Signed-off-by: Matt Kraai Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index 7a1d942ef68..0149dab365e 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -102,6 +102,7 @@ int afs_release(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_AFS_FSCACHE /* * deal with notification that a page was read from the cache */ @@ -117,6 +118,7 @@ static void afs_file_readpage_read_complete(struct page *page, SetPageUptodate(page); unlock_page(page); } +#endif /* * AFS read page from file, directory or symlink -- cgit v1.2.3 From fc6f394332ef1bf6ff5fbeaba0f2cd7a3c7971b6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 17 Apr 2009 11:45:30 -0400 Subject: cifs: when renaming don't try to unlink negative dentry When attempting to rename a file on a read-only share, the kernel can call cifs_unlink on a negative dentry, which causes an oops. Only try to unlink the file if it's a positive dentry. 
Signed-off-by: Jeff Layton Tested-by: Shirish Pargaonkar CC: Stable Signed-off-by: Steve French --- fs/cifs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 09082ac8518..f36b4e40e44 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1453,7 +1453,8 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, checking the UniqueId via FILE_INTERNAL_INFO */ unlink_target: - if ((rc == -EACCES) || (rc == -EEXIST)) { + /* Try unlinking the target dentry if it's not negative */ + if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { tmprc = cifs_unlink(target_dir, target_dentry); if (tmprc) goto cifs_rename_exit; -- cgit v1.2.3 From 613cbe3d4870429bf2e816d4bbe3146d157ee5c1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 19 Apr 2009 18:40:43 +0200 Subject: Don't set relatime when noatime is specified Since commit 0a1c01c9477602ee8b44548a9405b2c1d587b5a2 ("Make relatime default") when a file system is mounted explicitly with noatime it gets both the MNT_RELATIME and MNT_NOATIME bits set. This shows up like this in /proc/mounts: /dev/xxx /yyy ext3 rw,noatime,relatime,errors=continue,data=writeback 0 0 That looks strange. The VFS uses noatime in this case, but both flags are set. So it's more a cosmetic issue, but still better to fix. 
Cc: mjg@redhat.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- fs/namespace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index c6f54e4c429..d9138f81ec1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1920,8 +1920,9 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; - /* Default to relatime */ - mnt_flags |= MNT_RELATIME; + /* Default to relatime unless overridden */ + if (!(flags & MS_NOATIME)) + mnt_flags |= MNT_RELATIME; /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) -- cgit v1.2.3