From ab0920ce7ebb6d60063c793f227ae198a492251b Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 16 Mar 2006 15:06:37 -0800 Subject: ocfs2: multi node truncate fix Fix ocfs2_truncate_file() so that it forces a truncate_inode_pages() on all interested nodes in all cases of a truncate(), not just allocation change. Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 34e903a6a46..581eb451a41 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -260,6 +260,17 @@ static int ocfs2_truncate_file(struct inode *inode, if (new_i_size == le64_to_cpu(fe->i_size)) goto bail; + /* This forces other nodes to sync and drop their pages. Do + * this even if we have a truncate without allocation change - + * ocfs2 cluster sizes can be much greater than page size, so + * we have to truncate them anyway. */ + status = ocfs2_data_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + ocfs2_data_unlock(inode, 1); + if (le32_to_cpu(fe->i_clusters) == ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", @@ -272,14 +283,6 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail; } - /* This forces other nodes to sync and drop their pages */ - status = ocfs2_data_lock(inode, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - ocfs2_data_unlock(inode, 1); - /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the * truncate if necessary. This does the task of marking -- cgit v1.2.3 From 1f7bc828e30fe3e23ea0968b9595ad20e2785978 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Mar 2006 10:33:35 -0800 Subject: ocfs2: remove an overly aggressive BUG() in dlmfs Don't BUG() user_dlm_unblock_lock() on the absence of the USER_LOCK_BLOCKED flag - this turns out to be a valid case. Make some of the related BUG() statements print more useful information. Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index c3764f4744e..bac4615965f 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -268,13 +268,26 @@ static void user_dlm_unblock_lock(void *opaque) spin_lock(&lockres->l_lock); - BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); - BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED)); + mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), + "Lockres %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); - /* notice that we don't clear USER_LOCK_BLOCKED here. That's - * for user_ast to do. */ + /* notice that we don't clear USER_LOCK_BLOCKED here. If it's + * set, we want user_ast clear it. */ lockres->l_flags &= ~USER_LOCK_QUEUED; + /* It's valid to get here and no longer be blocked - if we get + * several basts in a row, we might be queued by the first + * one, the unblock thread might run and clear the queued + * flag, and finally we might get another bast which re-queues + * us before our ast for the downconvert is called. 
*/ + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { + mlog(0, "Lockres %s, flags 0x%x: queued but not blocking\n", + lockres->l_name, lockres->l_flags); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { mlog(0, "lock is in teardown so we do nothing\n"); spin_unlock(&lockres->l_lock); -- cgit v1.2.3 From cc6eb725955efb026007e1d7da8fe5383981afd2 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Mar 2006 10:34:21 -0800 Subject: ocfs2: catch an invalid ast case in dlmfs Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index bac4615965f..d0f1027a385 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -139,6 +139,10 @@ static void user_ast(void *opaque) return; } + mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, + "Lockres %s, requested ivmode. flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* we're downconverting. */ if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= -- cgit v1.2.3 From f43e6918c0e3906fd4483316f6a1a07bba615908 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Mar 2006 18:24:12 -0800 Subject: ocfs2: Handle the DLM_CANCELGRANT case in user_unlock_ast() Remove the code which attempted to catch it via dlmunlock() return status - this never happens there. Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index d0f1027a385..808ec0527c7 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -233,23 +233,38 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name); - if (status != DLM_NORMAL) + if (status != DLM_NORMAL && status != DLM_CANCELGRANT) mlog(ML_ERROR, "Dlm returns status %d\n", status); spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) lockres->l_level = LKM_IVMODE; - else { + else if (status == DLM_CANCELGRANT) { + mlog(0, "Lock %s, cancel fails, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* We tried to cancel a convert request, but it was + * already granted. Don't clear the busy flag - the + * ast should've done this already. */ + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + goto out_noclear; + } else { + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + /* Cancel succeeded, we want to re-queue */ + mlog(0, "Lock %s, cancel succeeds, flags 0x%x\n", + lockres->l_name, lockres->l_flags); lockres->l_requested = LKM_IVMODE; /* cancel an * upconvert * request. */ lockres->l_flags &= ~USER_LOCK_IN_CANCEL; /* we want the unblock thread to look at it again * now. 
*/ - __user_dlm_queue_lockres(lockres); + if (lockres->l_flags & USER_LOCK_BLOCKED) + __user_dlm_queue_lockres(lockres); } lockres->l_flags &= ~USER_LOCK_BUSY; +out_noclear: spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); @@ -299,7 +314,9 @@ static void user_dlm_unblock_lock(void *opaque) } if (lockres->l_flags & USER_LOCK_BUSY) { - mlog(0, "BUSY flag detected...\n"); + mlog(0, "Cancel lock %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { spin_unlock(&lockres->l_lock); goto drop_ref; @@ -313,14 +330,7 @@ static void user_dlm_unblock_lock(void *opaque) LKM_CANCEL, user_unlock_ast, lockres); - if (status == DLM_CANCELGRANT) { - /* If we got this, then the ast was fired - * before we could cancel. We cleanup our - * state, and restart the function. */ - spin_lock(&lockres->l_lock); - lockres->l_flags &= ~USER_LOCK_IN_CANCEL; - spin_unlock(&lockres->l_lock); - } else if (status != DLM_NORMAL) + if (status != DLM_NORMAL) user_log_dlm_error("dlmunlock", status, lockres); goto drop_ref; } -- cgit v1.2.3 From 2cd9888590c52ac7592e3607d0a3174ccd57ef86 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Mar 2006 16:49:13 -0800 Subject: ocfs2: test and set teardown flag early in user_dlm_destroy_lock() Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index 808ec0527c7..74ca4e5f976 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -237,9 +237,13 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) mlog(ML_ERROR, "Dlm returns status %d\n", status); spin_lock(&lockres->l_lock); - if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) + /* The teardown flag gets set early during the unlock process, + * so test the cancel flag to make sure that this ast isn't + * for a concurrent cancel. */ + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN + && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { lockres->l_level = LKM_IVMODE; - else if (status == DLM_CANCELGRANT) { + } else if (status == DLM_CANCELGRANT) { mlog(0, "Lock %s, cancel fails, flags 0x%x\n", lockres->l_name, lockres->l_flags); /* We tried to cancel a convert request, but it was @@ -608,6 +612,14 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) mlog(0, "asked to destroy %s\n", lockres->l_name); spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(0, "Lock is already torn down\n"); + spin_unlock(&lockres->l_lock); + return 0; + } + + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + while (lockres->l_flags & USER_LOCK_BUSY) { spin_unlock(&lockres->l_lock); @@ -633,7 +645,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; - lockres->l_flags |= USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); mlog(0, "unlocking lockres %s\n", lockres->l_name); -- cgit v1.2.3 From a9e2ae39170d01937725e1fff2e606baaa71346c Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 24 Mar 2006 14:20:17 -0800 Subject: ocfs2: Better I/O error handling in heartbeat Propagate errors received in o2hb_bio_end_io() back to the heartbeat thread so it can skip re-arming the timer. 
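In condensed form, the mechanism is a new error field in the shared wait context: the bio completion callback records the first failure it sees, and the heartbeat thread checks it once the I/O has been waited on (field names as introduced by the hunks below):

	struct o2hb_bio_wait_ctxt {
		atomic_t		wc_num_reqs;
		struct completion	wc_io_complete;
		int			wc_error;	/* first I/O error seen, else 0 */
	};

	/* in o2hb_bio_end_io(): remember the failure for the waiter */
	if (error)
		wc->wc_error = error;

	/* in the heartbeat thread, after o2hb_wait_on_io(): */
	if (write_wc.wc_error)
		return write_wc.wc_error;	/* do not re-arm the write timeout */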
Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/heartbeat.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bff0f0d0686..21f38accd03 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -153,6 +153,7 @@ struct o2hb_region { struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; + int wc_error; }; static void o2hb_write_timeout(void *arg) @@ -186,6 +187,7 @@ static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, { atomic_set(&wc->wc_num_reqs, num_ios); init_completion(&wc->wc_io_complete); + wc->wc_error = 0; } /* Used in error paths too */ @@ -218,8 +220,10 @@ static int o2hb_bio_end_io(struct bio *bio, { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (error) + if (error) { mlog(ML_ERROR, "IO Error %d\n", error); + wc->wc_error = error; + } if (bio->bi_size) return 1; @@ -390,6 +394,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, bail_and_wait: o2hb_wait_on_io(reg, &wc); + if (wc.wc_error && !status) + status = wc.wc_error; if (bios) { for(i = 0; i < num_bios; i++) @@ -790,20 +796,24 @@ static int o2hb_highest_node(unsigned long *nodes, return highest; } -static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) +static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct bio *write_bio; struct o2hb_bio_wait_ctxt write_wc; - if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes))) - return; + ret = o2nm_configured_node_map(configured_nodes, + sizeof(configured_nodes)); + if (ret) { + mlog_errno(ret); + return ret; + } highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); - return; + return -EINVAL; } /* No sense in reading the slots of nodes that don't exist @@ -813,7 +823,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_read_slots(reg, highest_node + 1); if (ret < 0) { mlog_errno(ret); - return; + return ret; } /* With an up to date view of the slots, we can check that no @@ -831,7 +841,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); if (ret < 0) { mlog_errno(ret); - return; + return ret; } i = -1; @@ -847,6 +857,15 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) */ o2hb_wait_on_io(reg, &write_wc); bio_put(write_bio); + if (write_wc.wc_error) { + /* Do not re-arm the write timeout on I/O error - we + * can't be sure that the new block ever made it to + * disk */ + mlog(ML_ERROR, "Write error %d on device \"%s\"\n", + write_wc.wc_error, reg->hr_dev_name); + return write_wc.wc_error; + } + o2hb_arm_write_timeout(reg); /* let the person who launched us know when things are steady */ @@ -854,6 +873,8 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) if (atomic_dec_and_test(®->hr_steady_iterations)) wake_up(&o2hb_steady_queue); } + + return 0; } /* Subtract b from a, storing the result in a. a *must* have a larger @@ -913,7 +934,10 @@ static int o2hb_thread(void *data) * likely to time itself out. 
*/ do_gettimeofday(&before_hb); - o2hb_do_disk_heartbeat(reg); + i = 0; + do { + ret = o2hb_do_disk_heartbeat(reg); + } while (ret && ++i < 2); do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); -- cgit v1.2.3 From c7f21e4f5a3d4e378e4d453b2be209dcfd1bb964 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:01:01 +0200 Subject: [PATCH] splice: mark the io page as accessed We should do that, since we do the LRU manipulation ourselves now. Suggested by Nick Piggin. Signed-off-by: Jens Axboe --- fs/splice.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index bfa42a277bb..b450acdff39 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -501,6 +501,7 @@ find_page: } else if (ret) goto out; + mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { -- cgit v1.2.3 From 9aefe431f5a000884db7ae74ac208de814fe5913 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 10 Apr 2006 09:02:40 +0200 Subject: [PATCH] splice: potential !page dereference We can get to out: with a NULL page, which we probably don't want to be calling page_cache_release() on. Signed-off-by: Dave Jones Signed-off-by: Jens Axboe --- fs/splice.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index b450acdff39..26f5f7ecee5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -445,7 +445,7 @@ find_page: ret = -ENOMEM; page = find_or_create_page(mapping, index, gfp_mask); if (!page) - goto out; + goto out_nomem; /* * If the page is uptodate, it is also locked. If it isn't @@ -508,6 +508,7 @@ out: page_cache_release(page); unlock_page(page); } +out_nomem: buf->ops->unmap(info, buf); return ret; } -- cgit v1.2.3 From c0bd1f650bd06a43435808d44f1e9520ea806206 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:03:32 +0200 Subject: [PATCH] splice: only call wake_up_interruptible() when we really have to __wake_up_common() is pretty heavy in the kernel profiles, this brings it down to a more acceptable level. 
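The hunks below all apply the standard conditional wake-up idiom: the pipe state is updated first, smp_mb() orders that update against the waitqueue_active() check (pairing with the barrier taken on the sleeper's side in prepare_to_wait()/schedule(), so a task that is about to sleep is not missed), and the comparatively expensive wake-up call is made only when a waiter can actually be present:

	if (do_wakeup) {
		smp_mb();	/* order state update vs. waiter check */
		if (waitqueue_active(PIPE_WAIT(*inode)))
			wake_up_interruptible(PIPE_WAIT(*inode));
		kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
	}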
Signed-off-by: Jens Axboe --- fs/splice.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 26f5f7ecee5..9f796b1034d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -187,7 +187,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); do_wakeup = 0; @@ -201,7 +203,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, mutex_unlock(PIPE_MUTEX(*inode)); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); } @@ -600,7 +604,9 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); do_wakeup = 0; } @@ -611,7 +617,9 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, mutex_unlock(PIPE_MUTEX(*inode)); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } -- cgit v1.2.3 From 16c523ddabcce5d3d817f4a2491d628f84dfaaa1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:03:58 +0200 Subject: [PATCH] splice: cleanup __generic_file_splice_read() The whole shadow/pages logic got overly complex, and this simpler approach is actually faster in testing. Signed-off-by: Jens Axboe --- fs/splice.c | 59 ++++++++++------------------------------------------------- 1 file changed, 10 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 9f796b1034d..8b5efcc906d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -220,10 +220,10 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; - struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS]; + struct page *pages[PIPE_BUFFERS]; struct page *page; - pgoff_t index, pidx; - int i, j; + pgoff_t index; + int i; index = in->f_pos >> PAGE_CACHE_SHIFT; offset = in->f_pos & ~PAGE_CACHE_MASK; @@ -237,42 +237,14 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, */ do_page_cache_readahead(mapping, in, index, nr_pages); - /* - * Get as many pages from the page cache as possible.. - * Start IO on the page cache entries we create (we - * can assume that any pre-existing ones we find have - * already had IO started on them). 
- */ - i = find_get_pages(mapping, index, nr_pages, pages); - - /* - * common case - we found all pages and they are contiguous, - * kick them off - */ - if (i && (pages[i - 1]->index == index + i - 1)) - goto splice_them; - - /* - * fill shadow[] with pages at the right locations, so we only - * have to fill holes - */ - memset(shadow, 0, nr_pages * sizeof(struct page *)); - for (j = 0; j < i; j++) - shadow[pages[j]->index - index] = pages[j]; - /* * now fill in the holes */ - for (i = 0, pidx = index; i < nr_pages; pidx++, i++) { - int error; - - if (shadow[i]) - continue; - + for (i = 0; i < nr_pages; i++, index++) { /* * no page there, look one up / create it */ - page = find_or_create_page(mapping, pidx, + page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); if (!page) break; @@ -280,31 +252,20 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, if (PageUptodate(page)) unlock_page(page); else { - error = mapping->a_ops->readpage(in, page); + int error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { page_cache_release(page); break; } } - shadow[i] = page; - } - - if (!i) { - for (i = 0; i < nr_pages; i++) { - if (shadow[i]) - page_cache_release(shadow[i]); - } - return 0; + pages[i] = page; } - memcpy(pages, shadow, i * sizeof(struct page *)); + if (i) + return move_to_pipe(pipe, pages, i, offset, len, flags); - /* - * Now we splice them into the pipe.. - */ -splice_them: - return move_to_pipe(pipe, pages, i, offset, len, flags); + return 0; } /** -- cgit v1.2.3 From 49d0b21be21efc07526d637e0ae935019667e532 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:04:41 +0200 Subject: [PATCH] splice: optimize the splice buffer mapping We don't really need to lock down the pages, just make sure they are uptodate. Signed-off-by: Jens Axboe --- fs/splice.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 8b5efcc906d..50c43a1e092 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -84,26 +84,43 @@ static void *page_cache_pipe_buf_map(struct file *file, struct pipe_buffer *buf) { struct page *page = buf->page; - - lock_page(page); + int err; if (!PageUptodate(page)) { - unlock_page(page); - return ERR_PTR(-EIO); - } + lock_page(page); + + /* + * Page got truncated/unhashed. 
This will cause a 0-byte + * splice, if this is the first page + */ + if (!page->mapping) { + err = -ENODATA; + goto error; + } + + /* + * uh oh, read-error from disk + */ + if (!PageUptodate(page)) { + err = -EIO; + goto error; + } - if (!page->mapping) { + /* + * page is ok afterall, fall through to mapping + */ unlock_page(page); - return ERR_PTR(-ENODATA); } - return kmap(buf->page); + return kmap(page); +error: + unlock_page(page); + return ERR_PTR(err); } static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) { - unlock_page(buf->page); kunmap(buf->page); } @@ -379,7 +396,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, int ret; /* - * after this, page will be locked and unmapped + * make sure the data in this buffer is uptodate */ src = buf->ops->map(file, info, buf); if (IS_ERR(src)) @@ -399,6 +416,9 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, if (buf->ops->steal(info, buf)) goto find_page; + /* + * this will also set the page locked + */ page = buf->page; if (add_to_page_cache(page, mapping, index, gfp_mask)) goto find_page; -- cgit v1.2.3 From 0b749ce3802428007a37870eb51ba3c0bdf90857 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:05:04 +0200 Subject: [PATCH] splice: be smarter about calling do_page_cache_readahead() We don't want to call into the read-ahead logic unless we are at the start of a page, _or_ we have multiple pages to read. Signed-off-by: Jens Axboe --- fs/splice.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 50c43a1e092..9bfd6af0cf4 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -250,9 +250,12 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, nr_pages = PIPE_BUFFERS; /* - * initiate read-ahead on this page range + * initiate read-ahead on this page range. however, don't call into + * read-ahead if this is a non-zero offset (we are likely doing small + * chunk splice and the page is already there) for a single page. */ - do_page_cache_readahead(mapping, in, index, nr_pages); + if (!offset || nr_pages > 1) + do_page_cache_readahead(mapping, in, index, nr_pages); /* * now fill in the holes -- cgit v1.2.3 From 3a326a2ce88e71d00ac0d133e314a3342a7709f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Apr 2006 15:18:35 +0200 Subject: [PATCH] introduce a "kernel-internal pipe object" abstraction separate out the 'internal pipe object' abstraction, and make it usable to splice. This cleans up and fixes several aspects of the internal splice APIs and the pipe code: - pipes: the allocation and freeing of pipe_inode_info is now more symmetric and more streamlined with existing kernel practices. - splice: small micro-optimization: less pointer dereferencing in splice methods Signed-off-by: Ingo Molnar Update XFS for the ->splice_read/->splice_write changes. 
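With the pipe object passed around directly, a filesystem that is happy with the generic page cache path can point its splice hooks straight at the generic helpers; the wrapper below is a hypothetical illustration (myfs is not part of this series), the real XFS equivalents are updated in the hunks that follow:

	static ssize_t myfs_file_splice_read(struct file *in,
					     struct pipe_inode_info *pipe,
					     size_t len, unsigned int flags)
	{
		return generic_file_splice_read(in, pipe, len, flags);
	}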
Signed-off-by: Jens Axboe --- fs/fifo.c | 12 +++-- fs/pipe.c | 51 +++++++++--------- fs/splice.c | 122 ++++++++++++++++++++++--------------------- fs/xfs/linux-2.6/xfs_file.c | 8 +-- fs/xfs/linux-2.6/xfs_lrw.c | 4 +- fs/xfs/linux-2.6/xfs_lrw.h | 4 +- fs/xfs/linux-2.6/xfs_vnode.h | 4 +- 7 files changed, 106 insertions(+), 99 deletions(-) (limited to 'fs') diff --git a/fs/fifo.c b/fs/fifo.c index 889f722ee36..b16e2f597d6 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -15,12 +15,13 @@ #include #include -static void wait_for_partner(struct inode* inode, unsigned int* cnt) +static void wait_for_partner(struct inode* inode, unsigned int *cnt) { int cur = *cnt; - while(cur == *cnt) { - pipe_wait(inode); - if(signal_pending(current)) + + while (cur == *cnt) { + pipe_wait(inode->i_pipe); + if (signal_pending(current)) break; } } @@ -37,7 +38,8 @@ static int fifo_open(struct inode *inode, struct file *filp) mutex_lock(PIPE_MUTEX(*inode)); if (!inode->i_pipe) { ret = -ENOMEM; - if(!pipe_new(inode)) + inode->i_pipe = alloc_pipe_info(inode); + if (!inode->i_pipe) goto err_nocleanup; } filp->f_version = 0; diff --git a/fs/pipe.c b/fs/pipe.c index 795df987cd3..705b4869262 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -36,7 +36,7 @@ */ /* Drop the inode semaphore and wait for a pipe event, atomically */ -void pipe_wait(struct inode * inode) +void pipe_wait(struct pipe_inode_info *pipe) { DEFINE_WAIT(wait); @@ -44,11 +44,13 @@ void pipe_wait(struct inode * inode) * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); - mutex_unlock(PIPE_MUTEX(*inode)); + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); schedule(); - finish_wait(PIPE_WAIT(*inode), &wait); - mutex_lock(PIPE_MUTEX(*inode)); + finish_wait(&pipe->wait, &wait); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); } static int @@ -223,7 +225,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } - pipe_wait(inode); + pipe_wait(inode->i_pipe); } mutex_unlock(PIPE_MUTEX(*inode)); /* Signal writers asynchronously that there is more room. 
*/ @@ -370,7 +372,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; } PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); + pipe_wait(inode->i_pipe); PIPE_WAITING_WRITERS(*inode)--; } out: @@ -675,6 +677,20 @@ static struct file_operations rdwr_pipe_fops = { .fasync = pipe_rdwr_fasync, }; +struct pipe_inode_info * alloc_pipe_info(struct inode *inode) +{ + struct pipe_inode_info *info; + + info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (info) { + init_waitqueue_head(&info->wait); + info->r_counter = info->w_counter = 1; + info->inode = inode; + } + + return info; +} + void free_pipe_info(struct inode *inode) { int i; @@ -691,23 +707,6 @@ void free_pipe_info(struct inode *inode) kfree(info); } -struct inode* pipe_new(struct inode* inode) -{ - struct pipe_inode_info *info; - - info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); - if (!info) - goto fail_page; - inode->i_pipe = info; - - init_waitqueue_head(PIPE_WAIT(*inode)); - PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; - - return inode; -fail_page: - return NULL; -} - static struct vfsmount *pipe_mnt __read_mostly; static int pipefs_delete_dentry(struct dentry *dentry) { @@ -724,8 +723,10 @@ static struct inode * get_pipe_inode(void) if (!inode) goto fail_inode; - if(!pipe_new(inode)) + inode->i_pipe = alloc_pipe_info(inode); + if (!inode->i_pipe) goto fail_iput; + PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1; inode->i_fop = &rdwr_pipe_fops; diff --git a/fs/splice.c b/fs/splice.c index 9bfd6af0cf4..ed91a62402e 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -136,34 +136,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 
*/ -static ssize_t move_to_pipe(struct inode *inode, struct page **pages, +static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, int nr_pages, unsigned long offset, unsigned long len, unsigned int flags) { - struct pipe_inode_info *info; int ret, do_wakeup, i; ret = 0; do_wakeup = 0; i = 0; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { int bufs; - if (!PIPE_READERS(*inode)) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - bufs = info->nrbufs; + bufs = pipe->nrbufs; if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1); - struct pipe_buffer *buf = info->bufs + newbuf; + int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pages[i++]; unsigned long this_len; @@ -175,7 +174,7 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, buf->offset = offset; buf->len = this_len; buf->ops = &page_cache_pipe_buf_ops; - info->nrbufs = ++bufs; + pipe->nrbufs = ++bufs; do_wakeup = 1; ret += this_len; @@ -205,25 +204,25 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, - POLL_IN); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } while (i < nr_pages) @@ -232,8 +231,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, return ret; } -static int __generic_file_splice_read(struct file *in, struct inode *pipe, - size_t len, unsigned int flags) +static int +__generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; @@ -298,7 +298,7 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, * Will read pages from given file and fill them into a pipe. * */ -ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, +ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { ssize_t spliced; @@ -306,6 +306,7 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, ret = 0; spliced = 0; + while (len) { ret = __generic_file_splice_read(in, pipe, len, flags); @@ -509,11 +510,10 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 
*/ -static ssize_t move_from_pipe(struct inode *inode, struct file *out, +static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, size_t len, unsigned int flags, splice_actor *actor) { - struct pipe_inode_info *info; int ret, do_wakeup, err; struct splice_desc sd; @@ -525,22 +525,22 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, sd.file = out; sd.pos = out->f_pos; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { - int bufs = info->nrbufs; + int bufs = pipe->nrbufs; if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; if (sd.len > sd.total_len) sd.len = sd.total_len; - err = actor(info, buf, &sd); + err = actor(pipe, buf, &sd); if (err) { if (!ret && err != -ENODATA) ret = err; @@ -553,10 +553,10 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, buf->len -= sd.len; if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); + ops->release(pipe, buf); curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1); - info->curbuf = curbuf; - info->nrbufs = --bufs; + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; do_wakeup = 1; } @@ -568,9 +568,9 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, if (bufs) continue; - if (!PIPE_WRITERS(*inode)) + if (!pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!pipe->waiting_writers) { if (ret) break; } @@ -589,22 +589,23 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); do_wakeup = 0; } - pipe_wait(inode); + pipe_wait(pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } mutex_lock(&out->f_mapping->host->i_mutex); @@ -616,7 +617,7 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, /** * generic_file_splice_write - splice data from a pipe to a file - * @inode: pipe inode + * @pipe: pipe info * @out: file to write to * @len: number of bytes to splice * @flags: splice modifier flags @@ -625,11 +626,14 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, * the given pipe inode to the given file. * */ -ssize_t generic_file_splice_write(struct inode *inode, struct file *out, - size_t len, unsigned int flags) +ssize_t +generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; - ssize_t ret = move_from_pipe(inode, out, len, flags, pipe_to_file); + ssize_t ret; + + ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); /* * if file or inode is SYNC and we actually wrote some data, sync it @@ -664,10 +668,10 @@ EXPORT_SYMBOL(generic_file_splice_write); * is involved. 
* */ -ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, +ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, size_t len, unsigned int flags) { - return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); + return move_from_pipe(pipe, out, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -675,8 +679,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. */ -static long do_splice_from(struct inode *pipe, struct file *out, size_t len, - unsigned int flags) +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + size_t len, unsigned int flags) { loff_t pos; int ret; @@ -698,8 +702,8 @@ static long do_splice_from(struct inode *pipe, struct file *out, size_t len, /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, struct inode *pipe, size_t len, - unsigned int flags) +static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { loff_t pos, isize, left; int ret; @@ -732,14 +736,14 @@ static long do_splice_to(struct file *in, struct inode *pipe, size_t len, static long do_splice(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct inode *pipe; + struct pipe_inode_info *pipe; - pipe = in->f_dentry->d_inode; - if (pipe->i_pipe) + pipe = in->f_dentry->d_inode->i_pipe; + if (pipe) return do_splice_from(pipe, out, len, flags); - pipe = out->f_dentry->d_inode; - if (pipe->i_pipe) + pipe = out->f_dentry->d_inode->i_pipe; + if (pipe) return do_splice_to(in, pipe, len, flags); return -EINVAL; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ae4c4754ed3..269721af02f 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -252,7 +252,7 @@ xfs_file_sendfile_invis( STATIC ssize_t xfs_file_splice_read( struct file *infilp, - struct inode *pipe, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) { @@ -266,7 +266,7 @@ xfs_file_splice_read( STATIC ssize_t xfs_file_splice_read_invis( struct file *infilp, - struct inode *pipe, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) { @@ -279,7 +279,7 @@ xfs_file_splice_read_invis( STATIC ssize_t xfs_file_splice_write( - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, size_t len, unsigned int flags) @@ -293,7 +293,7 @@ xfs_file_splice_write( STATIC ssize_t xfs_file_splice_write_invis( - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, size_t len, unsigned int flags) diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 90cd314acba..74a52937f20 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -338,7 +338,7 @@ ssize_t xfs_splice_read( bhv_desc_t *bdp, struct file *infilp, - struct inode *pipe, + struct pipe_inode_info *pipe, size_t count, int flags, int ioflags, @@ -380,7 +380,7 @@ xfs_splice_read( ssize_t xfs_splice_write( bhv_desc_t *bdp, - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, size_t count, int flags, diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index eaa5659713f..55c689a86ad 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -94,9 +94,9 @@ extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, - struct inode *, size_t, 
int, int, + struct pipe_inode_info *, size_t, int, int, struct cred *); -extern ssize_t xfs_splice_write(struct bhv_desc *, struct inode *, +extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, struct file *, size_t, int, int, struct cred *); diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 6f1c79a28f8..88b09f18628 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -174,9 +174,9 @@ typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, - struct inode *, size_t, int, int, + struct pipe_inode_info *, size_t, int, int, struct cred *); -typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct inode *, +typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, struct file *, size_t, int, int, struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, -- cgit v1.2.3 From 529565dcb1581c9a1e3f6df1c1763ca3e0f0d512 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Apr 2006 15:18:58 +0200 Subject: [PATCH] splice: add optional input and output offsets add optional input and output offsets to sys_splice(), for seekable file descriptors: asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, int fd_out, loff_t __user *off_out, size_t len, unsigned int flags); semantics are straightforward: f_pos will be updated with the offset provided by user-space, before the splice transfer is about to begin. Providing a NULL offset pointer means the existing f_pos will be used (and updated in situ). Providing an offset for a pipe results in -ESPIPE. Providing an invalid offset pointer results in -EFAULT. Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/splice.c | 54 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index ed91a62402e..a5326127aad 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -680,7 +680,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t __user *off_out, size_t len, + unsigned int flags) { loff_t pos; int ret; @@ -691,7 +692,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (!(out->f_mode & FMODE_WRITE)) return -EBADF; + if (off_out && copy_from_user(&out->f_pos, off_out, sizeof(loff_t))) + return -EFAULT; + pos = out->f_pos; + ret = rw_verify_area(WRITE, out, &pos, len); if (unlikely(ret < 0)) return ret; @@ -702,8 +707,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, /* * Attempt to initiate a splice from a file to a pipe. 
*/ -static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +static long do_splice_to(struct file *in, loff_t __user *off_in, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { loff_t pos, isize, left; int ret; @@ -714,7 +720,11 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, if (!(in->f_mode & FMODE_READ)) return -EBADF; + if (off_in && copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + return -EFAULT; + pos = in->f_pos; + ret = rw_verify_area(READ, in, &pos, len); if (unlikely(ret < 0)) return ret; @@ -733,23 +743,39 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, /* * Determine where to splice to/from. */ -static long do_splice(struct file *in, struct file *out, size_t len, - unsigned int flags) +static long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) { struct pipe_inode_info *pipe; + if (off_out && out->f_op->llseek == no_llseek) + return -EINVAL; + if (off_in && in->f_op->llseek == no_llseek) + return -EINVAL; + pipe = in->f_dentry->d_inode->i_pipe; - if (pipe) - return do_splice_from(pipe, out, len, flags); + if (pipe) { + if (off_in) + return -ESPIPE; + + return do_splice_from(pipe, out, off_out, len, flags); + } pipe = out->f_dentry->d_inode->i_pipe; - if (pipe) - return do_splice_to(in, pipe, len, flags); + if (pipe) { + if (off_out) + return -ESPIPE; + + return do_splice_to(in, off_in, pipe, len, flags); + } return -EINVAL; } -asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) +asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, + int fd_out, loff_t __user *off_out, + size_t len, unsigned int flags) { long error; struct file *in, *out; @@ -759,13 +785,15 @@ asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) return 0; error = -EBADF; - in = fget_light(fdin, &fput_in); + in = fget_light(fd_in, &fput_in); if (in) { if (in->f_mode & FMODE_READ) { - out = fget_light(fdout, &fput_out); + out = fget_light(fd_out, &fput_out); if (out) { if (out->f_mode & FMODE_WRITE) - error = do_splice(in, out, len, flags); + error = do_splice(in, off_in, + out, off_out, + len, flags); fput_light(out, fput_out); } } -- cgit v1.2.3 From cbca692c246874a3cc1b5a9b694add4c39e8bc18 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Thu, 23 Mar 2006 00:36:54 +0100 Subject: [PATCH] Bogus NULL pointer check in fs/configfs/dir.c We check the "group" pointer after we dereference it. This check is bogus, as it cannot be NULL coming in. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 8ed9b06a982..5638c8f9362 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -504,7 +504,7 @@ static int populate_groups(struct config_group *group) int ret = 0; int i; - if (group && group->default_groups) { + if (group->default_groups) { /* FYI, we're faking mkdir here * I'm not sure we need this semaphore, as we're called * from our parent's mkdir. That holds our parent's -- cgit v1.2.3 From 65714b918415e06c92426f6544b2296dae694590 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 26 Mar 2006 14:25:52 +0200 Subject: [PATCH] CONFIGFS_FS must depend on SYSFS This patch fixes the a compile error with CONFIG_SYSFS=n Configfs is creating, as a matter of policy, the /sys/kernel/config mountpoint. 
This means it requires CONFIG_SYSFS. Signed-off-by: Adrian Bunk Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index e207be68d4c..97f31741312 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -861,7 +861,7 @@ config RAMFS config CONFIGFS_FS tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on SYSFS && EXPERIMENTAL help configfs is a ram-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based -- cgit v1.2.3 From de12a7878c11f3b282d640888aa635e0711d0b5e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Apr 2006 17:16:49 -0600 Subject: [PATCH] de_thread: Don't confuse users do_each_thread. Oleg Nesterov spotted two interesting bugs with the current de_thread code. The simplest is a long standing double decrement of __get_cpu_var(process_counts) in __unhash_process. Caused by two processes exiting when only one was created. The other is that since we no longer detach from the thread_group list it is possible for do_each_thread when run under the tasklist_lock to see the same task_struct twice. Once on the task list as a thread_group_leader, and once on the thread list of another thread. The double appearance in do_each_thread can cause a double increment of mm_core_waiters in zap_threads resulting in problems later on in coredump_wait. To remedy those two problems this patch takes the simple approach of changing the old thread group leader into a child thread. The only routine in release_task that cares is __unhash_process, and it can be trivially seen that we handle cleaning up a thread group leader properly. Since de_thread doesn't change the pid of the exiting leader process and instead shares it with the new leader process. I change thread_group_leader to recognize group leadership based on the group_leader field and not based on pids. This should also be slightly cheaper then the existing thread_group_leader macro. I performed a quick audit and I couldn't see any user of thread_group_leader that cared about the difference. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- fs/exec.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 0291a68a362..4d38ad0b70d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -723,7 +723,12 @@ static int de_thread(struct task_struct *tsk) current->parent = current->real_parent = leader->real_parent; leader->parent = leader->real_parent = child_reaper; current->group_leader = current; - leader->group_leader = leader; + leader->group_leader = current; + + /* Reduce leader to a thread */ + detach_pid(leader, PIDTYPE_PGID); + detach_pid(leader, PIDTYPE_SID); + list_del_init(&leader->tasks); add_parent(current); add_parent(leader); -- cgit v1.2.3 From e50bd16fe49689bc5fb54fca5ed8b568dfba65c6 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 11 Apr 2006 15:10:45 +1000 Subject: [XFS] Fix superblock validation regression for the zero imaxpct case. Thanks to kjamieson for noticing. 
SGI-PV: 951661 SGI-Modid: xfs-linux-melb:xfs-kern:25675a Signed-off-by: Nathan Scott --- fs/xfs/xfs_mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 049fabb7f7e..c0b1c290688 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -270,7 +270,7 @@ xfs_mount_validate_sb( (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || - (sbp->sb_imax_pct > 100 || sbp->sb_imax_pct < 1))) { + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); return XFS_ERROR(EFSCORRUPTED); } -- cgit v1.2.3 From 8272145c05c6d01a34f5114357c5e8093fb66472 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 11 Apr 2006 15:10:55 +1000 Subject: [XFS] Fix a writepage regression where we accidentally stopped honouring nonblock mode with the new IO path code (since 2.6.16). SGI-PV: 951662 SGI-Modid: xfs-linux-melb:xfs-kern:25676a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 6cbbd165c60..4d191ef39b6 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -870,12 +870,14 @@ xfs_page_state_convert( pgoff_t end_index, last_index, tlast; ssize_t size, len; int flags, err, iomap_valid = 0, uptodate = 1; - int page_dirty, count = 0, trylock_flag = 0; + int page_dirty, count = 0; + int trylock = 0; int all_bh = unmapped; - /* wait for other IO threads? */ - if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)) - trylock_flag |= BMAPI_TRYLOCK; + if (startio) { + if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + trylock |= BMAPI_TRYLOCK; + } /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -956,15 +958,13 @@ xfs_page_state_convert( if (buffer_unwritten(bh)) { type = IOMAP_UNWRITTEN; - flags = BMAPI_WRITE|BMAPI_IGNSTATE; + flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { type = IOMAP_DELAY; - flags = BMAPI_ALLOCATE; - if (!startio) - flags |= trylock_flag; + flags = BMAPI_ALLOCATE | trylock; } else { type = IOMAP_NEW; - flags = BMAPI_WRITE|BMAPI_MMAP; + flags = BMAPI_WRITE | BMAPI_MMAP; } if (!iomap_valid) { -- cgit v1.2.3 From 1fc5d959d88a5f77aa7e4435f6c9d0e2d2236704 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Tue, 11 Apr 2006 15:11:12 +1000 Subject: [XFS] Fix inode reclaim scalability regression. When a filesystem has millions of inodes cached and has sparse cluster population, removing inodes from the cluster hash consumes excessive amounts of CPU time. Reduce the CPU cost by making removal O(1) via use of a double linked list for the hash chains. 
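With only a forward pointer, xfs_iextract() had to walk ch->ch_list from the head just to find the predecessor of the entry being removed; once each entry also carries chl_prev, unlinking touches only its two neighbours. Schematically, with the names used in the hunks below:

	if (chl->chl_prev)
		chl->chl_prev->chl_next = chl->chl_next;
	else
		ch->ch_list = chl->chl_next;	/* chl was the chain head */
	if (chl->chl_next)
		chl->chl_next->chl_prev = chl->chl_prev;
	kmem_zone_free(xfs_chashlist_zone, chl);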
SGI-PV: 951551 SGI-Modid: xfs-linux-melb:xfs-kern:25683a Signed-off-by: David Chinner Signed-off-by: Nathan Scott --- fs/xfs/xfs_iget.c | 29 ++++++++++++----------------- fs/xfs/xfs_inode.h | 1 + 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index bb33113eef9..b5385432526 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -421,7 +421,10 @@ finish_inode: ip->i_chash = chlnew; chlnew->chl_ip = ip; chlnew->chl_blkno = ip->i_blkno; + if (ch->ch_list) + ch->ch_list->chl_prev = chlnew; chlnew->chl_next = ch->ch_list; + chlnew->chl_prev = NULL; ch->ch_list = chlnew; chlnew = NULL; } @@ -723,23 +726,15 @@ xfs_iextract( ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); ASSERT(ip->i_chash != NULL); chm=NULL; - for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { - if (chl->chl_blkno == ip->i_blkno) { - if (chm == NULL) { - /* first item on the list */ - ch->ch_list = chl->chl_next; - } else { - chm->chl_next = chl->chl_next; - } - kmem_zone_free(xfs_chashlist_zone, chl); - break; - } else { - ASSERT(chl->chl_ip != ip); - chm = chl; - } - } - ASSERT_ALWAYS(chl != NULL); - } else { + chl = ip->i_chash; + if (chl->chl_prev) + chl->chl_prev->chl_next = chl->chl_next; + else + ch->ch_list = chl->chl_next; + if (chl->chl_next) + chl->chl_next->chl_prev = chl->chl_prev; + kmem_zone_free(xfs_chashlist_zone, chl); + } else { /* delete one inode from a non-empty list */ iq = ip->i_cnext; iq->i_cprev = ip->i_cprev; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 39ef9c36ea5..3b544db1790 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -189,6 +189,7 @@ typedef struct xfs_ihash { */ typedef struct xfs_chashlist { struct xfs_chashlist *chl_next; + struct xfs_chashlist *chl_prev; struct xfs_inode *chl_ip; xfs_daddr_t chl_blkno; /* starting block number of * the cluster */ -- cgit v1.2.3 From 58829e490ee805f1c8b3009abc90e2a1a7a0d278 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Tue, 11 Apr 2006 15:11:20 +1000 Subject: [XFS] Fix an inode use-after-free durin an unpin. When reclaiming inodes that have been unlinked, we may need to execute transactions during reclaim. By the time the transaction has hit the disk, the linux inode and xfs vnode may already have been freed so we can't reference them safely. Use the known xfs inode state to determine if it is safe to reference the vnode and linux inode during the unpin operation. SGI-PV: 946321 SGI-Modid: xfs-linux-melb:xfs-kern:25687a Signed-off-by: David Chinner Signed-off-by: Nathan Scott --- fs/xfs/xfs_inode.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 48146bdc6bd..94b60dd0380 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2732,16 +2732,29 @@ xfs_iunpin( ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) { - vnode_t *vp = XFS_ITOV_NULL(ip); + /* + * If the inode is currently being reclaimed, the + * linux inode _and_ the xfs vnode may have been + * freed so we cannot reference either of them safely. + * Hence we should not try to do anything to them + * if the xfs inode is currently in the reclaim + * path. + * + * However, we still need to issue the unpin wakeup + * call as the inode reclaim may be blocked waiting for + * the inode to become unpinned. 
+ */ + if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) { + vnode_t *vp = XFS_ITOV_NULL(ip); - /* make sync come back and flush this inode */ - if (vp) { - struct inode *inode = vn_to_inode(vp); + /* make sync come back and flush this inode */ + if (vp) { + struct inode *inode = vn_to_inode(vp); - if (!(inode->i_state & I_NEW)) - mark_inode_dirty_sync(inode); + if (!(inode->i_state & I_NEW)) + mark_inode_dirty_sync(inode); + } } - wake_up(&ip->i_ipin_wait); } } -- cgit v1.2.3 From 8c0b5113a55c698f3190ec85925815640f1c2049 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 11 Apr 2006 15:12:45 +1000 Subject: [XFS] Fix utime(2) in the case that no times parameter was passed in. SGI-PV: 949858 SGI-Modid: xfs-linux-melb:xfs-kern:25717a Signed-off-by: Jes Sorensen Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_iops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 149237304fb..2e2e275c786 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -673,8 +673,7 @@ xfs_vn_setattr( if (ia_valid & ATTR_ATIME) { vattr.va_mask |= XFS_AT_ATIME; vattr.va_atime = attr->ia_atime; - if (ia_valid & ATTR_ATIME_SET) - inode->i_atime = attr->ia_atime; + inode->i_atime = attr->ia_atime; } if (ia_valid & ATTR_MTIME) { vattr.va_mask |= XFS_AT_MTIME; -- cgit v1.2.3 From 019ff2d57b0bbe77d1eca19f5b634e5e7ff2a0b8 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 11 Apr 2006 15:45:05 +1000 Subject: [XFS] Fix a problem in aligning inode allocations to stripe unit boundaries. SGI-PV: 951862 SGI-Modid: xfs-linux-melb:xfs-kern:25726a Signed-off-by: Nathan Scott --- fs/xfs/xfs_ialloc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 4eeb856183b..deddbd03c16 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -158,9 +158,10 @@ xfs_ialloc_ag_alloc( */ agi = XFS_BUF_TO_AGI(agbp); newino = be32_to_cpu(agi->agi_newino); - if(likely(newino != NULLAGINO)) { - args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - XFS_IALLOC_BLOCKS(args.mp); + args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + + XFS_IALLOC_BLOCKS(args.mp); + if (likely(newino != NULLAGINO && + (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, be32_to_cpu(agi->agi_seqno), args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; @@ -182,8 +183,8 @@ xfs_ialloc_ag_alloc( * Set the alignment for the allocation. * If stripe alignment is turned on then align at stripe unit * boundary. - * If the cluster size is smaller than a filesystem block - * then we're doing I/O for inodes in filesystem block size + * If the cluster size is smaller than a filesystem block + * then we're doing I/O for inodes in filesystem block size * pieces, so don't need alignment anyway. */ isaligned = 0; @@ -192,7 +193,7 @@ xfs_ialloc_ag_alloc( args.alignment = args.mp->m_dalign; isaligned = 1; } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= + args.mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) args.alignment = args.mp->m_sb.sb_inoalignmt; @@ -220,7 +221,7 @@ xfs_ialloc_ag_alloc( if ((error = xfs_alloc_vextent(&args))) return error; } - + /* * If stripe alignment is turned on, then try again with cluster * alignment. 
-- cgit v1.2.3 From b92ce55893745e011edae70830b8bc863be881f9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:52:07 +0200 Subject: [PATCH] splice: add direct fd <-> fd splicing support It's more efficient for sendfile() emulation. Basically we cache an internal private pipe and just use that as the intermediate area for pages. Direct splicing is not available from sys_splice(), it is only meant to be used for sendfile() emulation. Additional patch from Ingo Molnar to avoid the PIPE_BUFFERS loop at exit for the normal fast path. Signed-off-by: Jens Axboe --- fs/pipe.c | 10 ++-- fs/splice.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 137 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 705b4869262..036536f072c 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -691,12 +691,10 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) return info; } -void free_pipe_info(struct inode *inode) +void __free_pipe_info(struct pipe_inode_info *info) { int i; - struct pipe_inode_info *info = inode->i_pipe; - inode->i_pipe = NULL; for (i = 0; i < PIPE_BUFFERS; i++) { struct pipe_buffer *buf = info->bufs + i; if (buf->ops) @@ -707,6 +705,12 @@ void free_pipe_info(struct inode *inode) kfree(info); } +void free_pipe_info(struct inode *inode) +{ + __free_pipe_info(inode->i_pipe); + inode->i_pipe = NULL; +} + static struct vfsmount *pipe_mnt __read_mostly; static int pipefs_delete_dentry(struct dentry *dentry) { diff --git a/fs/splice.c b/fs/splice.c index a5326127aad..c47b561edac 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -680,8 +680,7 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t __user *off_out, size_t len, - unsigned int flags) + size_t len, unsigned int flags) { loff_t pos; int ret; @@ -692,9 +691,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (!(out->f_mode & FMODE_WRITE)) return -EBADF; - if (off_out && copy_from_user(&out->f_pos, off_out, sizeof(loff_t))) - return -EFAULT; - pos = out->f_pos; ret = rw_verify_area(WRITE, out, &pos, len); @@ -707,9 +703,8 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, loff_t __user *off_in, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { loff_t pos, isize, left; int ret; @@ -720,9 +715,6 @@ static long do_splice_to(struct file *in, loff_t __user *off_in, if (!(in->f_mode & FMODE_READ)) return -EBADF; - if (off_in && copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) - return -EFAULT; - pos = in->f_pos; ret = rw_verify_area(READ, in, &pos, len); @@ -740,6 +732,118 @@ static long do_splice_to(struct file *in, loff_t __user *off_in, return in->f_op->splice_read(in, pipe, len, flags); } +long do_splice_direct(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *pipe; + long ret, bytes; + umode_t i_mode; + int i; + + /* + * We require the input being a regular file, as we don't want to + * randomly drop data for eg socket -> socket splicing. Use the + * piped splicing for that! 
+ */ + i_mode = in->f_dentry->d_inode->i_mode; + if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + return -EINVAL; + + /* + * neither in nor out is a pipe, setup an internal pipe attached to + * 'out' and transfer the wanted data from 'in' to 'out' through that + */ + pipe = current->splice_pipe; + if (!pipe) { + pipe = alloc_pipe_info(NULL); + if (!pipe) + return -ENOMEM; + + /* + * We don't have an immediate reader, but we'll read the stuff + * out of the pipe right after the move_to_pipe(). So set + * PIPE_READERS appropriately. + */ + pipe->readers = 1; + + current->splice_pipe = pipe; + } + + /* + * do the splice + */ + ret = 0; + bytes = 0; + + while (len) { + size_t read_len, max_read_len; + + /* + * Do at most PIPE_BUFFERS pages worth of transfer: + */ + max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); + + ret = do_splice_to(in, pipe, max_read_len, flags); + if (unlikely(ret < 0)) + goto out_release; + + read_len = ret; + + /* + * NOTE: nonblocking mode only applies to the input. We + * must not do the output in nonblocking mode as then we + * could get stuck data in the internal pipe: + */ + ret = do_splice_from(pipe, out, read_len, + flags & ~SPLICE_F_NONBLOCK); + if (unlikely(ret < 0)) + goto out_release; + + bytes += ret; + len -= ret; + + /* + * In nonblocking mode, if we got back a short read then + * that was due to either an IO error or due to the + * pagecache entry not being there. In the IO error case + * the _next_ splice attempt will produce a clean IO error + * return value (not a short read), so in both cases it's + * correct to break out of the loop here: + */ + if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) + break; + } + + pipe->nrbufs = pipe->curbuf = 0; + + return bytes; + +out_release: + /* + * If we did an incomplete transfer we must release + * the pipe buffers in question: + */ + for (i = 0; i < PIPE_BUFFERS; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + + if (buf->ops) { + buf->ops->release(pipe, buf); + buf->ops = NULL; + } + } + pipe->nrbufs = pipe->curbuf = 0; + + /* + * If we transferred some data, return the number of bytes: + */ + if (bytes > 0) + return bytes; + + return ret; +} + +EXPORT_SYMBOL(do_splice_direct); + /* * Determine where to splice to/from. */ @@ -749,25 +853,33 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *pipe; - if (off_out && out->f_op->llseek == no_llseek) - return -EINVAL; - if (off_in && in->f_op->llseek == no_llseek) - return -EINVAL; - pipe = in->f_dentry->d_inode->i_pipe; if (pipe) { if (off_in) return -ESPIPE; + if (off_out) { + if (out->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&out->f_pos, off_out, + sizeof(loff_t))) + return -EFAULT; + } - return do_splice_from(pipe, out, off_out, len, flags); + return do_splice_from(pipe, out, len, flags); } pipe = out->f_dentry->d_inode->i_pipe; if (pipe) { if (off_out) return -ESPIPE; + if (off_in) { + if (in->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + return -EFAULT; + } - return do_splice_to(in, off_in, pipe, len, flags); + return do_splice_to(in, pipe, len, flags); } return -EINVAL; -- cgit v1.2.3 From 7480a90435673b4c717b6caf1350ec577d5f1adf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:52:47 +0200 Subject: [PATCH] splice: speedup __generic_file_splice_read Using find_get_page() is a lot faster than find_or_create_page(). This gets splice a lot closer to sendfile() for fd -> socket transfers. 
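The win is that the common case becomes a lockless page-cache lookup, with allocation, LRU insertion and readpage paid only on a miss. Roughly, the new per-page shape looks like this (kernel context, condensed from the diff below; the real code also handles failed allocations, truncated pages and AOP_TRUNCATED_PAGE retries):

    page = find_get_page(mapping, index);           /* no lock, no allocation */
    if (!page) {
            page = page_cache_alloc_cold(mapping);  /* miss: fall back to slow path */
            error = add_to_page_cache_lru(page, mapping, index,
                                          mapping_gfp_mask(mapping));
            if (!error)
                    error = mapping->a_ops->readpage(in, page);
    }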
Signed-off-by: Jens Axboe --- fs/splice.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index c47b561edac..e30743c2c06 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -240,7 +240,7 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, struct page *pages[PIPE_BUFFERS]; struct page *page; pgoff_t index; - int i; + int i, error; index = in->f_pos >> PAGE_CACHE_SHIFT; offset = in->f_pos & ~PAGE_CACHE_MASK; @@ -260,32 +260,84 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, /* * now fill in the holes */ + error = 0; for (i = 0; i < nr_pages; i++, index++) { +find_page: /* - * no page there, look one up / create it + * lookup the page for this index */ - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - if (!page) - break; + page = find_get_page(mapping, index); + if (!page) { + /* + * If in nonblock mode then dont block on + * readpage (we've kicked readahead so there + * will be asynchronous progress): + */ + if (flags & SPLICE_F_NONBLOCK) + break; + + /* + * page didn't exist, allocate one + */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; + + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_mask(mapping)); + if (unlikely(error)) { + page_cache_release(page); + break; + } - if (PageUptodate(page)) - unlock_page(page); - else { - int error = mapping->a_ops->readpage(in, page); + goto readpage; + } + + /* + * If the page isn't uptodate, we may need to start io on it + */ + if (!PageUptodate(page)) { + lock_page(page); + + /* + * page was truncated, stop here. if this isn't the + * first page, we'll just complete what we already + * added + */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + break; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } + +readpage: + /* + * need to read in the page + */ + error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { page_cache_release(page); + if (error == AOP_TRUNCATED_PAGE) + goto find_page; break; } } +fill_it: pages[i] = page; } if (i) return move_to_pipe(pipe, pages, i, offset, len, flags); - return 0; + return error; } /** -- cgit v1.2.3 From 9aeedfc4712ed58d9f7ae41596185c72b8dc97e8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:53:10 +0200 Subject: [PATCH] get rid of the PIPE_*() macros get rid of the PIPE_*() macros. Scripted transformation. Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/fifo.c | 48 ++++++++++++++--------------- fs/pipe.c | 104 +++++++++++++++++++++++++++++++------------------------------- 2 files changed, 76 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/fifo.c b/fs/fifo.c index b16e2f597d6..2c27f56d730 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -28,14 +28,14 @@ static void wait_for_partner(struct inode* inode, unsigned int *cnt) static void wake_up_partner(struct inode* inode) { - wake_up_interruptible(PIPE_WAIT(*inode)); + wake_up_interruptible(&inode->i_pipe->wait); } static int fifo_open(struct inode *inode, struct file *filp) { int ret; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); if (!inode->i_pipe) { ret = -ENOMEM; inode->i_pipe = alloc_pipe_info(inode); @@ -55,18 +55,18 @@ static int fifo_open(struct inode *inode, struct file *filp) * opened, even when there is no process writing the FIFO. 
*/ filp->f_op = &read_fifo_fops; - PIPE_RCOUNTER(*inode)++; - if (PIPE_READERS(*inode)++ == 0) + inode->i_pipe->r_counter++; + if (inode->i_pipe->readers++ == 0) wake_up_partner(inode); - if (!PIPE_WRITERS(*inode)) { + if (!inode->i_pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress POLLHUP until we have * seen a writer */ - filp->f_version = PIPE_WCOUNTER(*inode); + filp->f_version = inode->i_pipe->w_counter; } else { - wait_for_partner(inode, &PIPE_WCOUNTER(*inode)); + wait_for_partner(inode, &inode->i_pipe->w_counter); if(signal_pending(current)) goto err_rd; } @@ -80,16 +80,16 @@ static int fifo_open(struct inode *inode, struct file *filp) * errno=ENXIO when there is no process reading the FIFO. */ ret = -ENXIO; - if ((filp->f_flags & O_NONBLOCK) && !PIPE_READERS(*inode)) + if ((filp->f_flags & O_NONBLOCK) && !inode->i_pipe->readers) goto err; filp->f_op = &write_fifo_fops; - PIPE_WCOUNTER(*inode)++; - if (!PIPE_WRITERS(*inode)++) + inode->i_pipe->w_counter++; + if (!inode->i_pipe->writers++) wake_up_partner(inode); - if (!PIPE_READERS(*inode)) { - wait_for_partner(inode, &PIPE_RCOUNTER(*inode)); + if (!inode->i_pipe->readers) { + wait_for_partner(inode, &inode->i_pipe->r_counter); if (signal_pending(current)) goto err_wr; } @@ -104,11 +104,11 @@ static int fifo_open(struct inode *inode, struct file *filp) */ filp->f_op = &rdwr_fifo_fops; - PIPE_READERS(*inode)++; - PIPE_WRITERS(*inode)++; - PIPE_RCOUNTER(*inode)++; - PIPE_WCOUNTER(*inode)++; - if (PIPE_READERS(*inode) == 1 || PIPE_WRITERS(*inode) == 1) + inode->i_pipe->readers++; + inode->i_pipe->writers++; + inode->i_pipe->r_counter++; + inode->i_pipe->w_counter++; + if (inode->i_pipe->readers == 1 || inode->i_pipe->writers == 1) wake_up_partner(inode); break; @@ -118,27 +118,27 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! */ - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; err_rd: - if (!--PIPE_READERS(*inode)) - wake_up_interruptible(PIPE_WAIT(*inode)); + if (!--inode->i_pipe->readers) + wake_up_interruptible(&inode->i_pipe->wait); ret = -ERESTARTSYS; goto err; err_wr: - if (!--PIPE_WRITERS(*inode)) - wake_up_interruptible(PIPE_WAIT(*inode)); + if (!--inode->i_pipe->writers) + wake_up_interruptible(&inode->i_pipe->wait); ret = -ERESTARTSYS; goto err; err: - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) + if (!inode->i_pipe->readers && !inode->i_pipe->writers) free_pipe_info(inode); err_nocleanup: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return ret; } diff --git a/fs/pipe.c b/fs/pipe.c index 036536f072c..0602fc9f7eb 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -158,7 +158,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); info = inode->i_pipe; for (;;) { int bufs = info->nrbufs; @@ -202,9 +202,9 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } if (bufs) /* More to do? */ continue; - if (!PIPE_WRITERS(*inode)) + if (!inode->i_pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!inode->i_pipe->waiting_writers) { /* syscall merging: Usually we must not sleep * if O_NONBLOCK is set, or if we got some data. 
* But if a writer sleeps in kernel space, then @@ -222,16 +222,16 @@ pipe_readv(struct file *filp, const struct iovec *_iov, break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible_sync(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); } pipe_wait(inode->i_pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) file_accessed(filp); @@ -264,10 +264,10 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); info = inode->i_pipe; - if (!PIPE_READERS(*inode)) { + if (!inode->i_pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; @@ -306,7 +306,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; - if (!PIPE_READERS(*inode)) { + if (!inode->i_pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; @@ -367,19 +367,19 @@ pipe_writev(struct file *filp, const struct iovec *_iov, break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible_sync(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; + inode->i_pipe->waiting_writers++; pipe_wait(inode->i_pipe); - PIPE_WAITING_WRITERS(*inode)--; + inode->i_pipe->waiting_writers--; } out: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) file_update_time(filp); @@ -416,7 +416,7 @@ pipe_ioctl(struct inode *pino, struct file *filp, switch (cmd) { case FIONREAD: - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); info = inode->i_pipe; count = 0; buf = info->curbuf; @@ -425,7 +425,7 @@ pipe_ioctl(struct inode *pino, struct file *filp, count += info->bufs[buf].len; buf = (buf+1) & (PIPE_BUFFERS-1); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return put_user(count, (int __user *)arg); default: return -EINVAL; @@ -441,14 +441,14 @@ pipe_poll(struct file *filp, poll_table *wait) struct pipe_inode_info *info = inode->i_pipe; int nrbufs; - poll_wait(filp, PIPE_WAIT(*inode), wait); + poll_wait(filp, &inode->i_pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ nrbufs = info->nrbufs; mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; - if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode)) + if (!inode->i_pipe->writers && filp->f_version != inode->i_pipe->w_counter) mask |= POLLHUP; } @@ -458,7 +458,7 @@ pipe_poll(struct file *filp, poll_table *wait) * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). 
*/ - if (!PIPE_READERS(*inode)) + if (!inode->i_pipe->readers) mask |= POLLERR; } @@ -468,17 +468,17 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode) -= decr; - PIPE_WRITERS(*inode) -= decw; - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { + mutex_lock(&inode->i_mutex); + inode->i_pipe->readers -= decr; + inode->i_pipe->writers -= decw; + if (!inode->i_pipe->readers && !inode->i_pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; } @@ -489,9 +489,9 @@ pipe_read_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -506,9 +506,9 @@ pipe_write_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -523,14 +523,14 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); if (retval >= 0) - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -569,9 +569,9 @@ pipe_read_open(struct inode *inode, struct file *filp) { /* We could have perhaps used atomic_t, but this and friends below are the only places. So it doesn't seem worthwhile. 
*/ - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->readers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -579,9 +579,9 @@ pipe_read_open(struct inode *inode, struct file *filp) static int pipe_write_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -589,12 +589,12 @@ pipe_write_open(struct inode *inode, struct file *filp) static int pipe_rdwr_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); if (filp->f_mode & FMODE_READ) - PIPE_READERS(*inode)++; + inode->i_pipe->readers++; if (filp->f_mode & FMODE_WRITE) - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -731,7 +731,7 @@ static struct inode * get_pipe_inode(void) if (!inode->i_pipe) goto fail_iput; - PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1; + inode->i_pipe->readers = inode->i_pipe->writers = 1; inode->i_fop = &rdwr_pipe_fops; /* -- cgit v1.2.3 From 923f4f23940d2361e8d5c4245982163a8e9d1c91 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:53:33 +0200 Subject: [PATCH] pipe.c/fifo.c code cleanups more code cleanups after the macro conversion: - standardize on 'struct pipe_inode_info *pipe' variable names - introduce 'pipe' temporaries to reduce mass inode->i_pipe dereferencing Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/fifo.c | 49 ++++++++++--------- fs/pipe.c | 163 ++++++++++++++++++++++++++++++++------------------------------ 2 files changed, 111 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/fifo.c b/fs/fifo.c index 2c27f56d730..49035b174b4 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -33,14 +33,17 @@ static void wake_up_partner(struct inode* inode) static int fifo_open(struct inode *inode, struct file *filp) { + struct pipe_inode_info *pipe; int ret; mutex_lock(&inode->i_mutex); - if (!inode->i_pipe) { + pipe = inode->i_pipe; + if (!pipe) { ret = -ENOMEM; - inode->i_pipe = alloc_pipe_info(inode); - if (!inode->i_pipe) + pipe = alloc_pipe_info(inode); + if (!pipe) goto err_nocleanup; + inode->i_pipe = pipe; } filp->f_version = 0; @@ -55,18 +58,18 @@ static int fifo_open(struct inode *inode, struct file *filp) * opened, even when there is no process writing the FIFO. */ filp->f_op = &read_fifo_fops; - inode->i_pipe->r_counter++; - if (inode->i_pipe->readers++ == 0) + pipe->r_counter++; + if (pipe->readers++ == 0) wake_up_partner(inode); - if (!inode->i_pipe->writers) { + if (!pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress POLLHUP until we have * seen a writer */ - filp->f_version = inode->i_pipe->w_counter; + filp->f_version = pipe->w_counter; } else { - wait_for_partner(inode, &inode->i_pipe->w_counter); + wait_for_partner(inode, &pipe->w_counter); if(signal_pending(current)) goto err_rd; } @@ -80,16 +83,16 @@ static int fifo_open(struct inode *inode, struct file *filp) * errno=ENXIO when there is no process reading the FIFO. 
*/ ret = -ENXIO; - if ((filp->f_flags & O_NONBLOCK) && !inode->i_pipe->readers) + if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) goto err; filp->f_op = &write_fifo_fops; - inode->i_pipe->w_counter++; - if (!inode->i_pipe->writers++) + pipe->w_counter++; + if (!pipe->writers++) wake_up_partner(inode); - if (!inode->i_pipe->readers) { - wait_for_partner(inode, &inode->i_pipe->r_counter); + if (!pipe->readers) { + wait_for_partner(inode, &pipe->r_counter); if (signal_pending(current)) goto err_wr; } @@ -104,11 +107,11 @@ static int fifo_open(struct inode *inode, struct file *filp) */ filp->f_op = &rdwr_fifo_fops; - inode->i_pipe->readers++; - inode->i_pipe->writers++; - inode->i_pipe->r_counter++; - inode->i_pipe->w_counter++; - if (inode->i_pipe->readers == 1 || inode->i_pipe->writers == 1) + pipe->readers++; + pipe->writers++; + pipe->r_counter++; + pipe->w_counter++; + if (pipe->readers == 1 || pipe->writers == 1) wake_up_partner(inode); break; @@ -122,19 +125,19 @@ static int fifo_open(struct inode *inode, struct file *filp) return 0; err_rd: - if (!--inode->i_pipe->readers) - wake_up_interruptible(&inode->i_pipe->wait); + if (!--pipe->readers) + wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err_wr: - if (!--inode->i_pipe->writers) - wake_up_interruptible(&inode->i_pipe->wait); + if (!--pipe->writers) + wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err: - if (!inode->i_pipe->readers && !inode->i_pipe->writers) + if (!pipe->readers && !pipe->writers) free_pipe_info(inode); err_nocleanup: diff --git a/fs/pipe.c b/fs/pipe.c index 0602fc9f7eb..b941e1951ea 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -93,7 +93,7 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } -static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; @@ -104,8 +104,8 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff * temporary page, let's keep track of it as a one-deep * allocation cache */ - if (page_count(page) == 1 && !info->tmp_page) { - info->tmp_page = page; + if (page_count(page) == 1 && !pipe->tmp_page) { + pipe->tmp_page = page; return; } @@ -115,17 +115,17 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff page_cache_release(page); } -static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf) +static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return kmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) +static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { kunmap(buf->page); } -static int anon_pipe_buf_steal(struct pipe_inode_info *info, +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { buf->flags |= PIPE_BUF_FLAG_STOLEN; @@ -145,7 +145,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int do_wakeup; ssize_t ret; struct iovec *iov = (struct iovec *)_iov; @@ -159,12 +159,12 @@ pipe_readv(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; mutex_lock(&inode->i_mutex); - info = inode->i_pipe; + pipe = inode->i_pipe; 
for (;;) { - int bufs = info->nrbufs; + int bufs = pipe->nrbufs; if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; struct pipe_buf_operations *ops = buf->ops; void *addr; size_t chars = buf->len; @@ -173,14 +173,14 @@ pipe_readv(struct file *filp, const struct iovec *_iov, if (chars > total_len) chars = total_len; - addr = ops->map(filp, info, buf); + addr = ops->map(filp, pipe, buf); if (IS_ERR(addr)) { if (!ret) ret = PTR_ERR(addr); break; } error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); - ops->unmap(info, buf); + ops->unmap(pipe, buf); if (unlikely(error)) { if (!ret) ret = -EFAULT; break; @@ -190,10 +190,10 @@ pipe_readv(struct file *filp, const struct iovec *_iov, buf->len -= chars; if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); + ops->release(pipe, buf); curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); - info->curbuf = curbuf; - info->nrbufs = --bufs; + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; do_wakeup = 1; } total_len -= chars; @@ -202,9 +202,9 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } if (bufs) /* More to do? */ continue; - if (!inode->i_pipe->writers) + if (!pipe->writers) break; - if (!inode->i_pipe->waiting_writers) { + if (!pipe->waiting_writers) { /* syscall merging: Usually we must not sleep * if O_NONBLOCK is set, or if we got some data. * But if a writer sleeps in kernel space, then @@ -222,16 +222,16 @@ pipe_readv(struct file *filp, const struct iovec *_iov, break; } if (do_wakeup) { - wake_up_interruptible_sync(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - pipe_wait(inode->i_pipe); + pipe_wait(pipe); } mutex_unlock(&inode->i_mutex); /* Signal writers asynchronously that there is more room. 
*/ if (do_wakeup) { - wake_up_interruptible(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) file_accessed(filp); @@ -250,7 +250,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; ssize_t ret; int do_wakeup; struct iovec *iov = (struct iovec *)_iov; @@ -265,9 +265,9 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; mutex_lock(&inode->i_mutex); - info = inode->i_pipe; + pipe = inode->i_pipe; - if (!inode->i_pipe->readers) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; @@ -275,23 +275,23 @@ pipe_writev(struct file *filp, const struct iovec *_iov, /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ - if (info->nrbufs && chars != 0) { - int lastbuf = (info->curbuf + info->nrbufs - 1) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + lastbuf; + if (pipe->nrbufs && chars != 0) { + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + lastbuf; struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; if (ops->can_merge && offset + chars <= PAGE_SIZE) { void *addr; int error; - addr = ops->map(filp, info, buf); + addr = ops->map(filp, pipe, buf); if (IS_ERR(addr)) { error = PTR_ERR(addr); goto out; } error = pipe_iov_copy_from_user(offset + addr, iov, chars); - ops->unmap(info, buf); + ops->unmap(pipe, buf); ret = error; do_wakeup = 1; if (error) @@ -306,16 +306,16 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; - if (!inode->i_pipe->readers) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - bufs = info->nrbufs; + bufs = pipe->nrbufs; if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + newbuf; - struct page *page = info->tmp_page; + int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + struct page *page = pipe->tmp_page; int error; if (!page) { @@ -324,7 +324,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, ret = ret ? : -ENOMEM; break; } - info->tmp_page = page; + pipe->tmp_page = page; } /* Always wakeup, even if the copy fails. 
Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to @@ -349,8 +349,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = chars; - info->nrbufs = ++bufs; - info->tmp_page = NULL; + pipe->nrbufs = ++bufs; + pipe->tmp_page = NULL; total_len -= chars; if (!total_len) @@ -367,19 +367,19 @@ pipe_writev(struct file *filp, const struct iovec *_iov, break; } if (do_wakeup) { - wake_up_interruptible_sync(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - inode->i_pipe->waiting_writers++; - pipe_wait(inode->i_pipe); - inode->i_pipe->waiting_writers--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } out: mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) file_update_time(filp); @@ -411,21 +411,22 @@ pipe_ioctl(struct inode *pino, struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int count, buf, nrbufs; switch (cmd) { case FIONREAD: mutex_lock(&inode->i_mutex); - info = inode->i_pipe; + pipe = inode->i_pipe; count = 0; - buf = info->curbuf; - nrbufs = info->nrbufs; + buf = pipe->curbuf; + nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { - count += info->bufs[buf].len; + count += pipe->bufs[buf].len; buf = (buf+1) & (PIPE_BUFFERS-1); } mutex_unlock(&inode->i_mutex); + return put_user(count, (int __user *)arg); default: return -EINVAL; @@ -438,17 +439,17 @@ pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info = inode->i_pipe; + struct pipe_inode_info *pipe = inode->i_pipe; int nrbufs; - poll_wait(filp, &inode->i_pipe->wait, wait); + poll_wait(filp, &pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ - nrbufs = info->nrbufs; + nrbufs = pipe->nrbufs; mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; - if (!inode->i_pipe->writers && filp->f_version != inode->i_pipe->w_counter) + if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= POLLHUP; } @@ -458,7 +459,7 @@ pipe_poll(struct file *filp, poll_table *wait) * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). 
*/ - if (!inode->i_pipe->readers) + if (!pipe->readers) mask |= POLLERR; } @@ -468,15 +469,18 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { + struct pipe_inode_info *pipe; + mutex_lock(&inode->i_mutex); - inode->i_pipe->readers -= decr; - inode->i_pipe->writers -= decw; - if (!inode->i_pipe->readers && !inode->i_pipe->writers) { + pipe = inode->i_pipe; + pipe->readers -= decr; + pipe->writers -= decw; + if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); - kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } mutex_unlock(&inode->i_mutex); @@ -679,30 +683,30 @@ static struct file_operations rdwr_pipe_fops = { struct pipe_inode_info * alloc_pipe_info(struct inode *inode) { - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; - info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); - if (info) { - init_waitqueue_head(&info->wait); - info->r_counter = info->w_counter = 1; - info->inode = inode; + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (pipe) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; } - return info; + return pipe; } -void __free_pipe_info(struct pipe_inode_info *info) +void __free_pipe_info(struct pipe_inode_info *pipe) { int i; for (i = 0; i < PIPE_BUFFERS; i++) { - struct pipe_buffer *buf = info->bufs + i; + struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) - buf->ops->release(info, buf); + buf->ops->release(pipe, buf); } - if (info->tmp_page) - __free_page(info->tmp_page); - kfree(info); + if (pipe->tmp_page) + __free_page(pipe->tmp_page); + kfree(pipe); } void free_pipe_info(struct inode *inode) @@ -723,15 +727,17 @@ static struct dentry_operations pipefs_dentry_operations = { static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct pipe_inode_info *pipe; if (!inode) goto fail_inode; - inode->i_pipe = alloc_pipe_info(inode); - if (!inode->i_pipe) + pipe = alloc_pipe_info(inode); + if (!pipe) goto fail_iput; + inode->i_pipe = pipe; - inode->i_pipe->readers = inode->i_pipe->writers = 1; + pipe->readers = pipe->writers = 1; inode->i_fop = &rdwr_pipe_fops; /* @@ -746,6 +752,7 @@ static struct inode * get_pipe_inode(void) inode->i_gid = current->fsgid; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_blksize = PAGE_SIZE; + return inode; fail_iput: -- cgit v1.2.3 From 6f767b0425f5902e4817648632230b512e81c963 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:53:56 +0200 Subject: [PATCH] splice: speedups and optimizations - Kill the local variables that cache ->nrbufs, they just take up space. - Only set do_wakeup for a real pipe. This is a big win for direct splicing. - Kill i_mutex lock around ->f_pos update, regular io paths don't do this either. 
Signed-off-by: Jens Axboe --- fs/splice.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index e30743c2c06..36bc262dfbd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -150,8 +150,6 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, mutex_lock(&pipe->inode->i_mutex); for (;;) { - int bufs; - if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) @@ -159,9 +157,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; } - bufs = pipe->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS - 1); + if (pipe->nrbufs < PIPE_BUFFERS) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pages[i++]; unsigned long this_len; @@ -174,8 +171,9 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, buf->offset = offset; buf->len = this_len; buf->ops = &page_cache_pipe_buf_ops; - pipe->nrbufs = ++bufs; - do_wakeup = 1; + pipe->nrbufs++; + if (pipe->inode) + do_wakeup = 1; ret += this_len; len -= this_len; @@ -184,7 +182,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; if (!len) break; - if (bufs < PIPE_BUFFERS) + if (pipe->nrbufs < PIPE_BUFFERS) continue; break; @@ -581,11 +579,8 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, mutex_lock(&pipe->inode->i_mutex); for (;;) { - int bufs = pipe->nrbufs; - - if (bufs) { - int curbuf = pipe->curbuf; - struct pipe_buffer *buf = pipe->bufs + curbuf; + if (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; @@ -606,10 +601,10 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1); - pipe->curbuf = curbuf; - pipe->nrbufs = --bufs; - do_wakeup = 1; + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->nrbufs--; + if (pipe->inode) + do_wakeup = 1; } sd.pos += sd.len; @@ -618,7 +613,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, break; } - if (bufs) + if (pipe->nrbufs) continue; if (!pipe->writers) break; @@ -660,9 +655,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_lock(&out->f_mapping->host->i_mutex); out->f_pos = sd.pos; - mutex_unlock(&out->f_mapping->host->i_mutex); return ret; } -- cgit v1.2.3 From 49570e9b29a3d78950b5eba6b73bdcca955f0877 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:56:09 +0200 Subject: [PATCH] splice: unlikely() optimizations Also corrects a few comments. Patch mainly from Ingo, changes by me. 
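For readers not used to the annotation: unlikely() is only a branch-prediction and code-layout hint built on GCC's __builtin_expect(). A minimal standalone sketch of the idea (the kernel's real definition lives in its compiler headers):

    #include <stdio.h>

    /* stand-ins for the kernel hints: tell GCC which way a branch usually goes */
    #define likely(x)       __builtin_expect(!!(x), 1)
    #define unlikely(x)     __builtin_expect(!!(x), 0)

    static int verify(const void *op)
    {
            if (unlikely(!op))      /* rare error path, kept out of the hot path */
                    return -1;
            return 0;
    }

    int main(void)
    {
            printf("%d %d\n", verify(NULL), verify(&verify));
            return 0;
    }

Annotating the EBADF/EINVAL checks in do_splice_to() and do_splice_from() this way keeps the rarely taken error branches out of the straight-line splice path.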
Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/read_write.c | 2 +- fs/splice.c | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 6256ca81a71..5bc0e9234f9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -202,7 +202,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count goto Einval; inode = file->f_dentry->d_inode; - if (inode->i_flock && MANDATORY_LOCK(inode)) { + if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) { int retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); diff --git a/fs/splice.c b/fs/splice.c index 36bc262dfbd..77b026baff7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -346,7 +346,6 @@ fill_it: * @flags: splice modifier flags * * Will read pages from given file and fill them into a pipe. - * */ ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -690,7 +689,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, mutex_lock(&inode->i_mutex); err = generic_osync_inode(mapping->host, mapping, - OSYNC_METADATA|OSYNC_DATA); + OSYNC_METADATA|OSYNC_DATA); mutex_unlock(&inode->i_mutex); if (err) @@ -730,10 +729,10 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t pos; int ret; - if (!out->f_op || !out->f_op->splice_write) + if (unlikely(!out->f_op || !out->f_op->splice_write)) return -EINVAL; - if (!(out->f_mode & FMODE_WRITE)) + if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; pos = out->f_pos; @@ -754,10 +753,10 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, loff_t pos, isize, left; int ret; - if (!in->f_op || !in->f_op->splice_read) + if (unlikely(!in->f_op || !in->f_op->splice_read)) return -EINVAL; - if (!(in->f_mode & FMODE_READ)) + if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; pos = in->f_pos; @@ -771,7 +770,7 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, return 0; left = isize - in->f_pos; - if (left < len) + if (unlikely(left < len)) len = left; return in->f_op->splice_read(in, pipe, len, flags); @@ -799,7 +798,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, * 'out' and transfer the wanted data from 'in' to 'out' through that */ pipe = current->splice_pipe; - if (!pipe) { + if (unlikely(!pipe)) { pipe = alloc_pipe_info(NULL); if (!pipe) return -ENOMEM; -- cgit v1.2.3 From c2058e0611c2fb4a00e63db61ef3a638cf760a98 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:56:34 +0200 Subject: [PATCH] splice: add Ingo as addition copyright holder The comment is also somewhat out of date, correct that as well. Signed-off-by: Jens Axboe --- fs/splice.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 77b026baff7..48b4382cc11 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -9,11 +9,12 @@ * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by - * Jens to support splicing to files and fixing the initial implementation - * bugs. + * Jens to support splicing to files, network, direct splicing, etc and + * fixing lots of bugs. 
* - * Copyright (C) 2005 Jens Axboe - * Copyright (C) 2005 Linus Torvalds + * Copyright (C) 2005-2006 Jens Axboe + * Copyright (C) 2005-2006 Linus Torvalds + * Copyright (C) 2006 Ingo Molnar * */ #include -- cgit v1.2.3 From 73d62d83ec3627782ba6f55defc76f3ffbef46ee Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:57:21 +0200 Subject: [PATCH] splice: comment styles - capitalize consistently - end sentences in one way or another - update comment text to match the implementation Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/splice.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 48b4382cc11..e50a460239d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -92,7 +92,7 @@ static void *page_cache_pipe_buf_map(struct file *file, /* * Page got truncated/unhashed. This will cause a 0-byte - * splice, if this is the first page + * splice, if this is the first page. */ if (!page->mapping) { err = -ENODATA; @@ -100,7 +100,7 @@ static void *page_cache_pipe_buf_map(struct file *file, } /* - * uh oh, read-error from disk + * Uh oh, read-error from disk. */ if (!PageUptodate(page)) { err = -EIO; @@ -108,7 +108,7 @@ static void *page_cache_pipe_buf_map(struct file *file, } /* - * page is ok afterall, fall through to mapping + * Page is ok afterall, fall through to mapping. */ unlock_page(page); } @@ -249,7 +249,7 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, nr_pages = PIPE_BUFFERS; /* - * initiate read-ahead on this page range. however, don't call into + * Initiate read-ahead on this page range. however, don't call into * read-ahead if this is a non-zero offset (we are likely doing small * chunk splice and the page is already there) for a single page. */ @@ -257,7 +257,7 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, do_page_cache_readahead(mapping, in, index, nr_pages); /* - * now fill in the holes + * Now fill in the holes: */ error = 0; for (i = 0; i < nr_pages; i++, index++) { @@ -396,10 +396,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, int more; /* - * sub-optimal, but we are limited by the pipe ->map. we don't + * Sub-optimal, but we are limited by the pipe ->map. We don't * need a kmap'ed buffer here, we just want to make sure we * have the page pinned if the pipe page originates from the - * page cache + * page cache. */ ptr = buf->ops->map(file, info, buf); if (IS_ERR(ptr)) @@ -460,7 +460,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, offset = sd->pos & ~PAGE_CACHE_MASK; /* - * reuse buf page, if SPLICE_F_MOVE is set + * Reuse buf page, if SPLICE_F_MOVE is set. */ if (sd->flags & SPLICE_F_MOVE) { /* @@ -501,7 +501,7 @@ find_page: if (!PageUptodate(page)) { /* - * page got invalidated, repeat + * Page got invalidated, repeat. */ if (!page->mapping) { unlock_page(page); @@ -598,6 +598,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, ret += sd.len; buf->offset += sd.len; buf->len -= sd.len; + if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); @@ -681,7 +682,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); /* - * if file or inode is SYNC and we actually wrote some data, sync it + * If file or inode is SYNC and we actually wrote some data, sync it. 
*/ if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) && ret > 0) { @@ -815,7 +816,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, } /* - * do the splice + * Do the splice. */ ret = 0; bytes = 0; -- cgit v1.2.3 From 341b446bc5aa36d1d5b8159c1e66716b5d89024d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:57:45 +0200 Subject: [PATCH] another round of fs/pipe.c cleanups make pipe.c a bit more readable and hackable. Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/pipe.c | 76 ++++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index b941e1951ea..e984beb93a0 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -44,7 +44,8 @@ void pipe_wait(struct pipe_inode_info *pipe) * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); + prepare_to_wait(&pipe->wait, &wait, + TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); schedule(); @@ -93,7 +94,8 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } -static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; @@ -102,25 +104,22 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buff /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep - * allocation cache + * allocation cache. (Otherwise just release our reference to it) */ - if (page_count(page) == 1 && !pipe->tmp_page) { + if (page_count(page) == 1 && !pipe->tmp_page) pipe->tmp_page = page; - return; - } - - /* - * Otherwise just release our reference to it - */ - page_cache_release(page); + else + page_cache_release(page); } -static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, struct pipe_buffer *buf) +static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return kmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { kunmap(buf->page); } @@ -182,7 +181,8 @@ pipe_readv(struct file *filp, const struct iovec *_iov, error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); ops->unmap(pipe, buf); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (!ret) + ret = -EFAULT; break; } ret += chars; @@ -218,7 +218,8 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { @@ -228,7 +229,8 @@ pipe_readv(struct file *filp, const struct iovec *_iov, pipe_wait(pipe); } mutex_unlock(&inode->i_mutex); - /* Signal writers asynchronously that there is more room. */ + + /* Signal writers asynchronously that there is more room. 
*/ if (do_wakeup) { wake_up_interruptible(&pipe->wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); @@ -242,6 +244,7 @@ static ssize_t pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = count }; + return pipe_readv(filp, &iov, 1, ppos); } @@ -276,10 +279,12 @@ pipe_writev(struct file *filp, const struct iovec *_iov, /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ if (pipe->nrbufs && chars != 0) { - int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (PIPE_BUFFERS-1); + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & + (PIPE_BUFFERS-1); struct pipe_buffer *buf = pipe->bufs + lastbuf; struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; + if (ops->can_merge && offset + chars <= PAGE_SIZE) { void *addr; int error; @@ -306,9 +311,11 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); - if (!ret) ret = -EPIPE; + if (!ret) + ret = -EPIPE; break; } bufs = pipe->nrbufs; @@ -326,7 +333,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, } pipe->tmp_page = page; } - /* Always wakeup, even if the copy fails. Otherwise + /* Always wake up, even if the copy fails. Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to * syscall merging. * FIXME! Is this really true? @@ -339,7 +346,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, error = pipe_iov_copy_from_user(kmap(page), iov, chars); kunmap(page); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (!ret) + ret = -EFAULT; break; } ret += chars; @@ -359,11 +367,13 @@ pipe_writev(struct file *filp, const struct iovec *_iov, if (bufs < PIPE_BUFFERS) continue; if (filp->f_flags & O_NONBLOCK) { - if (!ret) ret = -EAGAIN; + if (!ret) + ret = -EAGAIN; break; } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { @@ -391,6 +401,7 @@ pipe_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + return pipe_writev(filp, &iov, 1, ppos); } @@ -401,7 +412,8 @@ bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) } static ssize_t -bad_pipe_w(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) +bad_pipe_w(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) { return -EBADF; } @@ -475,6 +487,7 @@ pipe_release(struct inode *inode, int decr, int decw) pipe = inode->i_pipe; pipe->readers -= decr; pipe->writers -= decw; + if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { @@ -525,14 +538,15 @@ static int pipe_rdwr_fasync(int fd, struct file *filp, int on) { struct inode *inode = filp->f_dentry->d_inode; + struct pipe_inode_info *pipe = inode->i_pipe; int retval; mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if (retval >= 0) - retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); + retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); mutex_unlock(&inode->i_mutex); @@ -720,6 +734,7 @@ static int pipefs_delete_dentry(struct dentry *dentry) { return 1; } + static struct dentry_operations pipefs_dentry_operations = { .d_delete = pipefs_delete_dentry, }; @@ -757,6 +772,7 @@ static struct inode * get_pipe_inode(void) 
fail_iput: iput(inode); + fail_inode: return NULL; } @@ -769,7 +785,7 @@ int do_pipe(int *fd) struct inode * inode; struct file *f1, *f2; int error; - int i,j; + int i, j; error = -ENFILE; f1 = get_empty_filp(); @@ -802,6 +818,7 @@ int do_pipe(int *fd) dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this); if (!dentry) goto close_f12_inode_i_j; + dentry->d_op = &pipefs_dentry_operations; d_add(dentry, inode); f1->f_vfsmnt = f2->f_vfsmnt = mntget(mntget(pipe_mnt)); @@ -825,6 +842,7 @@ int do_pipe(int *fd) fd_install(j, f2); fd[0] = i; fd[1] = j; + return 0; close_f12_inode_i_j: @@ -849,8 +867,9 @@ no_files: * d_name - pipe: will go nicely and kill the special-casing in procfs. */ -static struct super_block *pipefs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static struct super_block * +pipefs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); } @@ -864,6 +883,7 @@ static struct file_system_type pipe_fs_type = { static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); + if (!err) { pipe_mnt = kern_mount(&pipe_fs_type); if (IS_ERR(pipe_mnt)) { -- cgit v1.2.3 From 29ff2db55196717e2e67e0f04adc833ee7edd491 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Apr 2006 22:52:46 -0700 Subject: [PATCH] select() warning fixes fs/select.c: In function `core_sys_select': fs/select.c:339: warning: assignment from incompatible pointer type fs/select.c:376: warning: comparison of distinct pointer types lacks a cast By using a void* we can remove lots of casts rather than adding more. Cc: Jes Sorensen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/select.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index 071660fa7b0..fce0fd1bb1d 100644 --- a/fs/select.c +++ b/fs/select.c @@ -310,7 +310,7 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout) { fd_set_bits fds; - char *bits; + void *bits; int ret, size, max_fdset; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ @@ -341,12 +341,12 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, bits = kmalloc(6 * size, GFP_KERNEL); if (!bits) goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); + fds.in = bits; + fds.out = bits + size; + fds.ex = bits + 2*size; + fds.res_in = bits + 3*size; + fds.res_out = bits + 4*size; + fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || -- cgit v1.2.3 From 7b04d7170e9af805cac19f97b28fff10db897893 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 10 Apr 2006 22:53:27 -0700 Subject: [PATCH] Add GFP_NOWAIT Introduce GFP_NOWAIT, as an alias for GFP_ATOMIC & ~__GFP_HIGH. This also changes XFS, which is the only in-tree user of this idiom that I could find. The XFS piece is compile-tested only. 
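A minimal sketch of what the new flag amounts to, going by the description above (illustrative only; the real definition belongs in the gfp header, and the call site shown is the XFS one from the diff below):

    /* atomic allocation, but without dipping into the emergency reserves */
    #define GFP_NOWAIT      (GFP_ATOMIC & ~__GFP_HIGH)

    aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);

The change is purely about readability: the intent is no longer hidden behind a bitmask expression at every call site.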
Signed-off-by: Jeff Dike Acked-by: Nathan Scott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 9fb0312665c..26fed0756f0 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -182,7 +182,7 @@ free_address( { a_list_t *aentry; - aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH); + aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); if (likely(aentry)) { spin_lock(&as_lock); aentry->next = as_free_head; -- cgit v1.2.3 From 5246d0503130fa58904c8beb987fcf93b96d8ab6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Apr 2006 22:53:57 -0700 Subject: [PATCH] sync_file_range(): use unsigned for flags Ulrich suggested that the `flags' arg to sync_file_range() become unsigned. Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sync.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sync.c b/fs/sync.c index 8616006d209..aab5ffe77e9 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -61,7 +61,7 @@ * will be available after a crash. */ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, - int flags) + unsigned int flags) { int ret; struct file *file; @@ -126,7 +126,7 @@ out: * `endbyte' is inclusive */ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, - int flags) + unsigned int flags) { int ret; struct address_space *mapping; -- cgit v1.2.3 From f6422f17d3a480f21917a3895e2a46b968f56a08 Mon Sep 17 00:00:00 2001 From: Herbert Poetzl Date: Mon, 10 Apr 2006 22:54:03 -0700 Subject: [PATCH] vfs: propagate mnt_flags into do_loopback/vfsmount The mnt_flags are propagated into do_loopback(), so that they can be stored with the vfsmount Signed-off-by: Herbert Poetzl Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index bf478addb85..2c5f1f80bdc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -899,11 +899,13 @@ static int do_change_type(struct nameidata *nd, int flag) /* * do loopback mount. */ -static int do_loopback(struct nameidata *nd, char *old_name, int recurse) +static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags, int mnt_flags) { struct nameidata old_nd; struct vfsmount *mnt = NULL; + int recurse = flags & MS_REC; int err = mount_is_safe(nd); + if (err) return err; if (!old_name || !*old_name) @@ -937,6 +939,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse) spin_unlock(&vfsmount_lock); release_mounts(&umount_list); } + mnt->mnt_flags = mnt_flags; out: up_write(&namespace_sem); @@ -1350,7 +1353,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags & MS_REC); + retval = do_loopback(&nd, dev_name, flags, mnt_flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&nd, flags); else if (flags & MS_MOVE) -- cgit v1.2.3 From 00fbc6dfe7c4487f812829bff79c3121c8fd3bca Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Mon, 10 Apr 2006 22:54:06 -0700 Subject: [PATCH] 9p: handle sget() failure Handle a failing sget() in v9fs_get_sb(). 
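The shape of the fix is the usual IS_ERR()/goto unwind around sget(); a condensed sketch of the error path (kernel context, labels as in the patch below):

    sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
    if (IS_ERR(sb))
            goto out_close_session;         /* tear down the session we opened */
    ...
    out_close_session:
            v9fs_session_close(v9ses);
    out_free_session:
            kfree(v9ses);
            return sb;                      /* sb holds the ERR_PTR in the failure paths */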
Signed-off-by: Christoph Hellwig Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_super.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index b0a0ae509c0..61c599b4a1e 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -127,12 +127,13 @@ static struct super_block *v9fs_get_sb(struct file_system_type if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); - kfree(v9ses); - return ERR_PTR(newfid); + sb = ERR_PTR(newfid); + goto out_free_session; } sb = sget(fs_type, NULL, v9fs_set_super, v9ses); - + if (IS_ERR(sb)) + goto out_close_session; v9fs_fill_super(sb, v9ses, flags); inode = v9fs_get_inode(sb, S_IFDIR | mode); @@ -185,6 +186,12 @@ static struct super_block *v9fs_get_sb(struct file_system_type return sb; +out_close_session: + v9fs_session_close(v9ses); +out_free_session: + kfree(v9ses); + return sb; + put_back_sb: /* deactivate_super calls v9fs_kill_super which will frees the rest */ up_write(&sb->s_umount); -- cgit v1.2.3 From b04eb6aa08ecc3e24df2f78ebc486011ebd74feb Mon Sep 17 00:00:00 2001 From: Mitchell Blank Jr Date: Mon, 10 Apr 2006 22:54:08 -0700 Subject: [PATCH] select: don't overflow if (SELECT_STACK_ALLOC % sizeof(long) != 0) If SELECT_STACK_ALLOC is not a multiple of sizeof(long) then stack_fds[] would be shorter than SELECT_STACK_ALLOC bytes and could overflow later in the function. Fixed by simply rearranging the test later to work on sizeof(stack_fds) Currently SELECT_STACK_ALLOC is 256 so this doesn't happen, but it's nasty to have things like this hidden in the code. What if later someone decides to change SELECT_STACK_ALLOC to 300? Signed-off-by: Mitchell Blank Jr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/select.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index fce0fd1bb1d..a8109baa5e4 100644 --- a/fs/select.c +++ b/fs/select.c @@ -311,7 +311,8 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, { fd_set_bits fds; void *bits; - int ret, size, max_fdset; + int ret, max_fdset; + unsigned int size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; @@ -333,14 +334,15 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, * since we used fdset we need to allocate memory in units of * long-words. */ - ret = -ENOMEM; size = FDS_BYTES(n); - if (6*size < SELECT_STACK_ALLOC) - bits = stack_fds; - else + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + /* Not enough space in on-stack array; must use kmalloc */ + ret = -ENOMEM; bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; + if (!bits) + goto out_nofds; + } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; -- cgit v1.2.3 From 80e8ff634169be3fc2ac48f258cc7638e898cd46 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 10 Apr 2006 22:54:10 -0700 Subject: [PATCH] kdump proc vmcore size oveflow fix A couple of /proc/vmcore data structures overflow with 32bit systems having memory more than 4G. This patch fixes those. 
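The failure mode is plain truncation: on a 32-bit kernel size_t is 32 bits wide, so a byte offset or length past 4G silently wraps. A standalone illustration of the wrap (ordinary userspace C with arbitrary numbers, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t start = 5ULL << 30;		/* an offset 5G into the dump */
	uint32_t narrow = (uint32_t)start;	/* what a 32-bit counter keeps */

	printf("64-bit offset: %llu\n", (unsigned long long)start);
	printf("32-bit offset: %u (wrapped past 4G)\n", narrow);
	return 0;
}

Widening start and nr_bytes to u64, as the hunk below does, keeps the arithmetic exact regardless of word size.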
Signed-off-by: Ken'ichi Ohmichi Signed-off-by: Vivek Goyal Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7efa73d44c9..20d4b2237fc 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -103,8 +103,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { ssize_t acc = 0, tmp; - size_t tsz, nr_bytes; - u64 start; + size_t tsz; + u64 start, nr_bytes; struct vmcore *curr_m = NULL; if (buflen == 0 || *fpos >= vmcore_size) -- cgit v1.2.3 From 2395140ee2bffe38b1c8a59318f62882b797f5e6 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Mon, 10 Apr 2006 22:54:12 -0700 Subject: [PATCH] uniform POLLRDHUP handling between epoll and poll/select As reported by Michael Kerrisk, POLLRDHUP handling was not consistent between epoll and poll/select, since in epoll it was unmaskeable. This patch brings uniformity in POLLRDHUP handling. Signed-off-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 242fe1a66ce..1b4491cdd11 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -599,7 +599,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= POLLERR | POLLHUP | POLLRDHUP; + epds.events |= POLLERR | POLLHUP; error = ep_insert(ep, &epds, tfile, fd); } else @@ -613,7 +613,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) break; case EPOLL_CTL_MOD: if (epi) { - epds.events |= POLLERR | POLLHUP | POLLRDHUP; + epds.events |= POLLERR | POLLHUP; error = ep_modify(ep, epi, &epds); } else error = -ENOENT; -- cgit v1.2.3 From f5e902817fee1589badca1284f49eecc0ef0c200 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 10 Apr 2006 22:54:16 -0700 Subject: [PATCH] process accounting: take original leader's start_time in non-leader exec The only record we have of the real-time age of a process, regardless of execs it's done, is start_time. When a non-leader thread exec, the original start_time of the process is lost. Things looking at the real-time age of the process are fooled, for example the process accounting record when the process finally dies. This change makes the oldest start_time stick around with the process after a non-leader exec. This way the association between PID and start_time is kept constant, which seems correct to me. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 4d38ad0b70d..3234a0c32d5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -678,6 +678,18 @@ static int de_thread(struct task_struct *tsk) while (leader->exit_state != EXIT_ZOMBIE) yield(); + /* + * The only record we have of the real-time age of a + * process, regardless of execs it's done, is start_time. + * All the past CPU time is accumulated in signal_struct + * from sister threads now dead. But in this non-leader + * exec, nothing survives from the original leader thread, + * whose birth marks the true age of this process now. + * When we take on its identity by switching to its PID, we + * also take its birthdate (always earlier than our own). 
+ */ + current->start_time = leader->start_time; + spin_lock(&leader->proc_lock); spin_lock(¤t->proc_lock); proc_dentry1 = proc_pid_unhash(current); -- cgit v1.2.3 From 68250ba5df4c9d00d3064a0ba9a894035436916b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 10 Apr 2006 22:54:30 -0700 Subject: [PATCH] kdump: enable CONFIG_PROC_VMCORE by default Everybody seems to be using /proc/vmcore as a method to access the kernel crash dump. Hence probably it makes sense to enable CONFIG_PROC_VMCORE by default if CONFIG_CRASH_DUMP is selected. This makes kdump configuration further easier for a user. Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 97f31741312..2524629dc83 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -799,6 +799,7 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support (EXPERIMENTAL)" depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP + default y help Exports the dump image of crashed kernel in ELF format. -- cgit v1.2.3 From 091e881d0e55496d8887b61446ae1c598b0995b6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 10 Apr 2006 22:54:31 -0700 Subject: [PATCH] inotify: check for NULL inode in inotify_d_instantiate The spufs file system creates files in a directory before instantiating the directory itself, which causes a NULL pointer access in inotify_d_instantiate since c32ccd87bfd1414b0aabfcd8dbc7539ad23bcbaa. I'd like to keep this behavior since it means that the user will not have access to files in the directory before I know that I succeed in creating everything in it. This patch adds a simple check for the inode to keep that working. Signed-off-by: Arnd Bergmann Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inotify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/inotify.c b/fs/inotify.c index 367c487c014..1f50302849c 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -538,7 +538,7 @@ void inotify_d_instantiate(struct dentry *entry, struct inode *inode) WARN_ON(entry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED); spin_lock(&entry->d_lock); parent = entry->d_parent; - if (inotify_inode_watched(parent->d_inode)) + if (parent->d_inode && inotify_inode_watched(parent->d_inode)) entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED; spin_unlock(&entry->d_lock); } -- cgit v1.2.3 From 389ed39b9711bbe5210d5e118e1f1af36ca88b7c Mon Sep 17 00:00:00 2001 From: "Ananiev, Leonid I" Date: Mon, 10 Apr 2006 22:54:38 -0700 Subject: [PATCH] ext3: Fix missed mutex unlock Missed unlock_super()call is added in error condition code path. 
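The bug class is an early return with the superblock lock still held: every exit taken after lock_super() has to pair it with unlock_super(). A generic, standalone sketch of the pattern (a pthread mutex standing in for the superblock lock; names are made up):

#include <errno.h>
#include <pthread.h>

static pthread_mutex_t resize_lock = PTHREAD_MUTEX_INITIALIZER;
static int resize_in_progress;

static int begin_resize(void)
{
	pthread_mutex_lock(&resize_lock);
	if (resize_in_progress) {
		/* the error path must drop the lock before bailing out,
		 * which is conceptually the one-line resize.c fix below */
		pthread_mutex_unlock(&resize_lock);
		return -EBUSY;
	}
	resize_in_progress = 1;
	pthread_mutex_unlock(&resize_lock);
	return 0;
}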
Signed-off-by: Leonid Ananiev Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/resize.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 1041dab6de2..14f5f6ea3e7 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -974,6 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); + unlock_super(sb); err = -EBUSY; goto exit_put; } -- cgit v1.2.3 From d3406ffa4af8af1d7c14cff06e003eb0a557d4ad Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:49 -0700 Subject: [PATCH] fuse: fix oops in fuse_send_readpages() During heavy parallel filesystem activity it was possible to Oops the kernel. The reason is that read_cache_pages() could skip pages which have already been inserted into the cache by another task. Occasionally this may result in zero pages actually being sent, while fuse_send_readpages() relies on at least one page being in the request. So check this corner case and just free the request instead of trying to send it. Reported and tested by Konstantin Isakov. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 975f2697e86..3ac39c0288d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -397,8 +397,12 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, return -EINTR; err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) - fuse_send_readpages(data.req, file, inode); + if (!err) { + if (data.req->num_pages) + fuse_send_readpages(data.req, file, inode); + else + fuse_put_request(fc, data.req); + } return err; } -- cgit v1.2.3 From 7025d9ad10a38dadef8b286e0092731c2d3cdc53 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:50 -0700 Subject: [PATCH] fuse: fix fuse_dev_poll() return value fuse_dev_poll() returned an error value instead of a poll mask. Luckily (or unluckily) -ENODEV does contain the POLLERR bit. There's also a race if filesystem is unmounted between fuse_get_conn() and spin_lock(), in which case this event will be missed by poll(). 
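The rule being restored is that a ->poll() method returns a mask of POLL* bits and never a negative errno; failure is reported as POLLERR. A minimal sketch of a well-formed poll method for a hypothetical character device (not the fuse code itself):

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct demo_dev {
	spinlock_t lock;
	wait_queue_head_t waitq;
	int connected;
	int have_input;
};

static unsigned int demo_poll(struct file *file, poll_table *wait)
{
	struct demo_dev *dev = file->private_data;
	unsigned int mask = POLLOUT | POLLWRNORM;

	if (!dev)
		return POLLERR;			/* a mask, not -ENODEV */

	poll_wait(file, &dev->waitq, wait);

	spin_lock(&dev->lock);
	if (!dev->connected)
		mask = POLLERR;			/* also covers the unmount race */
	else if (dev->have_input)
		mask |= POLLIN | POLLRDNORM;
	spin_unlock(&dev->lock);

	return mask;
}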
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 23d1f52eb1b..b2e8613a26d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -804,17 +804,18 @@ static ssize_t fuse_dev_write(struct file *file, const char __user *buf, static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { - struct fuse_conn *fc = fuse_get_conn(file); unsigned mask = POLLOUT | POLLWRNORM; - + struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return POLLERR; poll_wait(file, &fc->waitq, wait); spin_lock(&fuse_lock); - if (!list_empty(&fc->pending)) - mask |= POLLIN | POLLRDNORM; + if (!fc->connected) + mask = POLLERR; + else if (!list_empty(&fc->pending)) + mask |= POLLIN | POLLRDNORM; spin_unlock(&fuse_lock); return mask; -- cgit v1.2.3 From 385a17bfc3cb035333c8a91eddc78a6e04c4625e Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 10 Apr 2006 22:54:52 -0700 Subject: [PATCH] fuse: add O_ASYNC support to FUSE device This adds asynchronous notification to FUSE - a FUSE server can request O_ASYNC on a /dev/fuse file descriptor and receive SIGIO when there is input available. One subtlety - fuse_dev_fasync, which is called when O_ASYNC is requested, does no locking, unlike the other methods. I think it's unnecessary, as the fuse_conn.fasync list is manipulated only by fasync_helper and kill_fasync, which provide their own locking. It would also be wrong to use the fuse_lock, as it's a spin lock and fasync_helper can sleep. My one concern with this is the fuse_conn going away underneath fuse_dev_fasync - sys_fcntl takes a reference on the file struct, so this seems not to be a problem.
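The moving parts of O_ASYNC support are small: a ->fasync() method that defers to fasync_helper(), and a kill_fasync() call wherever new input arrives. A hedged sketch against a hypothetical device (the actual fuse hunks follow):

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/signal.h>

struct demo_dev {
	struct fasync_struct *fasync;	/* O_ASYNC subscribers */
};

static int demo_fasync(int fd, struct file *file, int on)
{
	struct demo_dev *dev = file->private_data;

	if (!dev)
		return -ENODEV;
	/* fasync_helper()/kill_fasync() do their own locking */
	return fasync_helper(fd, file, on, &dev->fasync);
}

static void demo_new_input(struct demo_dev *dev)
{
	/* ...queue the work and wake blocking readers, then: */
	kill_fasync(&dev->fasync, SIGIO, POLL_IN);
}

The release path also needs fasync_helper(-1, file, 0, ...) to drop the entry, and a userspace server arms this with fcntl(F_SETOWN) followed by setting O_ASYNC via F_SETFL.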
Signed-off-by: Jeff Dike Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 17 ++++++++++++++++- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b2e8613a26d..438770f8867 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -317,6 +317,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); } /* @@ -901,6 +902,7 @@ void fuse_abort_conn(struct fuse_conn *fc) end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); wake_up_all(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); } spin_unlock(&fuse_lock); } @@ -917,12 +919,24 @@ static int fuse_dev_release(struct inode *inode, struct file *file) end_requests(fc, &fc->processing); } spin_unlock(&fuse_lock); - if (fc) + if (fc) { + fasync_helper(-1, file, 0, &fc->fasync); kobject_put(&fc->kobj); + } return 0; } +static int fuse_dev_fasync(int fd, struct file *file, int on) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -ENODEV; + + /* No locking - fasync_helper does its own locking */ + return fasync_helper(fd, file, on, &fc->fasync); +} + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .llseek = no_llseek, @@ -932,6 +946,7 @@ const struct file_operations fuse_dev_operations = { .writev = fuse_dev_writev, .poll = fuse_dev_poll, .release = fuse_dev_release, + .fasync = fuse_dev_fasync, }; static struct miscdevice fuse_miscdevice = { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a16a04fcf41..e5cb46b7843 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -318,6 +318,9 @@ struct fuse_conn { /** kobject */ struct kobject kobj; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 879e6fba948..78700cbb9cd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -216,6 +216,7 @@ static void fuse_put_super(struct super_block *sb) spin_unlock(&fuse_lock); up_write(&fc->sbput_sem); /* Flush all readers on this fs */ + kill_fasync(&fc->fasync, SIGIO, POLL_IN); wake_up_all(&fc->waitq); kobject_del(&fc->kobj); kobject_put(&fc->kobj); -- cgit v1.2.3 From e5ac1d1e70a8c19a65a959d73650203df7a2e168 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 10 Apr 2006 22:54:53 -0700 Subject: [PATCH] fuse: add O_NONBLOCK support to FUSE device I don't like duplicating the connected and list_empty tests in fuse_dev_readv, but this seemed cleaner than adding the f_flags test to request_wait. 
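The change follows the standard non-blocking pattern for a read path that can sleep: before waiting, fail with -EAGAIN if the caller set O_NONBLOCK and nothing is queued yet. A sketch against a hypothetical device (not the literal fuse_dev_readv code):

#include <linux/fs.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct demo_dev {
	spinlock_t lock;
	struct list_head pending;
	wait_queue_head_t waitq;
	int connected;
};

static ssize_t demo_read(struct file *file, char __user *buf,
			 size_t count, loff_t *ppos)
{
	struct demo_dev *dev = file->private_data;

	spin_lock(&dev->lock);
	if ((file->f_flags & O_NONBLOCK) && dev->connected &&
	    list_empty(&dev->pending)) {
		spin_unlock(&dev->lock);
		return -EAGAIN;		/* don't sleep; caller can poll() */
	}
	/* ...otherwise wait for a request and copy it out to buf... */
	spin_unlock(&dev->lock);
	return 0;
}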
Signed-off-by: Jeff Dike Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 438770f8867..75c6e9166c3 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -619,6 +619,12 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, err = -EPERM; if (!fc) goto err_unlock; + + err = -EAGAIN; + if ((file->f_flags & O_NONBLOCK) && fc->connected && + list_empty(&fc->pending)) + goto err_unlock; + request_wait(fc); err = -ENODEV; if (!fc->connected) -- cgit v1.2.3 From 0720b315976447cba3f0c3e211223b8cb82b0f93 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:55 -0700 Subject: [PATCH] fuse: simplify locking This is in preparation for removing the global spinlock in favor of a per-mount one. The only critical part is the interaction between fuse_dev_release() and fuse_fill_super(): fuse_dev_release() must see the assignment to file->private_data, otherwise it will leak the reference to fuse_conn. This is ensured by the fput() operation, which will synchronize the assignment with other CPU's that may do a final fput() soon after this. Also redundant locking is removed from fuse_fill_super(), where exclusion is already ensured by the BKL held for this function by the VFS. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 31 ++++++++++------------------ fs/fuse/inode.c | 64 ++++++++++++++++++++------------------------------------- 2 files changed, 33 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 75c6e9166c3..c510533c684 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -23,13 +23,11 @@ static kmem_cache_t *fuse_req_cachep; static struct fuse_conn *fuse_get_conn(struct file *file) { - struct fuse_conn *fc; - spin_lock(&fuse_lock); - fc = file->private_data; - if (fc && !fc->connected) - fc = NULL; - spin_unlock(&fuse_lock); - return fc; + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. 
+ */ + return file->private_data; } static void fuse_request_init(struct fuse_req *req) @@ -607,19 +605,16 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *off) { int err; - struct fuse_conn *fc; struct fuse_req *req; struct fuse_in *in; struct fuse_copy_state cs; unsigned reqsize; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; restart: spin_lock(&fuse_lock); - fc = file->private_data; - err = -EPERM; - if (!fc) - goto err_unlock; - err = -EAGAIN; if ((file->f_flags & O_NONBLOCK) && fc->connected && list_empty(&fc->pending)) @@ -915,17 +910,13 @@ void fuse_abort_conn(struct fuse_conn *fc) static int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_conn *fc; - - spin_lock(&fuse_lock); - fc = file->private_data; + struct fuse_conn *fc = fuse_get_conn(file); if (fc) { + spin_lock(&fuse_lock); fc->connected = 0; end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); - } - spin_unlock(&fuse_lock); - if (fc) { + spin_unlock(&fuse_lock); fasync_helper(-1, file, 0, &fc->fasync); kobject_put(&fc->kobj); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 78700cbb9cd..620579a6910 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -414,37 +414,6 @@ static struct fuse_conn *new_conn(void) return fc; } -static struct fuse_conn *get_conn(struct file *file, struct super_block *sb) -{ - struct fuse_conn *fc; - int err; - - err = -EINVAL; - if (file->f_op != &fuse_dev_operations) - goto out_err; - - err = -ENOMEM; - fc = new_conn(); - if (!fc) - goto out_err; - - spin_lock(&fuse_lock); - err = -EINVAL; - if (file->private_data) - goto out_unlock; - - kobject_get(&fc->kobj); - file->private_data = fc; - spin_unlock(&fuse_lock); - return fc; - - out_unlock: - spin_unlock(&fuse_lock); - kobject_put(&fc->kobj); - out_err: - return ERR_PTR(err); -} - static struct inode *get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; @@ -526,12 +495,9 @@ static void fuse_send_init(struct fuse_conn *fc) static unsigned long long conn_id(void) { + /* BKL is held for ->get_sb() */ static unsigned long long ctr = 1; - unsigned long long val; - spin_lock(&fuse_lock); - val = ctr++; - spin_unlock(&fuse_lock); - return val; + return ctr++; } static int fuse_fill_super(struct super_block *sb, void *data, int silent) @@ -556,10 +522,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!file) return -EINVAL; - fc = get_conn(file, sb); - fput(file); - if (IS_ERR(fc)) - return PTR_ERR(fc); + if (file->f_op != &fuse_dev_operations) + return -EINVAL; + + /* Setting file->private_data can't race with other mount() + instances, since BKL is held for ->get_sb() */ + if (file->private_data) + return -EINVAL; + + fc = new_conn(); + if (!fc) + return -ENOMEM; fc->flags = d.flags; fc->user_id = d.user_id; @@ -589,10 +562,16 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_put_root; sb->s_root = root_dentry; - spin_lock(&fuse_lock); fc->mounted = 1; fc->connected = 1; - spin_unlock(&fuse_lock); + kobject_get(&fc->kobj); + file->private_data = fc; + /* + * atomic_dec_and_test() in fput() provides the necessary + * memory barrier for file->private_data to be visible on all + * CPUs after this + */ + fput(file); fuse_send_init(fc); @@ -601,6 +580,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err_put_root: dput(root_dentry); err: + fput(file); kobject_put(&fc->kobj); return err; } -- cgit 
v1.2.3 From d713311464bcca73c990d1a1b5c9467eae87f5b4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:55 -0700 Subject: [PATCH] fuse: use a per-mount spinlock Remove the global spinlock in favor of a per-mount one. This patch is basically find & replace. The difficult part has already been done by the previous patch. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 122 ++++++++++++++++++++++++++++--------------------------- fs/fuse/fuse_i.h | 24 +++-------- fs/fuse/inode.c | 12 +++--- 3 files changed, 74 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c510533c684..63d2cf43b5e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi + Copyright (C) 2001-2006 Miklos Szeredi This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -94,11 +94,11 @@ static struct fuse_req *do_get_request(struct fuse_conn *fc) { struct fuse_req *req; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); BUG_ON(list_empty(&fc->unused_list)); req = list_entry(fc->unused_list.next, struct fuse_req, list); list_del_init(&req->list); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); fuse_request_init(req); req->preallocated = 1; req->in.h.uid = current->fsuid; @@ -124,7 +124,7 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc) return do_get_request(fc); } -/* Must be called with fuse_lock held */ +/* Must be called with fc->lock held */ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) { if (req->preallocated) { @@ -143,9 +143,9 @@ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); fuse_putback_request(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } } @@ -155,15 +155,15 @@ static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req) fuse_putback_request(fc, req); } -void fuse_release_background(struct fuse_req *req) +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { iput(req->inode); iput(req->inode2); if (req->file) fput(req->file); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); list_del(&req->bg_entry); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } /* @@ -182,7 +182,7 @@ void fuse_release_background(struct fuse_req *req) * interrupted and put in the background, it will return with an error * and hence never be reset and reused. * - * Called with fuse_lock, unlocks it + * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) { @@ -191,14 +191,14 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (!req->background) { wake_up(&req->waitq); fuse_put_request_locked(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } else { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); down_read(&fc->sbput_sem); if (fc->mounted) - fuse_release_background(req); + fuse_release_background(fc, req); up_read(&fc->sbput_sem); if (end) end(fc, req); @@ -248,16 +248,16 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req) get_file(req->file); } -/* Called with fuse_lock held. Releases, and then reacquires it. 
*/ +/* Called with fc->lock held. Releases, and then reacquires it. */ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) { sigset_t oldset; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); block_sigs(&oldset); wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); restore_sigs(&oldset); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (req->state == FUSE_REQ_FINISHED && !req->interrupted) return; @@ -271,9 +271,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) locked state, there mustn't be any filesystem operation (e.g. page fault), since that could lead to deadlock */ - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } if (req->state == FUSE_REQ_PENDING) { list_del(&req->list); @@ -324,7 +324,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) void request_send(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (!fc->connected) req->out.h.error = -ENOTCONN; else if (fc->conn_error) @@ -337,15 +337,15 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) request_wait_answer(fc, req); } - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (fc->connected) { queue_request(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } else { req->out.h.error = -ENOTCONN; request_end(fc, req); @@ -361,9 +361,9 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) void request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); background_request(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); request_send_nowait(fc, req); } @@ -372,16 +372,16 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) * anything that could cause a page-fault. If the request was already * interrupted bail out. */ -static int lock_request(struct fuse_req *req) +static int lock_request(struct fuse_conn *fc, struct fuse_req *req) { int err = 0; if (req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (req->interrupted) err = -ENOENT; else req->locked = 1; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } return err; } @@ -391,18 +391,19 @@ static int lock_request(struct fuse_req *req) * requester thread is currently waiting for it to be unlocked, so * wake it up. 
*/ -static void unlock_request(struct fuse_req *req) +static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) { if (req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (req->interrupted) wake_up(&req->waitq); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } } struct fuse_copy_state { + struct fuse_conn *fc; int write; struct fuse_req *req; const struct iovec *iov; @@ -415,11 +416,12 @@ struct fuse_copy_state { unsigned len; }; -static void fuse_copy_init(struct fuse_copy_state *cs, int write, - struct fuse_req *req, const struct iovec *iov, - unsigned long nr_segs) +static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, + int write, struct fuse_req *req, + const struct iovec *iov, unsigned long nr_segs) { memset(cs, 0, sizeof(*cs)); + cs->fc = fc; cs->write = write; cs->req = req; cs->iov = iov; @@ -449,7 +451,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) unsigned long offset; int err; - unlock_request(cs->req); + unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); if (!cs->seglen) { BUG_ON(!cs->nr_segs); @@ -472,7 +474,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->seglen -= cs->len; cs->addr += cs->len; - return lock_request(cs->req); + return lock_request(cs->fc, cs->req); } /* Do as much copy to/from userspace buffer as we can */ @@ -584,9 +586,9 @@ static void request_wait(struct fuse_conn *fc) if (signal_pending(current)) break; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); schedule(); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } set_current_state(TASK_RUNNING); remove_wait_queue(&fc->waitq, &wait); @@ -614,7 +616,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, return -EPERM; restart: - spin_lock(&fuse_lock); + spin_lock(&fc->lock); err = -EAGAIN; if ((file->f_flags & O_NONBLOCK) && fc->connected && list_empty(&fc->pending)) @@ -643,14 +645,14 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, request_end(fc, req); goto restart; } - spin_unlock(&fuse_lock); - fuse_copy_init(&cs, 1, req, iov, nr_segs); + spin_unlock(&fc->lock); + fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); if (!err) err = fuse_copy_args(&cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (!err && req->interrupted) err = -ENOENT; @@ -665,12 +667,12 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, else { req->state = FUSE_REQ_SENT; list_move_tail(&req->list, &fc->processing); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } return reqsize; err_unlock: - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); return err; } @@ -739,7 +741,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, if (!fc) return -ENODEV; - fuse_copy_init(&cs, 0, NULL, iov, nr_segs); + fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; @@ -751,7 +753,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, oh.len != nbytes) goto err_finish; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); err = -ENOENT; if (!fc->connected) goto err_unlock; @@ -762,9 +764,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, goto err_unlock; if (req->interrupted) { - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + 
spin_lock(&fc->lock); request_end(fc, req); return -ENOENT; } @@ -772,12 +774,12 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, req->out.h = oh; req->locked = 1; cs.req = req; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); err = copy_out_args(&cs, &req->out, nbytes); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (!err) { if (req->interrupted) @@ -789,7 +791,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, return err ? err : nbytes; err_unlock: - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); err_finish: fuse_copy_finish(&cs); return err; @@ -813,12 +815,12 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) poll_wait(file, &fc->waitq, wait); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (!fc->connected) mask = POLLERR; else if (!list_empty(&fc->pending)) mask |= POLLIN | POLLRDNORM; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); return mask; } @@ -826,7 +828,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) /* * Abort all requests on the given list (pending or processing) * - * This function releases and reacquires fuse_lock + * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) { @@ -835,7 +837,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; request_end(fc, req); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } } @@ -866,10 +868,10 @@ static void end_io_requests(struct fuse_conn *fc) req->end = NULL; /* The end function will consume this reference */ __fuse_get_request(req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); end(fc, req); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } } } @@ -896,7 +898,7 @@ static void end_io_requests(struct fuse_conn *fc) */ void fuse_abort_conn(struct fuse_conn *fc) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (fc->connected) { fc->connected = 0; end_io_requests(fc); @@ -905,18 +907,18 @@ void fuse_abort_conn(struct fuse_conn *fc) wake_up_all(&fc->waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); } - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } static int fuse_dev_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = fuse_get_conn(file); if (fc) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); fc->connected = 0; end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); fasync_helper(-1, file, 0, &fc->fasync); kobject_put(&fc->kobj); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e5cb46b7843..6ed812fd620 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi + Copyright (C) 2001-2006 Miklos Szeredi This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -144,7 +144,7 @@ struct fuse_req { /* * The following bitfields are either set once before the * request is queued or setting/clearing them is protected by - * fuse_lock + * fuse_conn->lock */ /** True if the request has reply */ @@ -213,6 +213,9 @@ struct fuse_req { * unmounted. 
*/ struct fuse_conn { + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + /** The user id for this mount */ uid_t user_id; @@ -351,21 +354,6 @@ static inline u64 get_node_id(struct inode *inode) /** Device operations */ extern const struct file_operations fuse_dev_operations; -/** - * This is the single global spinlock which protects FUSE's structures - * - * The following data is protected by this lock: - * - * - the private_data field of the device file - * - the s_fs_info field of the super block - * - unused_list, pending, processing lists in fuse_conn - * - background list in fuse_conn - * - the unique request ID counter reqctr in fuse_conn - * - the sb (super_block) field in fuse_conn - * - the file (device file) field in fuse_conn - */ -extern spinlock_t fuse_lock; - /** * Get a filled in inode */ @@ -490,7 +478,7 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req); /** * Release inodes and file associated with background request */ -void fuse_release_background(struct fuse_req *req); +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 620579a6910..cc58debeabd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi + Copyright (C) 2001-2006 Miklos Szeredi This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -22,7 +22,6 @@ MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); -spinlock_t fuse_lock; static kmem_cache_t *fuse_inode_cachep; static struct subsystem connections_subsys; @@ -207,13 +206,14 @@ static void fuse_put_super(struct super_block *sb) down_write(&fc->sbput_sem); while (!list_empty(&fc->background)) - fuse_release_background(list_entry(fc->background.next, + fuse_release_background(fc, + list_entry(fc->background.next, struct fuse_req, bg_entry)); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); fc->mounted = 0; fc->connected = 0; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); up_write(&fc->sbput_sem); /* Flush all readers on this fs */ kill_fasync(&fc->fasync, SIGIO, POLL_IN); @@ -388,6 +388,7 @@ static struct fuse_conn *new_conn(void) fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (fc) { int i; + spin_lock_init(&fc->lock); init_waitqueue_head(&fc->waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); @@ -734,7 +735,6 @@ static int __init fuse_init(void) printk("fuse init (API version %i.%i)\n", FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); - spin_lock_init(&fuse_lock); res = fuse_fs_init(); if (res) goto err; -- cgit v1.2.3 From a87046d822f2d982d25b24c4a644d34f22d4888a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:56 -0700 Subject: [PATCH] fuse: consolidate device errors Return consistent error values for the case when the opened device file has no mount associated yet. 
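The convention being settled on is that any operation on a /dev/fuse descriptor with no connection attached yet fails the same way. A tiny sketch of the idea with hypothetical names:

#include <linux/fs.h>

struct demo_conn { int connected; };

/* NULL until a mount attaches a connection to this open file */
static struct demo_conn *demo_get_conn(struct file *file)
{
	return file->private_data;
}

static ssize_t demo_write(struct file *file, const char __user *buf,
			  size_t count, loff_t *ppos)
{
	struct demo_conn *dc = demo_get_conn(file);

	if (!dc)
		return -EPERM;	/* same errno as every other entry point */
	return count;
}

static int demo_fasync_handler(int fd, struct file *file, int on)
{
	struct demo_conn *dc = demo_get_conn(file);

	if (!dc)
		return -EPERM;	/* previously -ENODEV here; now uniform */
	return 0;
}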
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 63d2cf43b5e..6b8843d4ad8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -739,7 +739,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, struct fuse_copy_state cs; struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return -EPERM; fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); if (nbytes < sizeof(struct fuse_out_header)) @@ -930,7 +930,7 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) { struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return -EPERM; /* No locking - fasync_helper does its own locking */ return fasync_helper(fd, file, on, &fc->fasync); -- cgit v1.2.3 From ce1d5a491f0ee50560416a73faa5e4ddbab074bd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:58 -0700 Subject: [PATCH] fuse: clean up request accounting FUSE allocated most requests from a fixed size pool filled at mount time. However in some cases (release/forget) non-pool requests were used. File locking operations aren't well served by the request pool, since they may block indefinetly thus exhausting the pool. This patch removes the request pool and always allocates requests on demand. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 73 +++++----------------------------- fs/fuse/dir.c | 118 +++++++++++++++++++++++++++---------------------------- fs/fuse/file.c | 48 +++++++++++----------- fs/fuse/fuse_i.h | 26 +++--------- fs/fuse/inode.c | 54 +++++++------------------ 5 files changed, 111 insertions(+), 208 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6b8843d4ad8..4dc104c0e95 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -72,10 +72,8 @@ static void restore_sigs(sigset_t *oldset) */ void fuse_reset_request(struct fuse_req *req) { - int preallocated = req->preallocated; BUG_ON(atomic_read(&req->count) != 1); fuse_request_init(req); - req->preallocated = preallocated; } static void __fuse_get_request(struct fuse_req *req) @@ -90,71 +88,28 @@ static void __fuse_put_request(struct fuse_req *req) atomic_dec(&req->count); } -static struct fuse_req *do_get_request(struct fuse_conn *fc) +struct fuse_req *fuse_get_req(struct fuse_conn *fc) { - struct fuse_req *req; + struct fuse_req *req = fuse_request_alloc(); + if (!req) + return ERR_PTR(-ENOMEM); - spin_lock(&fc->lock); - BUG_ON(list_empty(&fc->unused_list)); - req = list_entry(fc->unused_list.next, struct fuse_req, list); - list_del_init(&req->list); - spin_unlock(&fc->lock); + atomic_inc(&fc->num_waiting); fuse_request_init(req); - req->preallocated = 1; req->in.h.uid = current->fsuid; req->in.h.gid = current->fsgid; req->in.h.pid = current->pid; return req; } -/* This can return NULL, but only in case it's interrupted by a SIGKILL */ -struct fuse_req *fuse_get_request(struct fuse_conn *fc) -{ - int intr; - sigset_t oldset; - - atomic_inc(&fc->num_waiting); - block_sigs(&oldset); - intr = down_interruptible(&fc->outstanding_sem); - restore_sigs(&oldset); - if (intr) { - atomic_dec(&fc->num_waiting); - return NULL; - } - return do_get_request(fc); -} - -/* Must be called with fc->lock held */ -static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) -{ - if (req->preallocated) { - atomic_dec(&fc->num_waiting); - 
list_add(&req->list, &fc->unused_list); - } else - fuse_request_free(req); - - /* If we are in debt decrease that first */ - if (fc->outstanding_debt) - fc->outstanding_debt--; - else - up(&fc->outstanding_sem); -} - void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - spin_lock(&fc->lock); - fuse_putback_request(fc, req); - spin_unlock(&fc->lock); + atomic_dec(&fc->num_waiting); + fuse_request_free(req); } } -static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req) -{ - if (atomic_dec_and_test(&req->count)) - fuse_putback_request(fc, req); -} - void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { iput(req->inode); @@ -189,9 +144,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) list_del(&req->list); req->state = FUSE_REQ_FINISHED; if (!req->background) { - wake_up(&req->waitq); - fuse_put_request_locked(fc, req); spin_unlock(&fc->lock); + wake_up(&req->waitq); + fuse_put_request(fc, req); } else { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; @@ -302,16 +257,6 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) req->in.h.unique = fc->reqctr; req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); - if (!req->preallocated) { - /* If request is not preallocated (either FORGET or - RELEASE), then still decrease outstanding_sem, so - user can't open infinite number of files while not - processing the RELEASE requests. However for - efficiency do it without blocking, so if down() - would block, just increase the debt instead */ - if (down_trylock(&fc->outstanding_sem)) - fc->outstanding_debt++; - } list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; wake_up(&fc->waitq); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 256355b8025..8d7546e832e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -117,8 +117,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) return 0; fc = get_fuse_conn(inode); - req = fuse_get_request(fc); - if (!req) + req = fuse_get_req(fc); + if (IS_ERR(req)) return 0; fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg); @@ -188,9 +188,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, if (entry->d_name.len > FUSE_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - req = fuse_get_request(fc); - if (!req) - return ERR_PTR(-EINTR); + req = fuse_get_req(fc); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); @@ -244,15 +244,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct file *file; int flags = nd->intent.open.flags - 1; - err = -ENOSYS; if (fc->no_create) - goto out; + return -ENOSYS; - err = -EINTR; - req = fuse_get_request(fc); - if (!req) - goto out; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + err = -ENOMEM; ff = fuse_file_alloc(); if (!ff) goto out_put_request; @@ -314,7 +313,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, fuse_file_free(ff); out_put_request: fuse_put_request(fc, req); - out: return err; } @@ -375,9 +373,9 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if 
(IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; @@ -407,9 +405,9 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; @@ -427,9 +425,9 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_SYMLINK; req->in.numargs = 2; @@ -444,9 +442,9 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_UNLINK; req->in.h.nodeid = get_node_id(dir); @@ -476,9 +474,9 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_RMDIR; req->in.h.nodeid = get_node_id(dir); @@ -504,9 +502,9 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, int err; struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.newdir = get_node_id(newdir); @@ -553,9 +551,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_link_in inarg; struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); @@ -583,9 +581,9 @@ int fuse_do_getattr(struct inode *inode) int err; struct fuse_attr_out arg; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_GETATTR; req->in.h.nodeid = get_node_id(inode); @@ -673,9 +671,9 @@ static int fuse_access(struct inode *inode, int mask) if (fc->no_access) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask; @@ -780,9 +778,9 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); page = alloc_page(GFP_KERNEL); if (!page) { @@ -809,11 +807,11 @@ static char *read_link(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = 
fuse_get_request(fc); + struct fuse_req *req = fuse_get_req(fc); char *link; - if (!req) - return ERR_PTR(-EINTR); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); link = (char *) __get_free_page(GFP_KERNEL); if (!link) { @@ -933,9 +931,9 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) } } - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); iattr_to_fattr(attr, &inarg); @@ -995,9 +993,9 @@ static int fuse_setxattr(struct dentry *entry, const char *name, if (fc->no_setxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1035,9 +1033,9 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, if (fc->no_getxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1085,9 +1083,9 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) if (fc->no_listxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1131,9 +1129,9 @@ static int fuse_removexattr(struct dentry *entry, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_REMOVEXATTR; req->in.h.nodeid = get_node_id(inode); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3ac39c0288d..e4f041a11bb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -22,9 +22,9 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir, struct fuse_req *req; int err; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -184,9 +184,9 @@ static int fuse_flush(struct file *file) if (fc->no_flush) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -223,9 +223,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -297,9 +297,9 @@ static int fuse_readpage(struct file *file, struct page *page) if (is_bad_inode(inode)) goto out; - err = -EINTR; - req = fuse_get_request(fc); - if (!req) + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) goto out; req->out.page_zeroing = 1; @@ -368,10 +368,10 @@ static int fuse_readpages_fill(void *_data, struct page *page) (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { fuse_send_readpages(req, data->file, inode); - data->req = req = fuse_get_request(fc); - if (!req) { + data->req = req = fuse_get_req(fc); + if (IS_ERR(req)) { unlock_page(page); - return 
-EINTR; + return PTR_ERR(req); } } req->pages[req->num_pages] = page; @@ -392,9 +392,9 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - data.req = fuse_get_request(fc); - if (!data.req) - return -EINTR; + data.req = fuse_get_req(fc); + if (IS_ERR(data.req)) + return PTR_ERR(data.req); err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); if (!err) { @@ -455,9 +455,9 @@ static int fuse_commit_write(struct file *file, struct page *page, if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->num_pages = 1; req->pages[0] = page; @@ -532,9 +532,9 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); while (count) { size_t nres; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6ed812fd620..242e69cb125 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -18,9 +18,6 @@ /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 -/** If more requests are outstanding, then the operation will block */ -#define FUSE_MAX_OUTSTANDING 10 - /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 @@ -131,8 +128,8 @@ struct fuse_conn; * A request to the client */ struct fuse_req { - /** This can be on either unused_list, pending processing or - io lists in fuse_conn */ + /** This can be on either pending processing or io lists in + fuse_conn */ struct list_head list; /** Entry on the background list */ @@ -150,9 +147,6 @@ struct fuse_req { /** True if the request has reply */ unsigned isreply:1; - /** The request is preallocated */ - unsigned preallocated:1; - /** The request was interrupted */ unsigned interrupted:1; @@ -247,19 +241,9 @@ struct fuse_conn { interrupted request) */ struct list_head background; - /** Controls the maximum number of outstanding requests */ - struct semaphore outstanding_sem; - - /** This counts the number of outstanding requests if - outstanding_sem would go negative */ - unsigned outstanding_debt; - /** RW semaphore for exclusion with fuse_put_super() */ struct rw_semaphore sbput_sem; - /** The list of unused requests */ - struct list_head unused_list; - /** The next unique request id */ u64 reqctr; @@ -452,11 +436,11 @@ void fuse_reset_request(struct fuse_req *req); /** * Reserve a preallocated request */ -struct fuse_req *fuse_get_request(struct fuse_conn *fc); +struct fuse_req *fuse_get_req(struct fuse_conn *fc); /** - * Decrement reference count of a request. If count goes to zero put - * on unused list (preallocated) or free request (not preallocated). + * Decrement reference count of a request. If count goes to zero free + * the request. 
*/ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cc58debeabd..824ebbc428e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -243,9 +243,9 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf) struct fuse_statfs_out outarg; int err; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&outarg, 0, sizeof(outarg)); req->in.numargs = 0; @@ -370,15 +370,7 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) static void fuse_conn_release(struct kobject *kobj) { - struct fuse_conn *fc = get_fuse_conn_kobj(kobj); - - while (!list_empty(&fc->unused_list)) { - struct fuse_req *req; - req = list_entry(fc->unused_list.next, struct fuse_req, list); - list_del(&req->list); - fuse_request_free(req); - } - kfree(fc); + kfree(get_fuse_conn_kobj(kobj)); } static struct fuse_conn *new_conn(void) @@ -387,27 +379,16 @@ static struct fuse_conn *new_conn(void) fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (fc) { - int i; spin_lock_init(&fc->lock); init_waitqueue_head(&fc->waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->unused_list); INIT_LIST_HEAD(&fc->background); - sema_init(&fc->outstanding_sem, 1); /* One for INIT */ init_rwsem(&fc->sbput_sem); kobj_set_kset_s(fc, connections_subsys); kobject_init(&fc->kobj); atomic_set(&fc->num_waiting, 0); - for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) { - struct fuse_req *req = fuse_request_alloc(); - if (!req) { - kobject_put(&fc->kobj); - return NULL; - } - list_add(&req->list, &fc->unused_list); - } fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; fc->reqctr = 0; @@ -438,7 +419,6 @@ static struct super_operations fuse_super_operations = { static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) { - int i; struct fuse_init_out *arg = &req->misc.init_out; if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) @@ -457,22 +437,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; } - - /* After INIT reply is received other requests can go - out. So do (FUSE_MAX_OUTSTANDING - 1) number of - up()s on outstanding_sem. 
The last up() is done in - fuse_putback_request() */ - for (i = 1; i < FUSE_MAX_OUTSTANDING; i++) - up(&fc->outstanding_sem); - fuse_put_request(fc, req); } -static void fuse_send_init(struct fuse_conn *fc) +static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) { - /* This is called from fuse_read_super() so there's guaranteed - to be exactly one request available */ - struct fuse_req *req = fuse_get_request(fc); struct fuse_init_in *arg = &req->misc.init_in; arg->major = FUSE_KERNEL_VERSION; @@ -508,6 +477,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) struct fuse_mount_data d; struct file *file; struct dentry *root_dentry; + struct fuse_req *init_req; int err; if (!parse_fuse_opt((char *) data, &d)) @@ -554,13 +524,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err; } + init_req = fuse_request_alloc(); + if (!init_req) + goto err_put_root; + err = kobject_set_name(&fc->kobj, "%llu", conn_id()); if (err) - goto err_put_root; + goto err_free_req; err = kobject_add(&fc->kobj); if (err) - goto err_put_root; + goto err_free_req; sb->s_root = root_dentry; fc->mounted = 1; @@ -574,10 +548,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) */ fput(file); - fuse_send_init(fc); + fuse_send_init(fc, init_req); return 0; + err_free_req: + fuse_request_free(init_req); err_put_root: dput(root_dentry); err: -- cgit v1.2.3 From 08a53cdce62d37d918530bbbf726cc01b21dc3d1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:59 -0700 Subject: [PATCH] fuse: account background requests The previous patch removed limiting the number of outstanding requests. This patch adds a much simpler limiting, that is also compatible with file locking operations. A task may have at most one synchronous request allocated. So these requests need not be otherwise limited. However the number of background requests (release, forget, asynchronous reads, interrupted requests) can grow indefinitely. This can be used by a malicous user to cause FUSE to allocate arbitrary amounts of unswappable kernel memory, denying service. For this reason add a limit for the number of background requests, and block allocations of new requests until the number goes bellow the limit. Also use this mechanism to block all requests until the INIT reply is received. 
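The throttling described above is simple enough to model outside the kernel. The sketch below is a userspace analogue only, not the patch itself: a pthread mutex and condition variable stand in for fc->lock and fc->blocked_waitq, the signal handling of the real wait_event_interruptible() is left out, and main() exists only so the model compiles and runs.

/* Userspace model of the background-request throttle this patch introduces.
 * Assumption: pthread primitives replace the kernel spinlock and waitqueue. */
#include <pthread.h>
#include <stdio.h>

#define FUSE_MAX_BACKGROUND 10

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t blocked_waitq = PTHREAD_COND_INITIALIZER;
static unsigned int num_background;
static int blocked = 1;			/* stays set until the INIT reply */

/* like fuse_get_req(): a synchronous caller sleeps while the connection is
 * blocked (the real code also handles signals) */
static void get_req_wait(void)
{
	pthread_mutex_lock(&lock);
	while (blocked)
		pthread_cond_wait(&blocked_waitq, &lock);
	pthread_mutex_unlock(&lock);
}

/* like background_request(): hitting the limit blocks new allocations */
static void background_request(void)
{
	pthread_mutex_lock(&lock);
	if (++num_background == FUSE_MAX_BACKGROUND)
		blocked = 1;
	pthread_mutex_unlock(&lock);
}

/* like fuse_release_background(): dropping below the limit wakes waiters */
static void release_background(void)
{
	pthread_mutex_lock(&lock);
	if (num_background-- == FUSE_MAX_BACKGROUND) {
		blocked = 0;
		pthread_cond_broadcast(&blocked_waitq);
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	blocked = 0;			/* pretend the INIT reply arrived */
	background_request();
	get_req_wait();
	release_background();
	printf("background requests outstanding: %u\n", num_background);
	return 0;
}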
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 24 ++++++++++++++++++++---- fs/fuse/fuse_i.h | 14 ++++++++++++++ fs/fuse/inode.c | 4 ++++ 3 files changed, 38 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4dc104c0e95..6c740f86066 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -90,7 +90,17 @@ static void __fuse_put_request(struct fuse_req *req) struct fuse_req *fuse_get_req(struct fuse_conn *fc) { - struct fuse_req *req = fuse_request_alloc(); + struct fuse_req *req; + sigset_t oldset; + int err; + + block_sigs(&oldset); + err = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); + restore_sigs(&oldset); + if (err) + return ERR_PTR(-EINTR); + + req = fuse_request_alloc(); if (!req) return ERR_PTR(-ENOMEM); @@ -118,6 +128,11 @@ void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) fput(req->file); spin_lock(&fc->lock); list_del(&req->bg_entry); + if (fc->num_background == FUSE_MAX_BACKGROUND) { + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); + } + fc->num_background--; spin_unlock(&fc->lock); } @@ -195,6 +210,9 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req) { req->background = 1; list_add(&req->bg_entry, &fc->background); + fc->num_background++; + if (fc->num_background == FUSE_MAX_BACKGROUND) + fc->blocked = 1; if (req->inode) req->inode = igrab(req->inode); if (req->inode2) @@ -288,6 +306,7 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) { spin_lock(&fc->lock); + background_request(fc, req); if (fc->connected) { queue_request(fc, req); spin_unlock(&fc->lock); @@ -306,9 +325,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) void request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fc->lock); - background_request(fc, req); - spin_unlock(&fc->lock); request_send_nowait(fc, req); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 242e69cb125..19c7185a754 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -18,6 +18,9 @@ /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 +/** Maximum number of outstanding background requests */ +#define FUSE_MAX_BACKGROUND 10 + /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 @@ -241,6 +244,17 @@ struct fuse_conn { interrupted request) */ struct list_head background; + /** Number of requests currently in the background */ + unsigned num_background; + + /** Flag indicating if connection is blocked. 
This will be + the case before the INIT reply is received, and if there + are too many outstading backgrounds requests */ + int blocked; + + /** waitq for blocked connection */ + wait_queue_head_t blocked_waitq; + /** RW semaphore for exclusion with fuse_put_super() */ struct rw_semaphore sbput_sem; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 824ebbc428e..fd34037b058 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -381,6 +381,7 @@ static struct fuse_conn *new_conn(void) if (fc) { spin_lock_init(&fc->lock); init_waitqueue_head(&fc->waitq); + init_waitqueue_head(&fc->blocked_waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); @@ -392,6 +393,7 @@ static struct fuse_conn *new_conn(void) fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; fc->reqctr = 0; + fc->blocked = 1; } return fc; } @@ -438,6 +440,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; } fuse_put_request(fc, req); + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); } static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) -- cgit v1.2.3 From 7775f4c85dcbd1175f21b2fbb7221c79ec70b722 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:20 -0700 Subject: [PATCH] knfsd: Correct reserved reply space for read requests. NFSd makes sure there is enough space to hold the maximum possible reply before accepting a request. The units for this maximum is (4byte) words. However in three places, particularly for read request, the number given is a number of bytes. This means too much space is reserved which is slightly wasteful. This is the sort of patch that could uncover a deeper bug, and it is not critical, so it would be best for it to spend a while in -mm before going in to mainline. 
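To see how much is over-reserved, assume for illustration an NFSSVC_MAXBLKSIZE of 32KB; the snippet below only works the arithmetic and is not taken from the kernel headers.

/* Reserved reply space is counted in 4-byte XDR words, not bytes.
 * The 32KB value is illustrative, not the kernel's actual define. */
#include <stdio.h>

#define NFSSVC_MAXBLKSIZE (32 * 1024)	/* bytes */

int main(void)
{
	unsigned int old_resv = NFSSVC_MAXBLKSIZE;	/* bytes mistakenly used as a word count */
	unsigned int new_resv = NFSSVC_MAXBLKSIZE / 4;	/* correct word count */

	printf("old reservation: %u words = %u bytes\n", old_resv, old_resv * 4);
	printf("new reservation: %u words = %u bytes\n", new_resv, new_resv * 4);
	return 0;
}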
(akpm: target 2.6.17-rc2, 2.6.16.3 (approx)) Discovered-by: "Eivind Sarto" Signed-off-by: Neil Brown Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs3proc.c | 2 +- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfsproc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 6d2dfed1de0..f61142afea4 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -682,7 +682,7 @@ static struct svc_procedure nfsd_procedures3[22] = { PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT), PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1), PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4), - PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE), + PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4), PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4), PROC(create, create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6d63f1d9e5f..ca8a4c410de 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -975,7 +975,7 @@ struct nfsd4_voidargs { int dummy; }; */ static struct svc_procedure nfsd_procedures4[2] = { PROC(null, void, void, void, RC_NOCACHE, 1), - PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE) + PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4) }; struct svc_version nfsd_version4 = { diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 3e6b75cd90f..06cd0db0f32 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -553,7 +553,7 @@ static struct svc_procedure nfsd_procedures2[18] = { PROC(none, void, void, none, RC_NOCACHE, ST), PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT), PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4), - PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE), + PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4), PROC(none, void, void, none, RC_NOCACHE, ST), PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), -- cgit v1.2.3 From d5b9026a670fdb404e6e2e2e0a1b447e9ea9c1f6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:22 -0700 Subject: [PATCH] knfsd: locks: flag NFSv4-owned locks Use the fl_lmops field to identify which locks are ours, instead of trying to look them up in our private hash. This is safer and more efficient. Earlier versions of this patch used a lock flag instead, but Trond pointed out that adding a new flag for each lock manager wasn't going to scale well, and suggested this approach instead; a separate patch converts lockd to using fl_lmops in the same way. In the NFSv4 case this looks like a bit of a hack, since the NFSv4 server isn't currently actually defining a lock_manager_operations struct, so we end up defining one *just* to serve as a cookie to identify our locks. But it works, and we actually do expect to start using the lock_manager_operations at some point anyway. Signed-off-by: Marc Eshel Signed-off-by: Andy Adamson Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 47ec112b266..ffedce08b4c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2495,36 +2495,27 @@ nfs4_transform_lock_offset(struct file_lock *lock) lock->fl_end = OFFSET_MAX; } -static int -nfs4_verify_lock_stateowner(struct nfs4_stateowner *sop, unsigned int hashval) -{ - struct nfs4_stateowner *local = NULL; - int status = 0; - - if (hashval >= LOCK_HASH_SIZE) - goto out; - list_for_each_entry(local, &lock_ownerid_hashtbl[hashval], so_idhash) { - if (local == sop) { - status = 1; - goto out; - } - } -out: - return status; -} - +/* Hack!: For now, we're defining this just so we can use a pointer to it + * as a unique cookie to identify our (NFSv4's) posix locks. */ +struct lock_manager_operations nfsd_posix_mng_ops = { +}; static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { - struct nfs4_stateowner *sop = (struct nfs4_stateowner *) fl->fl_owner; - unsigned int hval = lockownerid_hashval(sop->so_id); + struct nfs4_stateowner *sop; + unsigned int hval; - deny->ld_sop = NULL; - if (nfs4_verify_lock_stateowner(sop, hval)) { + if (fl->fl_lmops == &nfsd_posix_mng_ops) { + sop = (struct nfs4_stateowner *) fl->fl_owner; + hval = lockownerid_hashval(sop->so_id); kref_get(&sop->so_ref); deny->ld_sop = sop; deny->ld_clientid = sop->so_client->cl_clientid; + } else { + deny->ld_sop = NULL; + deny->ld_clientid.cl_boot = 0; + deny->ld_clientid.cl_id = 0; } deny->ld_start = fl->fl_start; deny->ld_length = ~(u64)0; @@ -2736,6 +2727,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lock->lk_offset; if ((lock->lk_length == ~(u64)0) || @@ -2841,6 +2833,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lockt->lt_offset; if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) @@ -2900,6 +2893,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = locku->lu_offset; if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) -- cgit v1.2.3 From e465a77f943f51df1a169426df879340bd0db3f3 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Apr 2006 22:55:23 -0700 Subject: [PATCH] fs/nfsd/nfs4state.c: make a struct static Signed-off-by: Adrian Bunk Cc: Marc Eshel Cc: Andy Adamson Cc: J. 
Bruce Fields Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ffedce08b4c..a8c2122a481 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2497,7 +2497,7 @@ nfs4_transform_lock_offset(struct file_lock *lock) /* Hack!: For now, we're defining this just so we can use a pointer to it * as a unique cookie to identify our (NFSv4's) posix locks. */ -struct lock_manager_operations nfsd_posix_mng_ops = { +static struct lock_manager_operations nfsd_posix_mng_ops = { }; static inline void -- cgit v1.2.3 From 249920527f9e6e5c305538bbf1ea882ee7dc1c06 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:24 -0700 Subject: [PATCH] knfsd: nfsd4: Wrong error handling in nfs4acl this fixes coverity id #3. Coverity detected dead code, since the == -1 comparison only returns 0 or 1 to error. Therefore the if ( error < 0 ) statement was always false. Seems that this was an if( error = nfs4... ) statement some time ago, which got broken during cleanup. Signed-off-by: Eric Sesterhenn Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 7391f4aabed..63818a51c05 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -790,7 +790,7 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl) continue; error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, - ace->access_mask, ace->whotype, ace->who) == -1; + ace->access_mask, ace->whotype, ace->who); if (error < 0) goto out; -- cgit v1.2.3 From b905b7b0a054d2ab3e0c9304def998546c93f6b5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:25 -0700 Subject: [PATCH] knfsd: nfsd4: better nfs4acl errors We're returning -1 in a few places in the NFSv4<->POSIX acl translation code where we could return a reasonable error. Also allows some minor simplification elsewhere. Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4acl.c | 6 +++--- fs/nfsd/nfs4xdr.c | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 63818a51c05..edb107e61b9 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -710,9 +710,9 @@ calculate_posix_ace_count(struct nfs4_acl *n4acl) /* Also, the remaining entries are for named users and * groups, and come in threes (mask, allow, deny): */ if (n4acl->naces < 7) - return -1; + return -EINVAL; if ((n4acl->naces - 7) % 3) - return -1; + return -EINVAL; return 4 + (n4acl->naces - 7)/3; } } @@ -866,7 +866,7 @@ nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, struct nfs4_ace *ace; if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL) - return -1; + return -ENOMEM; ace->type = type; ace->flag = flag; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 03857fd8112..845f25251d8 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -299,11 +299,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia buf, dummy32, &ace.who); if (status) goto out_nfserr; - if (nfs4_acl_add_ace(*acl, ace.type, ace.flag, - ace.access_mask, ace.whotype, ace.who) != 0) { - status = -ENOMEM; + status = nfs4_acl_add_ace(*acl, ace.type, ace.flag, + ace.access_mask, ace.whotype, ace.who); + if (status) goto out_nfserr; - } } } else *acl = NULL; -- cgit v1.2.3 From b5872b0dcc0501035d5ae53c60f8cbbb3798da8a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:26 -0700 Subject: [PATCH] knfsd: nfsd4: fix acl xattr length return We should be using the length from the second vfs_getxattr, in case it changed. (Note: there's still a small race here; we could end up returning -ENOMEM if the length increased between the first and second call. I don't know whether it's worth spending a lot of effort to fix that.) This makes XFS ACLs usable on NFS exports, which they currently aren't, since XFS appears to be returning a too-large value for vfs_getxattr() when it's passed a NULL buffer. So there's probably an XFS bug here too, though since getxattr with a NULL buffer is usually used to decide how much memory to allocate, it may be a fairly harmless bug in most cases. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/vfs.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 31018333dc3..6aa92d0e687 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -371,7 +371,6 @@ out_nfserr: static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) { ssize_t buflen; - int error; buflen = vfs_getxattr(dentry, key, NULL, 0); if (buflen <= 0) @@ -381,10 +380,7 @@ static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) if (!*buf) return -ENOMEM; - error = vfs_getxattr(dentry, key, *buf, buflen); - if (error < 0) - return error; - return buflen; + return vfs_getxattr(dentry, key, *buf, buflen); } #endif -- cgit v1.2.3 From cd15654963cf7e4dd938a403de3ec5bcd09f8350 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:27 -0700 Subject: [PATCH] knfsd: nfsd: oops exporting nonexistent directory Export a directory that does not exist: exportfs -orw,fsid=0,insecure,no_subtree_check client:/home/NFS4 Try to mount from client with nfs4. 
Mount hangs (I'm not sure why - that's another issue). While client is hung, back on server mkdir /home/NFS4. The server panics in dput. I traced the problem back to svc_export_parse() calling path_release() even though path_lookup() failed (it happens to fill in the nameidata structure with a negative dentry - so the test after out: succeeds). After patching and recreating the problem, the client mount still takes some time before finally exiting with a message "couldn't read superblock". Here is a simple patch to resolve this issue: Signed-off-by: Frank Filz Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c340be0a3f5..4e0578121d9 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -422,7 +422,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) goto out; err = path_lookup(buf, 0, &nd); - if (err) goto out; + if (err) goto out_no_path; exp.h.flags = 0; exp.ex_client = dom; @@ -475,6 +475,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) out: if (nd.dentry) path_release(&nd); + out_no_path: if (dom) auth_domain_put(dom); kfree(buf); -- cgit v1.2.3 From 54cceebb679a8d10fa382422aa2035cdc65fe7ce Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:30 -0700 Subject: [PATCH] knfsd: nfsd: nfsd_setuser doesn't really need to modify rqstp->rq_cred. In addition to setting the process's filesystem ids, nfsd_setuser also modifies the value of the rq_cred which stores the ids that originally came from the rpc call, for example to reflect root squashing. There's no real reason to do that--the only case where rqstp->rq_cred is actually used later on is in the NFSv4 SETCLIENTID/SETCLIENTID_CONFIRM operations, and there the results are the opposite of what we want--those two operations don't deal with the filesystem at all, they only record the credentials used with the rpc call for later reference (so that we may require the same credentials be used on later operations), and the credentials shouldn't vary just because there was or wasn't a previous operation in the compound that referred to some export. This fixes a bug which caused mounts from Solaris clients to fail. Signed-off-by: Andy Adamson Signed-off-by: J.
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/auth.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index cfe9ce88161..6e92b0fe532 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -14,46 +14,46 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) { - struct svc_cred *cred = &rqstp->rq_cred; + struct svc_cred cred = rqstp->rq_cred; int i; int ret; if (exp->ex_flags & NFSEXP_ALLSQUASH) { - cred->cr_uid = exp->ex_anon_uid; - cred->cr_gid = exp->ex_anon_gid; - put_group_info(cred->cr_group_info); - cred->cr_group_info = groups_alloc(0); + cred.cr_uid = exp->ex_anon_uid; + cred.cr_gid = exp->ex_anon_gid; + cred.cr_group_info = groups_alloc(0); } else if (exp->ex_flags & NFSEXP_ROOTSQUASH) { struct group_info *gi; - if (!cred->cr_uid) - cred->cr_uid = exp->ex_anon_uid; - if (!cred->cr_gid) - cred->cr_gid = exp->ex_anon_gid; - gi = groups_alloc(cred->cr_group_info->ngroups); + if (!cred.cr_uid) + cred.cr_uid = exp->ex_anon_uid; + if (!cred.cr_gid) + cred.cr_gid = exp->ex_anon_gid; + gi = groups_alloc(cred.cr_group_info->ngroups); if (gi) - for (i = 0; i < cred->cr_group_info->ngroups; i++) { - if (!GROUP_AT(cred->cr_group_info, i)) + for (i = 0; i < cred.cr_group_info->ngroups; i++) { + if (!GROUP_AT(cred.cr_group_info, i)) GROUP_AT(gi, i) = exp->ex_anon_gid; else - GROUP_AT(gi, i) = GROUP_AT(cred->cr_group_info, i); + GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i); } - put_group_info(cred->cr_group_info); - cred->cr_group_info = gi; - } + cred.cr_group_info = gi; + } else + get_group_info(cred.cr_group_info); - if (cred->cr_uid != (uid_t) -1) - current->fsuid = cred->cr_uid; + if (cred.cr_uid != (uid_t) -1) + current->fsuid = cred.cr_uid; else current->fsuid = exp->ex_anon_uid; - if (cred->cr_gid != (gid_t) -1) - current->fsgid = cred->cr_gid; + if (cred.cr_gid != (gid_t) -1) + current->fsgid = cred.cr_gid; else current->fsgid = exp->ex_anon_gid; - if (!cred->cr_group_info) + if (!cred.cr_group_info) return -ENOMEM; - ret = set_current_groups(cred->cr_group_info); - if ((cred->cr_uid)) { + ret = set_current_groups(cred.cr_group_info); + put_group_info(cred.cr_group_info); + if ((cred.cr_uid)) { cap_t(current->cap_effective) &= ~CAP_NFSD_MASK; } else { cap_t(current->cap_effective) |= (CAP_NFSD_MASK & -- cgit v1.2.3 From f0e2993e9e73e8f38b05a89c98b9db94fec2199d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:31 -0700 Subject: [PATCH] knfsd: nfsd4: remove nfsd_setuser from putrootfh Since nfsd_setuser() is already called from any operation that uses the current filehandle (because it's called from fh_verify), there's no reason to call it from putrootfh. Signed-off-by: Andy Adamson Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4proc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ca8a4c410de..b0e095ea0c0 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -288,8 +288,6 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh) fh_put(current_fh); status = exp_pseudoroot(rqstp->rq_client, current_fh, &rqstp->rq_chandle); - if (!status) - status = nfserrno(nfsd_setuser(rqstp, current_fh->fh_export)); return status; } -- cgit v1.2.3 From 6ed6decccf544970664757464cfb67e081775e6a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:32 -0700 Subject: [PATCH] knfsd: nfsd4: fix corruption of returned data when using 64k pages In v4 we grab an extra page just for the padding of returned data. The formula that the rpc server uses to allocate pages for the response doesn't take into account this extra page. Instead of adjusting those formulae, we adopt the same solution as v2 and v3, and put the "tail" data in the same page as the "head" data. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4xdr.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 845f25251d8..f2710cfc0bc 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2084,27 +2084,20 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read WRITE32(eof); WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; - + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; resp->xbuf->page_len = maxcount; - /* read zero bytes -> don't set up tail */ - if(!maxcount) - return 0; - - /* set up page for remaining responses */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } @@ -2141,21 +2134,20 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->page_len = maxcount; - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - - resp->xbuf->page_len = maxcount; if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += 
maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } -- cgit v1.2.3 From bb6e8a9f4005237401a45f1ea43db060ea5f9725 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:33 -0700 Subject: [PATCH] knfsd: nfsd4: fix corruption on readdir encoding with 64k pages Fix corruption on readdir encoding with 64k pages. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4xdr.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f2710cfc0bc..de3998f15f1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2157,7 +2157,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re { int maxcount; loff_t offset; - u32 *page, *savep; + u32 *page, *savep, *tailbase; ENCODE_HEAD; if (nfserr) @@ -2173,6 +2173,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re WRITE32(0); ADJUST_ARGS(); resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + tailbase = p; maxcount = PAGE_SIZE; if (maxcount > readdir->rd_maxcount) @@ -2217,14 +2218,12 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re *p++ = htonl(readdir->common.err == nfserr_eof); resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - /* allocate a page for the tail */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = tailbase; resp->xbuf->tail[0].iov_len = 0; resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; + resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4; return 0; err_no_verf: -- cgit v1.2.3 From 5e8d5c29482dc56de5971ddc99c6e7f69e4d16f6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:37 -0700 Subject: [PATCH] knfsd: nfsd4: fix laundromat shutdown race We need to make sure the laundromat work doesn't reschedule itself just when we try to cancel it. Also, we shouldn't be waiting for it to finish running while holding the state lock, as that's a potential deadlock. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a8c2122a481..01ff544dc1f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3238,8 +3238,6 @@ __nfs4_state_shutdown(void) } cancel_delayed_work(&laundromat_work); - flush_workqueue(laundry_wq); - destroy_workqueue(laundry_wq); nfsd4_shutdown_recdir(); nfs4_init = 0; } @@ -3247,6 +3245,8 @@ __nfs4_state_shutdown(void) void nfs4_state_shutdown(void) { + cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work); + destroy_workqueue(laundry_wq); nfs4_lock_state(); nfs4_release_reclaim(); __nfs4_state_shutdown(); -- cgit v1.2.3 From 541e0e09814594e907e18fb8d9fc9b746aa4b18a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:38 -0700 Subject: [PATCH] knfsd: nfsd4: nfsd4_probe_callback cleanup Some obvious cleanup. Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4callback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c872bd07fc1..dbaf3f93f32 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -441,8 +441,9 @@ nfsd4_probe_callback(struct nfs4_client *clp) goto out_clnt; } - /* the task holds a reference to the nfs4_client struct */ cb->cb_client = clnt; + + /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); msg.rpc_cred = nfsd4_lookupcred(clp,0); @@ -460,13 +461,12 @@ nfsd4_probe_callback(struct nfs4_client *clp) out_rpciod: atomic_dec(&clp->cl_count); rpciod_down(); + cb->cb_client = NULL; out_clnt: rpc_shutdown_client(clnt); - goto out_err; out_err: dprintk("NFSD: warning: no callback path to client %.*s\n", (int)clp->cl_name.len, clp->cl_name.data); - cb->cb_client = NULL; } static void -- cgit v1.2.3 From 4e2fd495b520b51e4ba83340f13520b7f07e3743 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:39 -0700 Subject: [PATCH] knfsd: nfsd4: add missing rpciod_down() We should be shutting down rpciod for the callback channel when we shut down the server. Also note that we do rpciod_up() and create the callback client *before* setting cb_set--the cb_set only determines whether the initial null was succesful. So cb_set is not a reliable determiner of whether we need to clean up, only cb_client is. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 01ff544dc1f..e97c58aafde 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -329,23 +329,30 @@ put_nfs4_client(struct nfs4_client *clp) free_client(clp); } +static void +shutdown_callback_client(struct nfs4_client *clp) +{ + struct rpc_clnt *clnt = clp->cl_callback.cb_client; + + /* shutdown rpc client, ending any outstanding recall rpcs */ + if (clnt) { + clp->cl_callback.cb_client = NULL; + rpc_shutdown_client(clnt); + rpciod_down(); + } +} + static void expire_client(struct nfs4_client *clp) { struct nfs4_stateowner *sop; struct nfs4_delegation *dp; - struct nfs4_callback *cb = &clp->cl_callback; - struct rpc_clnt *clnt = clp->cl_callback.cb_client; struct list_head reaplist; dprintk("NFSD: expire_client cl_count %d\n", atomic_read(&clp->cl_count)); - /* shutdown rpc client, ending any outstanding recall rpcs */ - if (atomic_read(&cb->cb_set) == 1 && clnt) { - rpc_shutdown_client(clnt); - clnt = clp->cl_callback.cb_client = NULL; - } + shutdown_callback_client(clp); INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); -- cgit v1.2.3 From ef0f3390ebedac78bff1936bbb26606bca83e891 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:41 -0700 Subject: [PATCH] knfsd: nfsd4: limit number of delegations handed out. It's very easy for the server to DOS itself by just giving out too many delegations. For now we just solve the problem with a dumb hard limit. Eventually we'll want a smarter policy. Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 74 +++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e97c58aafde..1e2a89aaf89 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -147,6 +147,42 @@ get_nfs4_file(struct nfs4_file *fi) kref_get(&fi->fi_ref); } +static int num_delegations; + +/* + * Open owner state (share locks) + */ + +/* hash tables for nfs4_stateowner */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) + +#define ownerid_hashval(id) \ + ((id) & OWNER_HASH_MASK) +#define ownerstr_hashval(clientid, ownername) \ + (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) + +static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; +static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; + +/* hash table for nfs4_file */ +#define FILE_HASH_BITS 8 +#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) +#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) +/* hash table for (open)nfs4_stateid */ +#define STATEID_HASH_BITS 10 +#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) +#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) + +#define file_hashval(x) \ + hash_ptr(x, FILE_HASH_BITS) +#define stateid_hashval(owner_id, file_id) \ + (((owner_id) + (file_id)) & STATEID_HASH_MASK) + +static struct list_head file_hashtbl[FILE_HASH_SIZE]; +static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -155,9 +191,12 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; dprintk("NFSD alloc_init_deleg\n"); + if (num_delegations > STATEID_HASH_SIZE * 4) + return NULL; dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); if (dp == NULL) return dp; + num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); @@ -192,6 +231,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp) dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); kmem_cache_free(deleg_slab, dp); + num_delegations--; } } @@ -943,40 +983,6 @@ out: return status; } -/* - * Open owner state (share locks) - */ - -/* hash tables for nfs4_stateowner */ -#define OWNER_HASH_BITS 8 -#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) -#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) - -#define ownerid_hashval(id) \ - ((id) & OWNER_HASH_MASK) -#define ownerstr_hashval(clientid, ownername) \ - (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) - -static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; - -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) -/* hash table for (open)nfs4_stateid */ -#define STATEID_HASH_BITS 10 -#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) - -#define file_hashval(x) \ - hash_ptr(x, FILE_HASH_BITS) -#define stateid_hashval(owner_id, file_id) \ - (((owner_id) + (file_id)) & STATEID_HASH_MASK) - -static struct list_head file_hashtbl[FILE_HASH_SIZE]; -static struct 
list_head stateid_hashtbl[STATEID_HASH_SIZE]; - /* OPEN Share state helper functions */ static inline struct nfs4_file * alloc_init_file(struct inode *ino) -- cgit v1.2.3 From 358dd55aa3a77fbbae482b83d96733d9ad441d05 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:42 -0700 Subject: [PATCH] knfsd: nfsd4: grant delegations more frequently Keep unused openowners around for at least one lease period, to avoid the need for as many open confirmations and to allow handing out more delegations. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1e2a89aaf89..96c7578cbe1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1199,8 +1199,7 @@ move_to_close_lru(struct nfs4_stateowner *sop) { dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); - unhash_stateowner(sop); - list_add_tail(&sop->so_close_lru, &close_lru); + list_move_tail(&sop->so_close_lru, &close_lru); sop->so_time = get_seconds(); } @@ -1929,8 +1928,7 @@ nfs4_laundromat(void) } dprintk("NFSD: purging unused open stateowner (so_id %d)\n", sop->so_id); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); + release_stateowner(sop); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -3218,15 +3216,8 @@ __nfs4_state_shutdown(void) int i; struct nfs4_client *clp = NULL; struct nfs4_delegation *dp = NULL; - struct nfs4_stateowner *sop = NULL; struct list_head *pos, *next, reaplist; - list_for_each_safe(pos, next, &close_lru) { - sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); - } - for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&conf_id_hashtbl[i])) { clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); -- cgit v1.2.3 From cbb7e577e732f576b9f399bc2600bdc0626c68dc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 14:57:50 +0200 Subject: [PATCH] splice: pass offset around for ->splice_read() and ->splice_write() We need not use ->f_pos as the offset for the file input/output. If the user passed an offset pointer in through sys_splice(), just use that and leave ->f_pos alone. 
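The pattern is the same one userspace programs use when choosing between pread() and read(): an explicit offset is consumed from a local copy and the file position is left alone, otherwise the implicit position is used and advanced. The sketch below is a hypothetical helper illustrating that choice, not kernel code.

/* Userspace analogue of the do_splice() offset handling: an explicit offset
 * is used via a local copy (file position untouched), otherwise the implicit
 * file offset is used and advanced.  read_at() and the use of /etc/hosts are
 * illustrative assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static ssize_t read_at(int fd, void *buf, size_t len, const off_t *off_in)
{
	if (off_in)
		return pread(fd, buf, len, *off_in);	/* like a caller-supplied offset */
	return read(fd, buf, len);			/* like falling back to &file->f_pos */
}

int main(void)
{
	char buf[32];
	off_t off = 0;
	int fd = open("/etc/hosts", O_RDONLY);

	if (fd < 0)
		return 1;
	printf("explicit offset: %zd bytes\n", read_at(fd, buf, sizeof(buf), &off));
	printf("implicit f_pos:  %zd bytes\n", read_at(fd, buf, sizeof(buf), NULL));
	close(fd);
	return 0;
}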
Signed-off-by: Jens Axboe --- fs/splice.c | 86 ++++++++++++++++++++++---------------------- fs/xfs/linux-2.6/xfs_file.c | 12 ++++--- fs/xfs/linux-2.6/xfs_lrw.c | 14 ++++---- fs/xfs/linux-2.6/xfs_lrw.h | 4 +-- fs/xfs/linux-2.6/xfs_vnode.h | 12 +++---- 5 files changed, 68 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index e50a460239d..5d3eda64703 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -231,8 +231,9 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, } static int -__generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +__generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; @@ -241,8 +242,8 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, pgoff_t index; int i, error; - index = in->f_pos >> PAGE_CACHE_SHIFT; - offset = in->f_pos & ~PAGE_CACHE_MASK; + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) @@ -348,8 +349,9 @@ fill_it: * * Will read pages from given file and fill them into a pipe. */ -ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { ssize_t spliced; int ret; @@ -358,12 +360,12 @@ ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, spliced = 0; while (len) { - ret = __generic_file_splice_read(in, pipe, len, flags); + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); if (ret <= 0) break; - in->f_pos += ret; + *ppos += ret; len -= ret; spliced += ret; @@ -561,7 +563,7 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. */ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags, + loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) { int ret, do_wakeup, err; @@ -573,7 +575,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, sd.total_len = len; sd.flags = flags; sd.file = out; - sd.pos = out->f_pos; + sd.pos = *ppos; if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); @@ -656,9 +658,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - out->f_pos = sd.pos; return ret; - } /** @@ -674,12 +674,12 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, */ ssize_t generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; ssize_t ret; - ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); + ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); /* * If file or inode is SYNC and we actually wrote some data, sync it. 
@@ -715,9 +715,9 @@ EXPORT_SYMBOL(generic_file_splice_write); * */ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { - return move_from_pipe(pipe, out, len, flags, pipe_to_sendpage); + return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -726,9 +726,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { - loff_t pos; int ret; if (unlikely(!out->f_op || !out->f_op->splice_write)) @@ -737,22 +736,21 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; - pos = out->f_pos; - - ret = rw_verify_area(WRITE, out, &pos, len); + ret = rw_verify_area(WRITE, out, ppos, len); if (unlikely(ret < 0)) return ret; - return out->f_op->splice_write(pipe, out, len, flags); + return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { - loff_t pos, isize, left; + loff_t isize, left; int ret; if (unlikely(!in->f_op || !in->f_op->splice_read)) @@ -761,28 +759,27 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; - pos = in->f_pos; - - ret = rw_verify_area(READ, in, &pos, len); + ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; isize = i_size_read(in->f_mapping->host); - if (unlikely(in->f_pos >= isize)) + if (unlikely(*ppos >= isize)) return 0; - left = isize - in->f_pos; + left = isize - *ppos; if (unlikely(left < len)) len = left; - return in->f_op->splice_read(in, pipe, len, flags); + return in->f_op->splice_read(in, ppos, pipe, len, flags); } -long do_splice_direct(struct file *in, struct file *out, size_t len, - unsigned int flags) +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + size_t len, unsigned int flags) { struct pipe_inode_info *pipe; long ret, bytes; + loff_t out_off; umode_t i_mode; int i; @@ -820,6 +817,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, */ ret = 0; bytes = 0; + out_off = 0; while (len) { size_t read_len, max_read_len; @@ -829,7 +827,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, */ max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); - ret = do_splice_to(in, pipe, max_read_len, flags); + ret = do_splice_to(in, ppos, pipe, max_read_len, flags); if (unlikely(ret < 0)) goto out_release; @@ -840,7 +838,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ - ret = do_splice_from(pipe, out, read_len, + ret = do_splice_from(pipe, out, &out_off, read_len, flags & ~SPLICE_F_NONBLOCK); if (unlikely(ret < 0)) goto out_release; @@ -898,6 +896,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, size_t len, unsigned int flags) { struct pipe_inode_info *pipe; + loff_t offset, *off; pipe = in->f_dentry->d_inode->i_pipe; if (pipe) 
{ @@ -906,12 +905,13 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_out) { if (out->f_op->llseek == no_llseek) return -EINVAL; - if (copy_from_user(&out->f_pos, off_out, - sizeof(loff_t))) + if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; - } + off = &offset; + } else + off = &out->f_pos; - return do_splice_from(pipe, out, len, flags); + return do_splice_from(pipe, out, off, len, flags); } pipe = out->f_dentry->d_inode->i_pipe; @@ -921,11 +921,13 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_in) { if (in->f_op->llseek == no_llseek) return -EINVAL; - if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; - } + off = &offset; + } else + off = &in->f_pos; - return do_splice_to(in, pipe, len, flags); + return do_splice_to(in, off, pipe, len, flags); } return -EINVAL; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 269721af02f..c847416f6d1 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -252,6 +252,7 @@ xfs_file_sendfile_invis( STATIC ssize_t xfs_file_splice_read( struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -259,13 +260,14 @@ xfs_file_splice_read( vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_READ(vp, infilp, pipe, len, flags, 0, NULL, rval); + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, 0, NULL, rval); return rval; } STATIC ssize_t xfs_file_splice_read_invis( struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -273,7 +275,7 @@ xfs_file_splice_read_invis( vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_READ(vp, infilp, pipe, len, flags, IO_INVIS, NULL, rval); + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, IO_INVIS, NULL, rval); return rval; } @@ -281,13 +283,14 @@ STATIC ssize_t xfs_file_splice_write( struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t len, unsigned int flags) { vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, 0, NULL, rval); + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, 0, NULL, rval); return rval; } @@ -295,13 +298,14 @@ STATIC ssize_t xfs_file_splice_write_invis( struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t len, unsigned int flags) { vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, IO_INVIS, NULL, rval); + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, IO_INVIS, NULL, rval); return rval; } diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 74a52937f20..67efe330898 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -338,6 +338,7 @@ ssize_t xfs_splice_read( bhv_desc_t *bdp, struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t count, int flags, @@ -360,7 +361,7 @@ xfs_splice_read( int error; error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), - infilp->f_pos, count, + *ppos, count, FILP_DELAY_FLAG(infilp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -368,8 +369,8 @@ xfs_splice_read( } } xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, &ip->i_iocore, - pipe, count, infilp->f_pos, ioflags); - ret = generic_file_splice_read(infilp, pipe, count, flags); + pipe, count, *ppos, ioflags); + 
ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -382,6 +383,7 @@ xfs_splice_write( bhv_desc_t *bdp, struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t count, int flags, int ioflags, @@ -403,7 +405,7 @@ xfs_splice_write( int error; error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp), - outfilp->f_pos, count, + *ppos, count, FILP_DELAY_FLAG(outfilp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -411,8 +413,8 @@ xfs_splice_write( } } xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, - pipe, count, outfilp->f_pos, ioflags); - ret = generic_file_splice_write(pipe, outfilp, count, flags); + pipe, count, *ppos, ioflags); + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); if (ret > 0) XFS_STATS_ADD(xs_write_bytes, ret); diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index 55c689a86ad..8f453995235 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -93,11 +93,11 @@ extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); -extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, +extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *, struct pipe_inode_info *, size_t, int, int, struct cred *); extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, - struct file *, size_t, int, int, + struct file *, loff_t *, size_t, int, int, struct cred *); #endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 88b09f18628..2a8e16c2235 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -173,11 +173,11 @@ typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); -typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, +typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *, struct pipe_inode_info *, size_t, int, int, struct cred *); typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, - struct file *, size_t, int, int, + struct file *, loff_t *, size_t, int, int, struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, int, unsigned int, void __user *); @@ -284,10 +284,10 @@ typedef struct vnodeops { rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) #define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \ rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr) -#define VOP_SPLICE_READ(vp,f,pipe,cnt,fl,iofl,cr,rv) \ - rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr) -#define VOP_SPLICE_WRITE(vp,f,pipe,cnt,fl,iofl,cr,rv) \ - rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_READ(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_WRITE(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) #define VOP_BMAP(vp,of,sz,rw,b,n,rv) \ rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n) #define VOP_OPEN(vp, cr, rv) \ -- cgit v1.2.3 From 70524490ee2ea1bbf6cee6c106597b3ac25a3fc2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 15:51:17 +0200 
Subject: [PATCH] splice: add support for sys_tee() Basically an in-kernel implementation of tee, which uses splice and the pipe buffers as an intelligent way to pass data around by reference. Where the user space tee consumes the input and produces a stdout and file output, this syscall merely duplicates the data inside a pipe to another pipe. No data is copied, the output just grabs a reference to the input pipe data. Signed-off-by: Jens Axboe --- fs/pipe.c | 7 +++ fs/splice.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index e984beb93a0..7fefb10db8d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -131,12 +131,19 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, return 0; } +static void anon_pipe_buf_get(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + static struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .map = anon_pipe_buf_map, .unmap = anon_pipe_buf_unmap, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, + .get = anon_pipe_buf_get, }; static ssize_t diff --git a/fs/splice.c b/fs/splice.c index 5d3eda64703..8d57e89924a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -125,12 +125,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, kunmap(buf->page); } +static void page_cache_pipe_buf_get(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + static struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = page_cache_pipe_buf_map, .unmap = page_cache_pipe_buf_unmap, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, + .get = page_cache_pipe_buf_get, }; /* @@ -963,3 +970,182 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, return error; } + +/* + * Link contents of ipipe to opipe. + */ +static int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, do_wakeup = 0, i; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by inode address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + if (ipipe->inode < opipe->inode) { + mutex_lock(&ipipe->inode->i_mutex); + mutex_lock(&opipe->inode->i_mutex); + } else { + mutex_lock(&opipe->inode->i_mutex); + mutex_lock(&ipipe->inode->i_mutex); + } + + for (i = 0;; i++) { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + if (ipipe->nrbufs - i) { + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); + + /* + * If we have room, fill this buffer + */ + if (opipe->nrbufs < PIPE_BUFFERS) { + int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + + if (obuf->len > len) + obuf->len = len; + + opipe->nrbufs++; + do_wakeup = 1; + ret += obuf->len; + len -= obuf->len; + + if (!len) + break; + if (opipe->nrbufs < PIPE_BUFFERS) + continue; + } + + /* + * We have input available, but no output room. + * If we already copied data, return that. 
+ */ + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + opipe->waiting_writers++; + pipe_wait(opipe); + opipe->waiting_writers--; + continue; + } + + /* + * No input buffers, do the usual checks for available + * writers and blocking and wait if necessary + */ + if (!ipipe->writers) + break; + if (!ipipe->waiting_writers) { + if (ret) + break; + } + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (waitqueue_active(&ipipe->wait)) + wake_up_interruptible_sync(&ipipe->wait); + kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + + pipe_wait(ipipe); + } + + mutex_unlock(&ipipe->inode->i_mutex); + mutex_unlock(&opipe->inode->i_mutex); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + } + + return ret; +} + +/* + * This is a tee(1) implementation that works on pipes. It doesn't copy + * any data, it simply references the 'in' pages on the 'out' pipe. + * The 'flags' used are the SPLICE_F_* variants, currently the only + * applicable one is SPLICE_F_NONBLOCK. + */ +static long do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; + struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + + /* + * Link ipipe to the two output pipes, consuming as we go along. + */ + if (ipipe && opipe) + return link_pipe(ipipe, opipe, len, flags); + + return -EINVAL; +} + +asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) +{ + struct file *in; + int error, fput_in; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fdin, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + int fput_out; + struct file *out = fget_light(fdout, &fput_out); + + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_tee(in, out, len, flags); + fput_light(out, fput_out); + } + } + fput_light(in, fput_in); + } + + return error; +} -- cgit v1.2.3 From 73ce8355c243a434524a34c05cc417dd0467996e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 11 Apr 2006 21:14:26 +0200 Subject: [fuse] fix deadlock between fuse_put_super() and request_end() A deadlock was possible, when the last reference to the superblock was held due to a background request containing a file reference. Releasing the file would release the vfsmount which in turn would release the superblock. Since sbput_sem is held during the fput() and fuse_put_super() tries to acquire this same semaphore, a deadlock results. The chosen soltuion is to get rid of sbput_sem, and instead use the spinlock to ensure the referenced inodes/file are released only once. Since the actual release may sleep, defer these outside the locked region, but using local variables instead of the structure members. This is a much more rubust solution. 
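The ordering described above, detach anything that needs a sleeping release while still holding the spinlock, then call fput()/iput() only after unlocking, is the heart of the fix. A minimal sketch of that pattern follows; it is illustrative only, not the fuse code itself, and "struct req" with its fields is a hypothetical stand-in:

#include <linux/fs.h>
#include <linux/spinlock.h>

/* Illustrative only: 'struct req' and its fields are hypothetical stand-ins
 * for a structure whose file/inode references must be dropped exactly once. */
struct req {
	spinlock_t lock;
	struct file *file;
	struct inode *inode;
};

static void req_finish(struct req *r)
{
	struct file *file;
	struct inode *inode;

	spin_lock(&r->lock);
	/* Detach under the lock so no concurrent path can release twice. */
	file = r->file;
	inode = r->inode;
	r->file = NULL;
	r->inode = NULL;
	spin_unlock(&r->lock);

	/* The releases may sleep, so they happen outside the spinlock. */
	if (file)
		fput(file);
	iput(inode);
}
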
Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 28 ++++++++++++++++------------ fs/fuse/fuse_i.h | 12 +++--------- fs/fuse/inode.c | 27 +++++++++++++++++---------- 3 files changed, 36 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6c740f86066..d4efb6223e2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -120,20 +120,14 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) } } -void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) +void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req) { - iput(req->inode); - iput(req->inode2); - if (req->file) - fput(req->file); - spin_lock(&fc->lock); - list_del(&req->bg_entry); + list_del_init(&req->bg_entry); if (fc->num_background == FUSE_MAX_BACKGROUND) { fc->blocked = 0; wake_up_all(&fc->blocked_waitq); } fc->num_background--; - spin_unlock(&fc->lock); } /* @@ -163,17 +157,27 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); fuse_put_request(fc, req); } else { + struct inode *inode = req->inode; + struct inode *inode2 = req->inode2; + struct file *file = req->file; void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; + req->inode = NULL; + req->inode2 = NULL; + req->file = NULL; + if (!list_empty(&req->bg_entry)) + fuse_remove_background(fc, req); spin_unlock(&fc->lock); - down_read(&fc->sbput_sem); - if (fc->mounted) - fuse_release_background(fc, req); - up_read(&fc->sbput_sem); + if (end) end(fc, req); else fuse_put_request(fc, req); + + if (file) + fput(file); + iput(inode); + iput(inode2); } } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 19c7185a754..ee9b8304251 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -255,15 +255,9 @@ struct fuse_conn { /** waitq for blocked connection */ wait_queue_head_t blocked_waitq; - /** RW semaphore for exclusion with fuse_put_super() */ - struct rw_semaphore sbput_sem; - /** The next unique request id */ u64 reqctr; - /** Mount is active */ - unsigned mounted; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -474,11 +468,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); void request_send_background(struct fuse_conn *fc, struct fuse_req *req); /** - * Release inodes and file associated with background request + * Remove request from the the background list */ -void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req); +void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req); -/* Abort all requests */ +/** Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); /** diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd34037b058..43a6fc0db8a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -204,17 +204,26 @@ static void fuse_put_super(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); - down_write(&fc->sbput_sem); - while (!list_empty(&fc->background)) - fuse_release_background(fc, - list_entry(fc->background.next, - struct fuse_req, bg_entry)); - spin_lock(&fc->lock); - fc->mounted = 0; fc->connected = 0; + while (!list_empty(&fc->background)) { + struct fuse_req *req = list_entry(fc->background.next, + struct fuse_req, bg_entry); + struct inode *inode = req->inode; + struct inode *inode2 = req->inode2; + + /* File would hold a reference to vfsmount */ + BUG_ON(req->file); + req->inode = NULL; + req->inode2 = NULL; + fuse_remove_background(fc, req); + + 
spin_unlock(&fc->lock); + iput(inode); + iput(inode2); + spin_lock(&fc->lock); + } spin_unlock(&fc->lock); - up_write(&fc->sbput_sem); /* Flush all readers on this fs */ kill_fasync(&fc->fasync, SIGIO, POLL_IN); wake_up_all(&fc->waitq); @@ -386,7 +395,6 @@ static struct fuse_conn *new_conn(void) INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); INIT_LIST_HEAD(&fc->background); - init_rwsem(&fc->sbput_sem); kobj_set_kset_s(fc, connections_subsys); kobject_init(&fc->kobj); atomic_set(&fc->num_waiting, 0); @@ -541,7 +549,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_free_req; sb->s_root = root_dentry; - fc->mounted = 1; fc->connected = 1; kobject_get(&fc->kobj); file->private_data = fc; -- cgit v1.2.3 From 9bc5dddad1294955e70eeb87325ba1505fb5fe2e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 11 Apr 2006 21:16:09 +0200 Subject: [fuse] Fix accounting the number of waiting requests Properly accounting the number of waiting requests was forgotten in "clean up request accounting" patch. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 25 +++++++++++++++++++------ fs/fuse/fuse_i.h | 3 +++ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d4efb6223e2..8538b298a6b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -92,30 +92,39 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) { struct fuse_req *req; sigset_t oldset; + int intr; int err; + atomic_inc(&fc->num_waiting); block_sigs(&oldset); - err = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); + intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); restore_sigs(&oldset); - if (err) - return ERR_PTR(-EINTR); + err = -EINTR; + if (intr) + goto out; req = fuse_request_alloc(); + err = -ENOMEM; if (!req) - return ERR_PTR(-ENOMEM); + goto out; - atomic_inc(&fc->num_waiting); fuse_request_init(req); req->in.h.uid = current->fsuid; req->in.h.gid = current->fsgid; req->in.h.pid = current->pid; + req->waiting = 1; return req; + + out: + atomic_dec(&fc->num_waiting); + return ERR_PTR(err); } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - atomic_dec(&fc->num_waiting); + if (req->waiting) + atomic_dec(&fc->num_waiting); fuse_request_free(req); } } @@ -281,6 +290,10 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) len_args(req->in.numargs, (struct fuse_arg *) req->in.args); list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; + if (!req->waiting) { + req->waiting = 1; + atomic_inc(&fc->num_waiting); + } wake_up(&fc->waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ee9b8304251..59661c481d9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -159,6 +159,9 @@ struct fuse_req { /** Data is being copied to/from the request */ unsigned locked:1; + /** Request is counted as "waiting" */ + unsigned waiting:1; + /** State of the request */ enum fuse_req_state state; -- cgit v1.2.3 From 4858cae4f0904681eab58a16891c22397618a2a2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 11 Apr 2006 21:16:38 +0200 Subject: [fuse] Don't init request twice Request is already initialized in fuse_request_alloc() so no need to do it again in fuse_get_req(). 
Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8538b298a6b..cc750c68fe7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -108,7 +108,6 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) if (!req) goto out; - fuse_request_init(req); req->in.h.uid = current->fsuid; req->in.h.gid = current->fsgid; req->in.h.pid = current->pid; -- cgit v1.2.3 From 56cf34ff0795692327234963dcdcc2cdeec2bb3d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 11 Apr 2006 21:16:51 +0200 Subject: [fuse] Direct I/O should not use fuse_reset_request It's cleaner to allocate a new request, otherwise the uid/gid/pid fields of the request won't be filled in. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e4f041a11bb..fc342cf7c2c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi + Copyright (C) 2001-2006 Miklos Szeredi This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -565,8 +565,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, buf += nres; if (nres != nbytes) break; - if (count) - fuse_reset_request(req); + if (count) { + fuse_put_request(fc, req); + req = fuse_get_req(fc); + if (IS_ERR(req)) + break; + } } fuse_put_request(fc, req); if (res > 0) { -- cgit v1.2.3 From c06511d12d720b23c8dffff23004f0a888698f20 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 14 Apr 2006 04:05:55 -0600 Subject: [PATCH] de_thread: Don't change our parents and ptrace flags. This is two distinct changes. - Not changing our real parents. - Not changing our ptrace parents. Not changing our real parents is trivially correct because both tasks have the same real parents as they are part of a thread group. Now that we demote the leader to a thread there is no longer any reason to change it's parentage. Not changing our ptrace parents is a user visible change if someone looks hard enough. I don't think user space applications will care or even notice. In the practical and I think common case a debugger will have attached to all of the threads using the same ptrace flags. From my quick skim of strace and gdb that appears to be the case. Which if true means debuggers will not notice a change. Before this point we have already generated a ptrace event in do_exit that reports the leaders pid has died so de_thread is visible to a debugger. Which means attempting to hide this case by copying flags around appears excessive. By not doing anything it avoids all of the weird locking issues between de_thread and ptrace attach, and removes one case from consideration for fixing the ptrace locking. This only addresses Oleg's first concern with ptrace_attach, that of the problems caused by reparenting. Oleg's second concern is essentially a race between ptrace_attach and release_task that causes an oops when we get to force_sig_specific. There is nothing special about de_thread with respect to that race. Signed-off-by: Eric W. 
Biederman Signed-off-by: Linus Torvalds --- fs/exec.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 3234a0c32d5..4121bb55973 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -665,9 +665,7 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(current)) { - struct task_struct *parent; struct dentry *proc_dentry1, *proc_dentry2; - unsigned long ptrace; /* * Wait for the thread group leader to be a zombie. @@ -704,22 +702,6 @@ static int de_thread(struct task_struct *tsk) * two threads with a switched PID, and release * the former thread group leader: */ - ptrace = leader->ptrace; - parent = leader->parent; - if (unlikely(ptrace) && unlikely(parent == current)) { - /* - * Joker was ptracing his own group leader, - * and now he wants to be his own parent! - * We can't have that. - */ - ptrace = 0; - } - - ptrace_unlink(current); - ptrace_unlink(leader); - remove_parent(current); - remove_parent(leader); - /* Become a process group leader with the old leader's pid. * Note: The old leader also uses thispid until release_task @@ -732,8 +714,6 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_SID, current->signal->session); list_add_tail(¤t->tasks, &init_task.tasks); - current->parent = current->real_parent = leader->real_parent; - leader->parent = leader->real_parent = child_reaper; current->group_leader = current; leader->group_leader = current; @@ -742,13 +722,6 @@ static int de_thread(struct task_struct *tsk) detach_pid(leader, PIDTYPE_SID); list_del_init(&leader->tasks); - add_parent(current); - add_parent(leader); - if (ptrace) { - current->ptrace = ptrace; - __ptrace_link(current, parent); - } - current->exit_signal = SIGCHLD; BUG_ON(leader->exit_state != EXIT_ZOMBIE); -- cgit v1.2.3 From 4508a7a734b111b8b7e39986237d84acb1168dd0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 20 Mar 2006 17:53:53 +1100 Subject: [PATCH] sysfs: Allow sysfs attribute files to be pollable It works like this: Open the file Read all the contents. Call poll requesting POLLERR or POLLPRI (so select/exceptfds works) When poll returns, close the file and go to top of loop. or lseek to start of file and go back to the 'read'. Events are signaled by an object manager calling sysfs_notify(kobj, dir, attr); If the dir is non-NULL, it is used to find a subdirectory which contains the attribute (presumably created by sysfs_create_group). This has a cost of one int per attribute, one wait_queuehead per kobject, one int per open file. The name "sysfs_notify" may be confused with the inotify functionality. Maybe it would be nice to support inotify for sysfs attributes as well? 
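A rough user-space sketch of the open/read/poll cycle described above may help; the md sync_action attribute (made pollable by this patch, as noted below) is used purely as an example of a notifying attribute, and error handling is kept minimal:

/* Sketch of the open/read/poll loop for a pollable sysfs attribute. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];

	for (;;) {
		int fd = open("/sys/block/md0/md/sync_action", O_RDONLY);
		if (fd < 0)
			return 1;

		ssize_t n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("sync_action: %s", buf);
		}

		/* Wait for a change; POLLERR|POLLPRI is what sysfs_poll()
		 * reports once sysfs_notify() has been called. */
		struct pollfd pfd = { .fd = fd, .events = POLLERR | POLLPRI };
		poll(&pfd, 1, -1);

		/* Close and re-open (or lseek to 0) before reading the new value. */
		close(fd);
	}
}

Any attribute whose owner calls sysfs_notify() can be consumed this way; attributes that never notify will simply sit in poll() until a timeout, which is why the comment below recommends one when in doubt.
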
This patch also uses sysfs_notify to allow /sys/block/md*/md/sync_action to be pollable Signed-off-by: Neil Brown Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 1 + fs/sysfs/file.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/sysfs/sysfs.h | 1 + 3 files changed, 78 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 6cfdc9a8777..610b5bdbe75 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,6 +43,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, memset(sd, 0, sizeof(*sd)); atomic_set(&sd->s_count, 1); + atomic_set(&sd->s_event, 0); INIT_LIST_HEAD(&sd->s_children); list_add(&sd->s_sibling, &parent_sd->s_children); sd->s_element = element; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index f1cb1ddde51..cf3786625bf 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -57,6 +58,7 @@ struct sysfs_buffer { struct sysfs_ops * ops; struct semaphore sem; int needs_read_fill; + int event; }; @@ -72,6 +74,7 @@ struct sysfs_buffer { */ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) { + struct sysfs_dirent * sd = dentry->d_fsdata; struct attribute * attr = to_attr(dentry); struct kobject * kobj = to_kobj(dentry->d_parent); struct sysfs_ops * ops = buffer->ops; @@ -83,6 +86,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer if (!buffer->page) return -ENOMEM; + buffer->event = atomic_read(&sd->s_event); count = ops->show(kobj,attr,buffer->page); buffer->needs_read_fill = 0; BUG_ON(count > (ssize_t)PAGE_SIZE); @@ -348,12 +352,84 @@ static int sysfs_release(struct inode * inode, struct file * filp) return 0; } +/* Sysfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, as simply seeking and reading + * again will not get new data, or reset the state of 'poll'. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Nether 'poll' or 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. 
+ */ +static unsigned int sysfs_poll(struct file *filp, poll_table *wait) +{ + struct sysfs_buffer * buffer = filp->private_data; + struct kobject * kobj = to_kobj(filp->f_dentry->d_parent); + struct sysfs_dirent * sd = filp->f_dentry->d_fsdata; + int res = 0; + + poll_wait(filp, &kobj->poll, wait); + + if (buffer->event != atomic_read(&sd->s_event)) { + res = POLLERR|POLLPRI; + buffer->needs_read_fill = 1; + } + + return res; +} + + +static struct dentry *step_down(struct dentry *dir, const char * name) +{ + struct dentry * de; + + if (dir == NULL || dir->d_inode == NULL) + return NULL; + + mutex_lock(&dir->d_inode->i_mutex); + de = lookup_one_len(name, dir, strlen(name)); + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + if (IS_ERR(de)) + return NULL; + if (de->d_inode == NULL) { + dput(de); + return NULL; + } + return de; +} + +void sysfs_notify(struct kobject * k, char *dir, char *attr) +{ + struct dentry *de = k->dentry; + if (de) + dget(de); + if (de && dir) + de = step_down(de, dir); + if (de && attr) + de = step_down(de, attr); + if (de) { + struct sysfs_dirent * sd = de->d_fsdata; + if (sd) + atomic_inc(&sd->s_event); + wake_up_interruptible(&k->poll); + dput(de); + } +} +EXPORT_SYMBOL_GPL(sysfs_notify); + const struct file_operations sysfs_file_operations = { .read = sysfs_read_file, .write = sysfs_write_file, .llseek = generic_file_llseek, .open = sysfs_open_file, .release = sysfs_release, + .poll = sysfs_poll, }; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 32958a7c50e..3651ffb5ec0 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,6 +11,7 @@ extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, extern int sysfs_add_file(struct dentry *, const struct attribute *, int); extern void sysfs_hash_and_remove(struct dentry * dir, const char * name); +extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name); extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); extern void sysfs_remove_subdir(struct dentry *); -- cgit v1.2.3 From d4d7e5dffc4844ef51fe11f497bd774c04413a00 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 24 Mar 2006 20:45:35 +0100 Subject: [PATCH] BLOCK: delay all uevents until partition table is scanned [BLOCK] delay all uevents until partition table is scanned Here we delay the annoucement of all block device events until the disk's partition table is scanned and all partition devices are already created and sysfs is populated. We have a bunch of old bugs for removable storage handling where we probe successfully for a filesystem on the raw disk, but at the same time the kernel recognizes a partition table and creates partition devices. Currently there is no sane way to tell if partitions will show up or not at the time the disk device is announced to userspace. With the delayed events we can simply skip any probe for a filesystem on the raw disk when we find already present partitions. 
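As a hedged illustration of what the delayed events make possible, a user-space prober can now check sysfs for already-created partition devices before deciding to probe the raw disk. The helper below is purely illustrative and assumes the conventional /sys/block layout, where partitions appear as subdirectories named after the disk plus a number:

/* Sketch: has the kernel already announced partitions for this disk? */
#include <dirent.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool disk_has_partitions(const char *disk)	/* e.g. "sda" */
{
	char path[256];
	snprintf(path, sizeof(path), "/sys/block/%s", disk);

	DIR *dir = opendir(path);
	if (!dir)
		return false;

	bool found = false;
	struct dirent *de;
	while ((de = readdir(dir)) != NULL) {
		/* partition devices show up as e.g. "sda1", "sda2", ... */
		if (strncmp(de->d_name, disk, strlen(disk)) == 0 &&
		    de->d_name[strlen(disk)] != '\0') {
			found = true;
			break;
		}
	}
	closedir(dir);
	return found;
}
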
Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- fs/partitions/check.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index af0cb4b9e78..f3b6af07172 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -331,7 +331,9 @@ void delete_partition(struct gendisk *disk, int part) devfs_remove("%s/part%d", disk->devfs_name, part); if (p->holder_dir) kobject_unregister(p->holder_dir); - kobject_unregister(&p->kobj); + kobject_uevent(&p->kobj, KOBJ_REMOVE); + kobject_del(&p->kobj); + kobject_put(&p->kobj); } void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) @@ -357,7 +359,10 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part); p->kobj.parent = &disk->kobj; p->kobj.ktype = &ktype_part; - kobject_register(&p->kobj); + kobject_init(&p->kobj); + kobject_add(&p->kobj); + if (!disk->part_uevent_suppress) + kobject_uevent(&p->kobj, KOBJ_ADD); partition_sysfs_add_subdir(p); disk->part[part-1] = p; } @@ -395,6 +400,8 @@ void register_disk(struct gendisk *disk) { struct block_device *bdev; char *s; + int i; + struct hd_struct *p; int err; strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN); @@ -406,13 +413,12 @@ void register_disk(struct gendisk *disk) return; disk_sysfs_symlinks(disk); disk_sysfs_add_subdirs(disk); - kobject_uevent(&disk->kobj, KOBJ_ADD); /* No minors to use for partitions */ if (disk->minors == 1) { if (disk->devfs_name[0] != '\0') devfs_add_disk(disk); - return; + goto exit; } /* always add handle for the whole disk */ @@ -420,16 +426,32 @@ void register_disk(struct gendisk *disk) /* No such device (e.g., media were just removed) */ if (!get_capacity(disk)) - return; + goto exit; bdev = bdget_disk(disk, 0); if (!bdev) - return; + goto exit; + /* scan partition table, but suppress uevents */ bdev->bd_invalidated = 1; - if (blkdev_get(bdev, FMODE_READ, 0) < 0) - return; + disk->part_uevent_suppress = 1; + err = blkdev_get(bdev, FMODE_READ, 0); + disk->part_uevent_suppress = 0; + if (err < 0) + goto exit; blkdev_put(bdev); + +exit: + /* announce disk after possible partitions are already created */ + kobject_uevent(&disk->kobj, KOBJ_ADD); + + /* announce possible partitions */ + for (i = 1; i < disk->minors; i++) { + p = disk->part[i-1]; + if (!p || !p->nr_sects) + continue; + kobject_uevent(&p->kobj, KOBJ_ADD); + } } int rescan_partitions(struct gendisk *disk, struct block_device *bdev) -- cgit v1.2.3 From 2436f039d26a91e5404974ee0cb789b17db46168 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 10 Apr 2006 00:17:20 -0700 Subject: [PATCH] Fix block device symlink name As noted further on the this file, some block devices have a / in their name, so fix the "block:..." symlink name the same as the /sys/block name. 
Signed-off-by: Stephen Rothwell Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/partitions/check.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index f3b6af07172..45ae7dd3c65 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -372,6 +372,7 @@ static char *make_block_name(struct gendisk *disk) char *name; static char *block_str = "block:"; int size; + char *s; size = strlen(block_str) + strlen(disk->disk_name) + 1; name = kmalloc(size, GFP_KERNEL); @@ -379,6 +380,10 @@ static char *make_block_name(struct gendisk *disk) return NULL; strcpy(name, block_str); strcat(name, disk->disk_name); + /* ewww... some of these buggers have / in name... */ + s = strchr(name, '/'); + if (s) + *s = '!'; return name; } -- cgit v1.2.3 From 75616cf9854b83eb83a968b1338ae0ee11c9673c Mon Sep 17 00:00:00 2001 From: "Ananiev, Leonid I" Date: Mon, 10 Apr 2006 22:54:38 -0700 Subject: [PATCH] ext3: Fix missed mutex unlock Missed unlock_super()call is added in error condition code path. Signed-off-by: Leonid Ananiev Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ext3/resize.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 14f5f6ea3e7..c5ffa852396 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -767,6 +767,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (input->group != sbi->s_groups_count) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); + unlock_super(sb); err = -EBUSY; goto exit_journal; } -- cgit v1.2.3 From 0a489cb3b6a7b277030cdbc97c2c65905db94536 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Apr 2006 13:02:48 -0700 Subject: x86: don't allow tail-calls in sys_ftruncate[64]() Gcc thinks it owns the incoming argument stack, but that's not true for "asmlinkage" functions, and it corrupts the caller-set-up argument stack when it pushes the third argument onto the stack. Which can result in %ebx getting corrupted in user space. Now, normally nobody sane would ever notice, since libc will save and restore %ebx anyway over the system call, but it's still wrong. I'd much rather have "asmlinkage" tell gcc directly that it doesn't own the stack, but no such attribute exists, so we're stuck with our hacky manual "prevent_tail_call()" macro once more (we've had the same issue before with sys_waitpid() and sys_wait4()). Thanks to Hans-Werner Hilse for reporting the issue and testing the fix. 
Signed-off-by: Linus Torvalds --- fs/open.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index c32c89d6d8d..8279c65d3be 100644 --- a/fs/open.c +++ b/fs/open.c @@ -331,7 +331,9 @@ out: asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { - return do_sys_ftruncate(fd, length, 1); + long ret = do_sys_ftruncate(fd, length, 1); + prevent_tail_call(ret); + return ret; } /* LFS versions of truncate are only needed on 32 bit machines */ @@ -343,7 +345,9 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length) asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { - return do_sys_ftruncate(fd, length, 0); + long ret = do_sys_ftruncate(fd, length, 0); + prevent_tail_call(ret); + return ret; } #endif -- cgit v1.2.3 From 385910f2b275a636238f70844f1b6da9fda6f2da Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Apr 2006 13:22:59 -0700 Subject: x86: be careful about tailcall breakage for sys_open[at] too Came up through a quick grep for other cases similar to the ftruncate() one in commit 0a489cb3b6a7b277030cdbc97c2c65905db94536. Also, add a comment, so that people who read the code understand why we do what looks like a no-op. (Again, this won't actually matter to any sane user, since libc will save and restore the register gcc stomps on, but it's still wrong to stomp on it) Signed-off-by: Linus Torvalds --- fs/open.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 8279c65d3be..53ec28c3677 100644 --- a/fs/open.c +++ b/fs/open.c @@ -332,6 +332,7 @@ out: asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { long ret = do_sys_ftruncate(fd, length, 1); + /* avoid REGPARM breakage on x86: */ prevent_tail_call(ret); return ret; } @@ -346,6 +347,7 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length) asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { long ret = do_sys_ftruncate(fd, length, 0); + /* avoid REGPARM breakage on x86: */ prevent_tail_call(ret); return ret; } @@ -1097,20 +1099,30 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) asmlinkage long sys_open(const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(AT_FDCWD, filename, flags, mode); + ret = do_sys_open(AT_FDCWD, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_open); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(dfd, filename, flags, mode); + ret = do_sys_open(dfd, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_openat); -- cgit v1.2.3 From 91ad66ef4469cb631ec0ccd131b07f16770773f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:55:10 +0200 Subject: [PATCH] splice: close i_size truncate races on read We need to check i_size after doing a blocking readpage. 
Signed-off-by: Jens Axboe --- fs/splice.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 8d57e89924a..7e858557472 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -145,8 +145,8 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, - int nr_pages, unsigned long offset, - unsigned long len, unsigned int flags) + int nr_pages, unsigned long len, + unsigned int offset, unsigned int flags) { int ret, do_wakeup, i; @@ -243,14 +243,16 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int offset, nr_pages; + unsigned int loff, offset, nr_pages; struct page *pages[PIPE_BUFFERS]; struct page *page; - pgoff_t index; + pgoff_t index, end_index; + loff_t isize; + size_t bytes; int i, error; index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + loff = offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) @@ -268,6 +270,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * Now fill in the holes: */ error = 0; + bytes = 0; for (i = 0; i < nr_pages; i++, index++) { find_page: /* @@ -336,13 +339,41 @@ readpage: goto find_page; break; } + + /* + * i_size must be checked after ->readpage(). + */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + break; + } + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); + if (bytes + loff > isize) { + page_cache_release(page); + break; + } + /* + * force quit after adding this page + */ + nr_pages = i; + } } fill_it: pages[i] = page; + bytes += PAGE_CACHE_SIZE - loff; + loff = 0; } if (i) - return move_to_pipe(pipe, pages, i, offset, len, flags); + return move_to_pipe(pipe, pages, i, bytes, offset, flags); return error; } -- cgit v1.2.3 From c4f895cbe1e95aab633207fb19c650b7c984c01a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:56:12 +0200 Subject: [PATCH] splice: cleanup the SPLICE_F_NONBLOCK handling - generic_file_splice_read() more readable and correct - Don't bail on page allocation with NONBLOCK set, just don't allow direct blocking on IO (eg lock_page). 
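From user space, the visible effect of this cleanup is roughly that SPLICE_F_NONBLOCK means "kick readahead but do not wait for in-flight page I/O", surfacing as EAGAIN. A small hedged sketch follows; note that a libc of this vintage may have no splice() wrapper, in which case syscall(2) with __NR_splice is needed instead, and the helper name here is made up:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

/* Sketch: try to move up to 64k from file_fd into a pipe without blocking
 * on page I/O. Assumes a libc splice() wrapper is available. */
static ssize_t try_splice(int file_fd, loff_t *off, int pipe_wr_fd)
{
	ssize_t n = splice(file_fd, off, pipe_wr_fd, NULL, 65536,
			   SPLICE_F_NONBLOCK);
	if (n < 0 && errno == EAGAIN)
		return 0;	/* nothing resident yet; try again later */
	return n;
}
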
Signed-off-by: Jens Axboe --- fs/splice.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 7e858557472..78cd264340f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -278,14 +278,6 @@ find_page: */ page = find_get_page(mapping, index); if (!page) { - /* - * If in nonblock mode then dont block on - * readpage (we've kicked readahead so there - * will be asynchronous progress): - */ - if (flags & SPLICE_F_NONBLOCK) - break; - /* * page didn't exist, allocate one */ @@ -307,6 +299,13 @@ find_page: * If the page isn't uptodate, we may need to start io on it */ if (!PageUptodate(page)) { + /* + * If in nonblock mode then dont block on waiting + * for an in-flight io page + */ + if (flags & SPLICE_F_NONBLOCK) + break; + lock_page(page); /* @@ -400,17 +399,20 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, while (len) { ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - if (ret <= 0) + if (ret < 0) break; + else if (!ret) { + if (spliced) + break; + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } *ppos += ret; len -= ret; spliced += ret; - - if (!(flags & SPLICE_F_NONBLOCK)) - continue; - ret = -EAGAIN; - break; } if (spliced) -- cgit v1.2.3 From 2a27250e6cf47ca1ea3bea0a55e4b7889c097627 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:56:40 +0200 Subject: [PATCH] tee: link_pipe() must be careful when dropping one of the pipe locks We need to ensure that we only drop a lock that is ordered last, to avoid ABBA deadlocks with competing processes. Signed-off-by: Jens Axboe --- fs/splice.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 78cd264340f..4f5e6b09fb2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1012,7 +1012,9 @@ static int link_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret = 0, do_wakeup = 0, i; + int ret, do_wakeup, i, ipipe_first; + + ret = do_wakeup = ipipe_first = 0; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1020,6 +1022,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, * could deadlock (one doing tee from A -> B, the other from B -> A). */ if (ipipe->inode < opipe->inode) { + ipipe_first = 1; mutex_lock(&ipipe->inode->i_mutex); mutex_lock(&opipe->inode->i_mutex); } else { @@ -1068,9 +1071,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, /* * We have input available, but no output room. - * If we already copied data, return that. + * If we already copied data, return that. If we + * need to drop the opipe lock, it must be ordered + * last to avoid deadlocks. */ - if (flags & SPLICE_F_NONBLOCK) { + if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { if (!ret) ret = -EAGAIN; break; @@ -1104,7 +1109,12 @@ static int link_pipe(struct pipe_inode_info *ipipe, if (ret) break; } - if (flags & SPLICE_F_NONBLOCK) { + /* + * pipe_wait() drops the ipipe mutex. To avoid deadlocks + * with another process, we can only safely do that if + * the ipipe lock is ordered last. + */ + if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { if (!ret) ret = -EAGAIN; break; -- cgit v1.2.3 From a4514ebd8e12c63c09ab02be518db545bd1d24af Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:57:05 +0200 Subject: [PATCH] splice: offset fixes - We need to adjust *ppos for writes as well. 
- Copy back modified offset value if one was passed in, similar to what sendfile does. Signed-off-by: Jens Axboe --- fs/splice.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 4f5e6b09fb2..27d6408ff49 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -720,22 +720,26 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ssize_t ret; ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); - - /* - * If file or inode is SYNC and we actually wrote some data, sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) - && ret > 0) { + if (ret > 0) { struct inode *inode = mapping->host; - int err; - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(mapping->host, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); + *ppos += ret; + + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + mutex_lock(&inode->i_mutex); + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + mutex_unlock(&inode->i_mutex); - if (err) - ret = err; + if (err) + ret = err; + } } return ret; @@ -937,6 +941,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *pipe; loff_t offset, *off; + long ret; pipe = in->f_dentry->d_inode->i_pipe; if (pipe) { @@ -951,7 +956,12 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &out->f_pos; - return do_splice_from(pipe, out, off, len, flags); + ret = do_splice_from(pipe, out, off, len, flags); + + if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; } pipe = out->f_dentry->d_inode->i_pipe; @@ -967,7 +977,12 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &in->f_pos; - return do_splice_to(in, off, pipe, len, flags); + ret = do_splice_to(in, off, pipe, len, flags); + + if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; } return -EINVAL; -- cgit v1.2.3 From 9e0267c26e237f84f608a68e579bf4eb89dad819 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:57:31 +0200 Subject: [PATCH] splice: fixup writeout path after ->map changes Since ->map() no longer locks the page, we need to adjust the handling of those pages (and stealing) a little. This now passes full regressions again. 
Signed-off-by: Jens Axboe --- fs/splice.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 27d6408ff49..22fac87e90b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -50,7 +50,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, struct page *page = buf->page; struct address_space *mapping = page_mapping(page); - WARN_ON(!PageLocked(page)); + lock_page(page); + WARN_ON(!PageUptodate(page)); /* @@ -65,8 +66,10 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, if (PagePrivate(page)) try_to_release_page(page, mapping_gfp_mask(mapping)); - if (!remove_mapping(mapping, page)) + if (!remove_mapping(mapping, page)) { + unlock_page(page); return 1; + } buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; return 0; @@ -507,14 +510,12 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, if (sd->flags & SPLICE_F_MOVE) { /* * If steal succeeds, buf->page is now pruned from the vm - * side (LRU and page cache) and we can reuse it. + * side (LRU and page cache) and we can reuse it. The page + * will also be looked on successful return. */ if (buf->ops->steal(info, buf)) goto find_page; - /* - * this will also set the page locked - */ page = buf->page; if (add_to_page_cache(page, mapping, index, gfp_mask)) goto find_page; @@ -523,15 +524,27 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, lru_cache_add(page); } else { find_page: - ret = -ENOMEM; - page = find_or_create_page(mapping, index, gfp_mask); - if (!page) - goto out_nomem; + page = find_lock_page(mapping, index); + if (!page) { + ret = -ENOMEM; + page = page_cache_alloc_cold(mapping); + if (unlikely(!page)) + goto out_nomem; + + /* + * This will also lock the page + */ + ret = add_to_page_cache_lru(page, mapping, index, + gfp_mask); + if (unlikely(ret)) + goto out; + } /* - * If the page is uptodate, it is also locked. If it isn't - * uptodate, we can mark it uptodate if we are filling the - * full page. Otherwise we need to read it in first... + * We get here with the page locked. If the page is also + * uptodate, we don't need to do more. If it isn't, we + * may need to bring it in if we are not going to overwrite + * the full page. */ if (!PageUptodate(page)) { if (sd->len < PAGE_CACHE_SIZE) { @@ -553,10 +566,8 @@ find_page: ret = -EIO; goto out; } - } else { - WARN_ON(!PageLocked(page)); + } else SetPageUptodate(page); - } } } @@ -585,10 +596,10 @@ find_page: mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { + if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) page_cache_release(page); - unlock_page(page); - } + + unlock_page(page); out_nomem: buf->ops->unmap(info, buf); return ret; -- cgit v1.2.3 From 5e85d4abe3f43bb5362f384bab0e20ef082ce0b5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 18 Apr 2006 22:20:16 -0700 Subject: [PATCH] task: Make task list manipulations RCU safe While we can currently walk through thread groups, process groups, and sessions with just the rcu_read_lock, this opens the door to walking the entire task list. We already have all of the other RCU guarantees so there is no cost in doing this, this should be enough so that proc can stop taking the tasklist lock during readdir. prev_task was killed because it has no users, and using it will miss new tasks when doing an rcu traversal. Signed-off-by: Eric W. 
Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 4121bb55973..3a79d97ac23 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -712,7 +712,7 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); - list_add_tail(¤t->tasks, &init_task.tasks); + list_add_tail_rcu(¤t->tasks, &init_task.tasks); current->group_leader = current; leader->group_leader = current; -- cgit v1.2.3 From dda27d1a55e185b0c5fd184b86ac26c66846f095 Mon Sep 17 00:00:00 2001 From: Arthur Othieno Date: Tue, 18 Apr 2006 22:20:57 -0700 Subject: [PATCH] hugetlbfs: add Kconfig help text In kernel bugzilla #6248 (http://bugzilla.kernel.org/show_bug.cgi?id=6248), Adrian Bunk notes that CONFIG_HUGETLBFS is missing Kconfig help text. Signed-off-by: Arthur Othieno Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 2524629dc83..f9b5842c8d2 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -842,6 +842,12 @@ config TMPFS config HUGETLBFS bool "HugeTLB file system support" depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN + help + hugetlbfs is a filesystem backing for HugeTLB pages, based on + ramfs. For architectures that support it, say Y here and read + for details. + + If unsure, say N. config HUGETLB_PAGE def_bool HUGETLBFS -- cgit v1.2.3 From ca99c1da080345e227cfb083c330a184d42e27f3 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Tue, 18 Apr 2006 22:21:46 -0700 Subject: [PATCH] Fix file lookup without ref There are places in the kernel where we look up files in fd tables and access the file structure without holding refereces to the file. So, we need special care to avoid the race between looking up files in the fd table and tearing down of the file in another CPU. Otherwise, one might see a NULL f_dentry or such torn down version of the file. This patch fixes those special places where such a race may happen. Signed-off-by: Dipankar Sarma Acked-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 9 +++++++-- fs/proc/base.c | 21 +++++++++++++++------ 2 files changed, 22 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index dda83d6cd48..efad798824d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2230,7 +2230,12 @@ void steal_locks(fl_owner_t from) lock_kernel(); j = 0; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structures, so + * we need to acquire ->file_lock. + */ + spin_lock(&files->file_lock); fdt = files_fdtable(files); for (;;) { unsigned long set; @@ -2248,7 +2253,7 @@ void steal_locks(fl_owner_t from) set >>= 1; } } - rcu_read_unlock(); + spin_unlock(&files->file_lock); unlock_kernel(); } EXPORT_SYMBOL(steal_locks); diff --git a/fs/proc/base.c b/fs/proc/base.c index a3a3eecef68..6cc77dc3f3f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -297,16 +297,20 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm files = get_files_struct(task); if (files) { - rcu_read_lock(); + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. 
+ */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (file) { *mnt = mntget(file->f_vfsmnt); *dentry = dget(file->f_dentry); - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); return 0; } - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); } return -ENOENT; @@ -1523,7 +1527,12 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, if (!files) goto out_unlock; inode->i_mode = S_IFLNK; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (!file) goto out_unlock2; @@ -1531,7 +1540,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, inode->i_mode |= S_IRUSR | S_IXUSR; if (file->f_mode & 2) inode->i_mode |= S_IWUSR | S_IXUSR; - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -1541,7 +1550,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, return NULL; out_unlock2: - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); out_unlock: iput(inode); -- cgit v1.2.3 From 95cf959b245832ad49bb333bf88f9805244b225d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Apr 2006 13:14:06 -0400 Subject: VFS: Fix another open intent Oops If the call to nfs_intent_set_file() fails to open a file in nfs4_proc_create(), we should return an error. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 47ece1dd3c6..d86c0db7b1e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1218,7 +1218,7 @@ out: return status; } -static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) +static int nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) { struct file *filp; @@ -1227,8 +1227,10 @@ static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, st struct nfs_open_context *ctx; ctx = (struct nfs_open_context *)filp->private_data; ctx->state = state; - } else - nfs4_close_state(state, nd->intent.open.flags); + return 0; + } + nfs4_close_state(state, nd->intent.open.flags); + return PTR_ERR(filp); } struct dentry * @@ -1835,7 +1837,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_setattr_update_inode(state->inode, sattr); } if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN)) - nfs4_intent_set_file(nd, dentry, state); + status = nfs4_intent_set_file(nd, dentry, state); else nfs4_close_state(state, flags); out: -- cgit v1.2.3 From e99170ff3b799a9fd43d538932a9231fac1de9d4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Apr 2006 13:21:42 -0400 Subject: NFS,SUNRPC: Fix compiler warnings if CONFIG_PROC_FS & CONFIG_SYSCTL are unset Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 8 +++----- fs/nfs/file.c | 5 ++--- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0f583cb16dd..3c72b0c0728 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -112,10 +112,9 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode */ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, 
unsigned long nr_segs) { - struct dentry *dentry = iocb->ki_filp->f_dentry; - dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", - dentry->d_name.name, (long long) pos, nr_segs); + iocb->ki_filp->f_dentry->d_name.name, + (long long) pos, nr_segs); return -EINVAL; } @@ -468,7 +467,6 @@ static const struct rpc_call_ops nfs_commit_direct_ops = { static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) { struct nfs_write_data *data = dreq->commit_data; - struct rpc_task *task = &data->task; data->inode = dreq->inode; data->cred = dreq->ctx->cred; @@ -489,7 +487,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ dreq->commit_data = NULL; - dprintk("NFS: %5u initiated commit call\n", task->tk_pid); + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); lock_kernel(); rpc_execute(&data->task); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f1df2c8d925..fade02c15e6 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -534,10 +534,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) */ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode * inode = filp->f_mapping->host; - dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", - inode->i_sb->s_id, inode->i_ino, + filp->f_dentry->d_inode->i_sb->s_id, + filp->f_dentry->d_inode->i_ino, fl->fl_type, fl->fl_flags); /* -- cgit v1.2.3 From ec535ce154f2eaad3d97f2f20a76a6d8bdac33e5 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 18 Apr 2006 13:21:50 -0400 Subject: NFS: make 2 functions static Signed-off-by: Adrian Bunk Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/lockd/svclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index d2b66bad7d5..3ef739120df 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -650,7 +650,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) svc_wake_up(block->b_daemon); } -void nlmsvc_grant_release(void *data) +static void nlmsvc_grant_release(void *data) { struct nlm_rqst *call = data; -- cgit v1.2.3 From b9d9506d944865876e67281a4e4269d823ce5381 Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Wed, 19 Apr 2006 13:06:20 -0400 Subject: NFS: nfs_show_stats; for_each_possible_cpu(), not NR_CPUS Convert a for-loop that explicitly references "NR_CPUS" into the potentially more efficient for_each_possible_cpu() construct. Signed-off-by: John Hawkes Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2f7656b911b..d0b991a9232 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -700,12 +700,9 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) /* * Display superblock I/O counters */ - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { struct nfs_iostats *stats; - if (!cpu_possible(cpu)) - continue; - preempt_disable(); stats = per_cpu_ptr(nfss->io_stats, cpu); -- cgit v1.2.3 From 7451c4f0ee53e36fd74168af8df75b28fd04a2aa Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 19 Apr 2006 13:06:37 -0400 Subject: NFS: remove needless check in nfs_opendir() Local variable res was initialized to 0 - no check needed here. 
Signed-off-by: Carsten Otte Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a23f3489416..cae74dd4c7f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -128,15 +128,14 @@ struct inode_operations nfs4_dir_inode_operations = { static int nfs_opendir(struct inode *inode, struct file *filp) { - int res = 0; + int res; dfprintk(VFS, "NFS: opendir(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); lock_kernel(); /* Call generic open code in order to cache credentials */ - if (!res) - res = nfs_open(inode, filp); + res = nfs_open(inode, filp); unlock_kernel(); return res; } -- cgit v1.2.3 From 82aa5d6183667aa2a5f3c61e390934b0273d2ad7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Apr 2006 13:05:48 +0200 Subject: [PATCH] splice: fix smaller sized splice reads Signed-off-by: Jens Axboe --- fs/splice.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 22fac87e90b..0559e7577a0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -275,6 +275,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = 0; bytes = 0; for (i = 0; i < nr_pages; i++, index++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min(len, PAGE_CACHE_SIZE - loff); find_page: /* * lookup the page for this index @@ -366,11 +375,13 @@ readpage: * force quit after adding this page */ nr_pages = i; + this_len = min(this_len, loff); } } fill_it: pages[i] = page; - bytes += PAGE_CACHE_SIZE - loff; + bytes += this_len; + len -= this_len; loff = 0; } -- cgit v1.2.3
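The tee(2)/splice(2) interface added and refined in the patches above can be exercised from user space along these lines. This is a sketch only: it assumes a kernel carrying these patches and a libc with tee()/splice() wrappers (on older systems raw syscall(2) calls are needed), and it must be run between two pipes, e.g. producer | ./pipetee | consumer.

/* Sketch: duplicate stdin's pipe contents to stdout with tee(2), then
 * drain the input into a file with splice(2). No data is copied by tee();
 * only page references are taken. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>

int main(void)
{
	int log_fd = open("tee.out", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (log_fd < 0)
		return 1;

	for (;;) {
		/* Duplicate whatever sits in the stdin pipe to the stdout pipe. */
		ssize_t copied = tee(STDIN_FILENO, STDOUT_FILENO, INT_MAX, 0);
		if (copied < 0)
			return 1;
		if (copied == 0)
			break;	/* writer closed the input pipe */

		/* Now consume the duplicated bytes from stdin into the file. */
		while (copied > 0) {
			ssize_t n = splice(STDIN_FILENO, NULL, log_fd, NULL,
					   copied, SPLICE_F_MOVE);
			if (n <= 0)
				return 1;
			copied -= n;
		}
	}
	return 0;
}
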