From 29e350944fdc2dfca102500790d8ad6d6ff4f69d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 2 Apr 2006 12:46:35 -0700 Subject: splice: add SPLICE_F_NONBLOCK flag It doesn't make the splice itself necessarily nonblocking (because the actual file descriptors that are spliced from/to may block unless they have the O_NONBLOCK flag set), but it makes the splice pipe operations nonblocking. Signed-off-by: Linus Torvalds --- fs/splice.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 7c2bbf18d7a..6081cf7d2d1 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -106,7 +106,7 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { static ssize_t move_to_pipe(struct inode *inode, struct page **pages, int nr_pages, unsigned long offset, - unsigned long len) + unsigned long len, unsigned int flags) { struct pipe_inode_info *info; int ret, do_wakeup, i; @@ -159,6 +159,12 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, break; } + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; @@ -191,7 +197,7 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, } static int __generic_file_splice_read(struct file *in, struct inode *pipe, - size_t len) + size_t len, unsigned int flags) { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; @@ -279,7 +285,7 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, * Now we splice them into the pipe.. */ splice_them: - return move_to_pipe(pipe, pages, i, offset, len); + return move_to_pipe(pipe, pages, i, offset, len, flags); } ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, @@ -291,7 +297,7 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, ret = 0; spliced = 0; while (len) { - ret = __generic_file_splice_read(in, pipe, len); + ret = __generic_file_splice_read(in, pipe, len, flags); if (ret <= 0) break; @@ -299,6 +305,11 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, in->f_pos += ret; len -= ret; spliced += ret; + + if (!(flags & SPLICE_F_NONBLOCK)) + continue; + ret = -EAGAIN; + break; } if (spliced) @@ -527,6 +538,12 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, break; } + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; -- cgit v1.2.3 From 53cd9ae886273d6c2b8ba4aa63d6cd6b1217b57f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:04:21 +0200 Subject: [PATCH] splice: fix shadow[] filling logic Clear the entire range, and don't increment pidx or we keep filling the same position again and again. Thanks to KAMEZAWA Hiroyuki. Signed-off-by: Jens Axboe --- fs/splice.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 6081cf7d2d1..a555d0a83fe 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -237,9 +237,9 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, * fill shadow[] with pages at the right locations, so we only * have to fill holes */ - memset(shadow, 0, i * sizeof(struct page *)); - for (j = 0, pidx = index; j < i; pidx++, j++) - shadow[pages[j]->index - pidx] = pages[j]; + memset(shadow, 0, nr_pages * sizeof(struct page *)); + for (j = 0; j < i; j++) + shadow[pages[j]->index - index] = pages[j]; /* * now fill in the holes -- cgit v1.2.3 From 4f6f0bd2ffa4e31c3524f5e65c84a29b6ab73307 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:04:46 +0200 Subject: [PATCH] splice: improve writeback and clean up page stealing By cleaning up the writeback logic (killing write_one_page() and the manual set_page_dirty()), we can get rid of ->stolen inside the pipe_buffer and just keep it local in pipe_to_file(). This also adds dirty page balancing logic and O_SYNC handling. Signed-off-by: Jens Axboe --- fs/splice.c | 64 +++++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 16 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index a555d0a83fe..07f4d863c2d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -22,7 +22,10 @@ #include #include #include +#include +#include #include +#include /* * Passed to the actors @@ -38,11 +41,15 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, struct pipe_buffer *buf) { struct page *page = buf->page; + struct address_space *mapping = page_mapping(page); WARN_ON(!PageLocked(page)); WARN_ON(!PageUptodate(page)); - if (!remove_mapping(page_mapping(page), page)) + if (PagePrivate(page)) + try_to_release_page(page, mapping_gfp_mask(mapping)); + + if (!remove_mapping(mapping, page)) return 1; if (PageLRU(page)) { @@ -55,7 +62,6 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, spin_unlock_irq(&zone->lru_lock); } - buf->stolen = 1; return 0; } @@ -64,7 +70,6 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info, { page_cache_release(buf->page); buf->page = NULL; - buf->stolen = 0; } static void *page_cache_pipe_buf_map(struct file *file, @@ -91,8 +96,7 @@ static void *page_cache_pipe_buf_map(struct file *file, static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) { - if (!buf->stolen) - unlock_page(buf->page); + unlock_page(buf->page); kunmap(buf->page); } @@ -319,7 +323,8 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, } /* - * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage(). + * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' + * using sendpage(). */ static int pipe_to_sendpage(struct pipe_inode_info *info, struct pipe_buffer *buf, struct splice_desc *sd) @@ -379,7 +384,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, struct page *page; pgoff_t index; char *src; - int ret; + int ret, stolen; /* * after this, page will be locked and unmapped @@ -390,6 +395,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; + stolen = 0; /* * reuse buf page, if SPLICE_F_MOVE is set @@ -399,6 +405,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, goto find_page; page = buf->page; + stolen = 1; if (add_to_page_cache_lru(page, mapping, index, mapping_gfp_mask(mapping))) goto find_page; @@ -443,10 +450,13 @@ find_page: } ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); - if (ret) + if (ret == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } else if (ret) goto out; - if (!buf->stolen) { + if (!stolen) { char *dst = kmap_atomic(page, KM_USER0); memcpy(dst + offset, src + buf->offset, sd->len); @@ -455,16 +465,18 @@ find_page: } ret = mapping->a_ops->commit_write(file, page, 0, sd->len); - if (ret < 0) + if (ret == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } else if (ret) goto out; - set_page_dirty(page); - ret = write_one_page(page, 0); + balance_dirty_pages_ratelimited(mapping); out: - if (ret < 0) - unlock_page(page); - if (!buf->stolen) + if (!stolen) { page_cache_release(page); + unlock_page(page); + } buf->ops->unmap(info, buf); return ret; } @@ -576,7 +588,27 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, ssize_t generic_file_splice_write(struct inode *inode, struct file *out, size_t len, unsigned int flags) { - return move_from_pipe(inode, out, len, flags, pipe_to_file); + struct address_space *mapping = out->f_mapping; + ssize_t ret = move_from_pipe(inode, out, len, flags, pipe_to_file); + + /* + * if file or inode is SYNC and we actually wrote some data, sync it + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) + && ret > 0) { + struct inode *inode = mapping->host; + int err; + + mutex_lock(&inode->i_mutex); + err = generic_osync_inode(mapping->host, mapping, + OSYNC_METADATA|OSYNC_DATA); + mutex_unlock(&inode->i_mutex); + + if (err) + ret = err; + } + + return ret; } ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, -- cgit v1.2.3 From 83f9135bddffded9f1716519b6c147bcf046c87e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:05:09 +0200 Subject: [PATCH] splice: add comments documenting more of the code Hopefully this will make Andrew a little more happy. Signed-off-by: Jens Axboe --- fs/splice.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 4 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 07f4d863c2d..34591924c78 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -37,6 +37,12 @@ struct splice_desc { loff_t pos; /* file position */ }; +/* + * Attempt to steal a page from a pipe buffer. This should perhaps go into + * a vm helper function, it's already simplified quite a bit by the + * addition of remove_mapping(). If success is returned, the caller may + * attempt to reuse this page for another destination. + */ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, struct pipe_buffer *buf) { @@ -108,6 +114,10 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { .steal = page_cache_pipe_buf_steal, }; +/* + * Pipe output worker. This sets up our pipe format with the page cache + * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). + */ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, int nr_pages, unsigned long offset, unsigned long len, unsigned int flags) @@ -292,6 +302,16 @@ splice_them: return move_to_pipe(pipe, pages, i, offset, len, flags); } +/** + * generic_file_splice_read - splice data from file to a pipe + * @in: file to splice from + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will read pages from given file and fill them into a pipe. + * + */ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, size_t len, unsigned int flags) { @@ -370,10 +390,12 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, * - Destination page does not exist, we can add the pipe page to * the page cache and avoid the copy. * - * For now we just do the slower thing and always copy pages over, it's - * easier than migrating pages from the pipe to the target file. For the - * case of doing file | file splicing, the migrate approach had some LRU - * nastiness... + * If asked to move pages to the output file (SPLICE_F_MOVE is set in + * sd->flags), we attempt to migrate pages from the pipe to the output + * file address space page cache. This is possible if no one else has + * the pipe page referenced outside of the pipe and page cache. If + * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create + * a new page in the output file page cache and fill/dirty that. */ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, struct splice_desc *sd) @@ -401,6 +423,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, * reuse buf page, if SPLICE_F_MOVE is set */ if (sd->flags & SPLICE_F_MOVE) { + /* + * If steal succeeds, buf->page is now pruned from the vm + * side (LRU and page cache) and we can reuse it. + */ if (buf->ops->steal(info, buf)) goto find_page; @@ -484,6 +510,11 @@ out: typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, struct splice_desc *); +/* + * Pipe input worker. Most of this logic works like a regular pipe, the + * key here is the 'actor' worker passed in that actually moves the data + * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. + */ static ssize_t move_from_pipe(struct inode *inode, struct file *out, size_t len, unsigned int flags, splice_actor *actor) @@ -585,6 +616,17 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, } +/** + * generic_file_splice_write - splice data from a pipe to a file + * @inode: pipe inode + * @out: file to write to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. + * + */ ssize_t generic_file_splice_write(struct inode *inode, struct file *out, size_t len, unsigned int flags) { @@ -611,6 +653,17 @@ ssize_t generic_file_splice_write(struct inode *inode, struct file *out, return ret; } +/** + * generic_splice_sendpage - splice data from a pipe to a socket + * @inode: pipe inode + * @out: socket to write to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will send @len bytes from the pipe to a network socket. No data copying + * is involved. + * + */ ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, size_t len, unsigned int flags) { @@ -620,6 +673,9 @@ ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, EXPORT_SYMBOL(generic_file_splice_write); EXPORT_SYMBOL(generic_file_splice_read); +/* + * Attempt to initiate a splice from pipe to file. + */ static long do_splice_from(struct inode *pipe, struct file *out, size_t len, unsigned int flags) { @@ -640,6 +696,9 @@ static long do_splice_from(struct inode *pipe, struct file *out, size_t len, return out->f_op->splice_write(pipe, out, len, flags); } +/* + * Attempt to initiate a splice from a file to a pipe. + */ static long do_splice_to(struct file *in, struct inode *pipe, size_t len, unsigned int flags) { @@ -668,6 +727,9 @@ static long do_splice_to(struct file *in, struct inode *pipe, size_t len, return in->f_op->splice_read(in, pipe, len, flags); } +/* + * Determine where to splice to/from. + */ static long do_splice(struct file *in, struct file *out, size_t len, unsigned int flags) { -- cgit v1.2.3 From b2b39fa478db6db89b7ccafb0649973845b0eb75 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:05:41 +0200 Subject: [PATCH] splice: add a SPLICE_F_MORE flag This lets userspace indicate whether more data will be coming in a subsequent splice call. Signed-off-by: Jens Axboe --- fs/splice.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 34591924c78..e8454455894 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -354,6 +354,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, unsigned int offset; ssize_t ret; void *ptr; + int more; /* * sub-optimal, but we are limited by the pipe ->map. we don't @@ -366,9 +367,9 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, return PTR_ERR(ptr); offset = pos & ~PAGE_CACHE_MASK; + more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos, - sd->len < sd->total_len); + ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more); buf->ops->unmap(info, buf); if (ret == sd->len) -- cgit v1.2.3 From 059a8f3734a66cb2c41731083b0cbf836721485b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:06:05 +0200 Subject: [PATCH] splice: export generic_splice_sendpage Forgot that one, thanks Jeff. Also move the other EXPORT_SYMBOL to right below the functions. Signed-off-by: Jens Axboe --- fs/splice.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index e8454455894..7c2abd4504d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -342,6 +342,8 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, return ret; } +EXPORT_SYMBOL(generic_file_splice_read); + /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using sendpage(). @@ -654,6 +656,8 @@ ssize_t generic_file_splice_write(struct inode *inode, struct file *out, return ret; } +EXPORT_SYMBOL(generic_file_splice_write); + /** * generic_splice_sendpage - splice data from a pipe to a socket * @inode: pipe inode @@ -671,8 +675,7 @@ ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); } -EXPORT_SYMBOL(generic_file_splice_write); -EXPORT_SYMBOL(generic_file_splice_read); +EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. -- cgit v1.2.3 From ad8d6f0a783ffa2ff9b0cf09910b889715772201 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:10:32 +0200 Subject: [PATCH] splice: page stealing needs to wait_on_page_writeback() Thanks to Andrew for the good explanation of why this is so. akpm writes: If a page is under writeback and we remove it from pagecache, it's still going to get written to disk. But the VFS no longer knows about that page, nor that this page is about to modify disk blocks. So there might be scenarios in which those blocks-which-are-about-to-be-written-to get reused for something else. When writeback completes, it'll scribble on those blocks. This won't happen in ext2/ext3-style filesystems in normal mode because the page has buffers and try_to_release_page() will fail. But ext2 in nobh mode doesn't attach buffers at all - it just sticks the page in a BIO, finds some new blocks, points the BIO at those blocks and lets it rip. While that write IO's in flight, someone could truncate the file. Truncate won't block on the writeout because the page isn't in pagecache any more. So truncate will the free the blocks from the file under the page's feet. Then something else can reallocate those blocks. Then write data to them. Now, the original write completes, corrupting the filesystem. Signed-off-by: Jens Axboe --- fs/splice.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 7c2abd4504d..b5fb2f3e3ac 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -52,6 +52,15 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, WARN_ON(!PageLocked(page)); WARN_ON(!PageUptodate(page)); + /* + * At least for ext2 with nobh option, we need to wait on writeback + * completing on this page, since we'll remove it from the pagecache. + * Otherwise truncate wont wait on the page, allowing the disk + * blocks to be reused by someone else before we actually wrote our + * data to them. fs corruption ensues. + */ + wait_on_page_writeback(page); + if (PagePrivate(page)) try_to_release_page(page, mapping_gfp_mask(mapping)); -- cgit v1.2.3 From 3e7ee3e7b36fa4e2d88d8fb0a2577be95fc4636d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:11:04 +0200 Subject: [PATCH] splice: fix page stealing LRU handling. Originally from Nick Piggin, just adapted to the newer branch. You can't check PageLRU without holding zone->lru_lock. The page release code can get away with it only because the page refcount is 0 at that point. Also, you can't reliably remove pages from the LRU unless the refcount is 0. Ever. Signed-off-by: Nick Piggin Signed-off-by: Jens Axboe --- fs/splice.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index b5fb2f3e3ac..bfa42a277bb 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -67,16 +67,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, if (!remove_mapping(mapping, page)) return 1; - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - - spin_lock_irq(&zone->lru_lock); - BUG_ON(!PageLRU(page)); - __ClearPageLRU(page); - del_page_from_lru(zone, page); - spin_unlock_irq(&zone->lru_lock); - } - + buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; return 0; } @@ -85,6 +76,7 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info, { page_cache_release(buf->page); buf->page = NULL; + buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU); } static void *page_cache_pipe_buf_map(struct file *file, @@ -414,11 +406,12 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, { struct file *file = sd->file; struct address_space *mapping = file->f_mapping; + gfp_t gfp_mask = mapping_gfp_mask(mapping); unsigned int offset; struct page *page; pgoff_t index; char *src; - int ret, stolen; + int ret; /* * after this, page will be locked and unmapped @@ -429,7 +422,6 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; - stolen = 0; /* * reuse buf page, if SPLICE_F_MOVE is set @@ -443,15 +435,15 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, goto find_page; page = buf->page; - stolen = 1; - if (add_to_page_cache_lru(page, mapping, index, - mapping_gfp_mask(mapping))) + if (add_to_page_cache(page, mapping, index, gfp_mask)) goto find_page; + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add(page); } else { find_page: ret = -ENOMEM; - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); + page = find_or_create_page(mapping, index, gfp_mask); if (!page) goto out; @@ -494,7 +486,7 @@ find_page: } else if (ret) goto out; - if (!stolen) { + if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { char *dst = kmap_atomic(page, KM_USER0); memcpy(dst + offset, src + buf->offset, sd->len); @@ -511,7 +503,7 @@ find_page: balance_dirty_pages_ratelimited(mapping); out: - if (!stolen) { + if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { page_cache_release(page); unlock_page(page); } -- cgit v1.2.3 From c7f21e4f5a3d4e378e4d453b2be209dcfd1bb964 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:01:01 +0200 Subject: [PATCH] splice: mark the io page as accessed We should do that, since we do the LRU manipulation ourselves now. Suggested by Nick Piggin. Signed-off-by: Jens Axboe --- fs/splice.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index bfa42a277bb..b450acdff39 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -501,6 +501,7 @@ find_page: } else if (ret) goto out; + mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { -- cgit v1.2.3 From 9aefe431f5a000884db7ae74ac208de814fe5913 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 10 Apr 2006 09:02:40 +0200 Subject: [PATCH] splice: potential !page dereference We can get to out: with a NULL page, which we probably don't want to be calling page_cache_release() on. Signed-off-by: Dave Jones Signed-off-by: Jens Axboe --- fs/splice.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index b450acdff39..26f5f7ecee5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -445,7 +445,7 @@ find_page: ret = -ENOMEM; page = find_or_create_page(mapping, index, gfp_mask); if (!page) - goto out; + goto out_nomem; /* * If the page is uptodate, it is also locked. If it isn't @@ -508,6 +508,7 @@ out: page_cache_release(page); unlock_page(page); } +out_nomem: buf->ops->unmap(info, buf); return ret; } -- cgit v1.2.3 From c0bd1f650bd06a43435808d44f1e9520ea806206 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:03:32 +0200 Subject: [PATCH] splice: only call wake_up_interruptible() when we really have to __wake_up_common() is pretty heavy in the kernel profiles, this brings it down to a more acceptable level. Signed-off-by: Jens Axboe --- fs/splice.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 26f5f7ecee5..9f796b1034d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -187,7 +187,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); do_wakeup = 0; @@ -201,7 +203,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, mutex_unlock(PIPE_MUTEX(*inode)); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); } @@ -600,7 +604,9 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); do_wakeup = 0; } @@ -611,7 +617,9 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, mutex_unlock(PIPE_MUTEX(*inode)); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } -- cgit v1.2.3 From 16c523ddabcce5d3d817f4a2491d628f84dfaaa1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:03:58 +0200 Subject: [PATCH] splice: cleanup __generic_file_splice_read() The whole shadow/pages logic got overly complex, and this simpler approach is actually faster in testing. Signed-off-by: Jens Axboe --- fs/splice.c | 59 ++++++++++------------------------------------------------- 1 file changed, 10 insertions(+), 49 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 9f796b1034d..8b5efcc906d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -220,10 +220,10 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; - struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS]; + struct page *pages[PIPE_BUFFERS]; struct page *page; - pgoff_t index, pidx; - int i, j; + pgoff_t index; + int i; index = in->f_pos >> PAGE_CACHE_SHIFT; offset = in->f_pos & ~PAGE_CACHE_MASK; @@ -237,42 +237,14 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, */ do_page_cache_readahead(mapping, in, index, nr_pages); - /* - * Get as many pages from the page cache as possible.. - * Start IO on the page cache entries we create (we - * can assume that any pre-existing ones we find have - * already had IO started on them). - */ - i = find_get_pages(mapping, index, nr_pages, pages); - - /* - * common case - we found all pages and they are contiguous, - * kick them off - */ - if (i && (pages[i - 1]->index == index + i - 1)) - goto splice_them; - - /* - * fill shadow[] with pages at the right locations, so we only - * have to fill holes - */ - memset(shadow, 0, nr_pages * sizeof(struct page *)); - for (j = 0; j < i; j++) - shadow[pages[j]->index - index] = pages[j]; - /* * now fill in the holes */ - for (i = 0, pidx = index; i < nr_pages; pidx++, i++) { - int error; - - if (shadow[i]) - continue; - + for (i = 0; i < nr_pages; i++, index++) { /* * no page there, look one up / create it */ - page = find_or_create_page(mapping, pidx, + page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); if (!page) break; @@ -280,31 +252,20 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, if (PageUptodate(page)) unlock_page(page); else { - error = mapping->a_ops->readpage(in, page); + int error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { page_cache_release(page); break; } } - shadow[i] = page; - } - - if (!i) { - for (i = 0; i < nr_pages; i++) { - if (shadow[i]) - page_cache_release(shadow[i]); - } - return 0; + pages[i] = page; } - memcpy(pages, shadow, i * sizeof(struct page *)); + if (i) + return move_to_pipe(pipe, pages, i, offset, len, flags); - /* - * Now we splice them into the pipe.. - */ -splice_them: - return move_to_pipe(pipe, pages, i, offset, len, flags); + return 0; } /** -- cgit v1.2.3 From 49d0b21be21efc07526d637e0ae935019667e532 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:04:41 +0200 Subject: [PATCH] splice: optimize the splice buffer mapping We don't really need to lock down the pages, just make sure they are uptodate. Signed-off-by: Jens Axboe --- fs/splice.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 8b5efcc906d..50c43a1e092 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -84,26 +84,43 @@ static void *page_cache_pipe_buf_map(struct file *file, struct pipe_buffer *buf) { struct page *page = buf->page; - - lock_page(page); + int err; if (!PageUptodate(page)) { - unlock_page(page); - return ERR_PTR(-EIO); - } + lock_page(page); + + /* + * Page got truncated/unhashed. This will cause a 0-byte + * splice, if this is the first page + */ + if (!page->mapping) { + err = -ENODATA; + goto error; + } + + /* + * uh oh, read-error from disk + */ + if (!PageUptodate(page)) { + err = -EIO; + goto error; + } - if (!page->mapping) { + /* + * page is ok afterall, fall through to mapping + */ unlock_page(page); - return ERR_PTR(-ENODATA); } - return kmap(buf->page); + return kmap(page); +error: + unlock_page(page); + return ERR_PTR(err); } static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) { - unlock_page(buf->page); kunmap(buf->page); } @@ -379,7 +396,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, int ret; /* - * after this, page will be locked and unmapped + * make sure the data in this buffer is uptodate */ src = buf->ops->map(file, info, buf); if (IS_ERR(src)) @@ -399,6 +416,9 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, if (buf->ops->steal(info, buf)) goto find_page; + /* + * this will also set the page locked + */ page = buf->page; if (add_to_page_cache(page, mapping, index, gfp_mask)) goto find_page; -- cgit v1.2.3 From 0b749ce3802428007a37870eb51ba3c0bdf90857 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:05:04 +0200 Subject: [PATCH] splice: be smarter about calling do_page_cache_readahead() We don't want to call into the read-ahead logic unless we are at the start of a page, _or_ we have multiple pages to read. Signed-off-by: Jens Axboe --- fs/splice.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 50c43a1e092..9bfd6af0cf4 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -250,9 +250,12 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, nr_pages = PIPE_BUFFERS; /* - * initiate read-ahead on this page range + * initiate read-ahead on this page range. however, don't call into + * read-ahead if this is a non-zero offset (we are likely doing small + * chunk splice and the page is already there) for a single page. */ - do_page_cache_readahead(mapping, in, index, nr_pages); + if (!offset || nr_pages > 1) + do_page_cache_readahead(mapping, in, index, nr_pages); /* * now fill in the holes -- cgit v1.2.3 From 3a326a2ce88e71d00ac0d133e314a3342a7709f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Apr 2006 15:18:35 +0200 Subject: [PATCH] introduce a "kernel-internal pipe object" abstraction separate out the 'internal pipe object' abstraction, and make it usable to splice. This cleans up and fixes several aspects of the internal splice APIs and the pipe code: - pipes: the allocation and freeing of pipe_inode_info is now more symmetric and more streamlined with existing kernel practices. - splice: small micro-optimization: less pointer dereferencing in splice methods Signed-off-by: Ingo Molnar Update XFS for the ->splice_read/->splice_write changes. Signed-off-by: Jens Axboe --- fs/splice.c | 122 +++++++++++++++++++++++++++++++----------------------------- 1 file changed, 63 insertions(+), 59 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 9bfd6af0cf4..ed91a62402e 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -136,34 +136,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ -static ssize_t move_to_pipe(struct inode *inode, struct page **pages, +static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, int nr_pages, unsigned long offset, unsigned long len, unsigned int flags) { - struct pipe_inode_info *info; int ret, do_wakeup, i; ret = 0; do_wakeup = 0; i = 0; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { int bufs; - if (!PIPE_READERS(*inode)) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - bufs = info->nrbufs; + bufs = pipe->nrbufs; if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1); - struct pipe_buffer *buf = info->bufs + newbuf; + int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pages[i++]; unsigned long this_len; @@ -175,7 +174,7 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, buf->offset = offset; buf->len = this_len; buf->ops = &page_cache_pipe_buf_ops; - info->nrbufs = ++bufs; + pipe->nrbufs = ++bufs; do_wakeup = 1; ret += this_len; @@ -205,25 +204,25 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, - POLL_IN); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } while (i < nr_pages) @@ -232,8 +231,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, return ret; } -static int __generic_file_splice_read(struct file *in, struct inode *pipe, - size_t len, unsigned int flags) +static int +__generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; @@ -298,7 +298,7 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, * Will read pages from given file and fill them into a pipe. * */ -ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, +ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { ssize_t spliced; @@ -306,6 +306,7 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, ret = 0; spliced = 0; + while (len) { ret = __generic_file_splice_read(in, pipe, len, flags); @@ -509,11 +510,10 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. */ -static ssize_t move_from_pipe(struct inode *inode, struct file *out, +static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, size_t len, unsigned int flags, splice_actor *actor) { - struct pipe_inode_info *info; int ret, do_wakeup, err; struct splice_desc sd; @@ -525,22 +525,22 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, sd.file = out; sd.pos = out->f_pos; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { - int bufs = info->nrbufs; + int bufs = pipe->nrbufs; if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; if (sd.len > sd.total_len) sd.len = sd.total_len; - err = actor(info, buf, &sd); + err = actor(pipe, buf, &sd); if (err) { if (!ret && err != -ENODATA) ret = err; @@ -553,10 +553,10 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, buf->len -= sd.len; if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); + ops->release(pipe, buf); curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1); - info->curbuf = curbuf; - info->nrbufs = --bufs; + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; do_wakeup = 1; } @@ -568,9 +568,9 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, if (bufs) continue; - if (!PIPE_WRITERS(*inode)) + if (!pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!pipe->waiting_writers) { if (ret) break; } @@ -589,22 +589,23 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); do_wakeup = 0; } - pipe_wait(inode); + pipe_wait(pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } mutex_lock(&out->f_mapping->host->i_mutex); @@ -616,7 +617,7 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, /** * generic_file_splice_write - splice data from a pipe to a file - * @inode: pipe inode + * @pipe: pipe info * @out: file to write to * @len: number of bytes to splice * @flags: splice modifier flags @@ -625,11 +626,14 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, * the given pipe inode to the given file. * */ -ssize_t generic_file_splice_write(struct inode *inode, struct file *out, - size_t len, unsigned int flags) +ssize_t +generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; - ssize_t ret = move_from_pipe(inode, out, len, flags, pipe_to_file); + ssize_t ret; + + ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); /* * if file or inode is SYNC and we actually wrote some data, sync it @@ -664,10 +668,10 @@ EXPORT_SYMBOL(generic_file_splice_write); * is involved. * */ -ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, +ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, size_t len, unsigned int flags) { - return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); + return move_from_pipe(pipe, out, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -675,8 +679,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. */ -static long do_splice_from(struct inode *pipe, struct file *out, size_t len, - unsigned int flags) +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + size_t len, unsigned int flags) { loff_t pos; int ret; @@ -698,8 +702,8 @@ static long do_splice_from(struct inode *pipe, struct file *out, size_t len, /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, struct inode *pipe, size_t len, - unsigned int flags) +static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { loff_t pos, isize, left; int ret; @@ -732,14 +736,14 @@ static long do_splice_to(struct file *in, struct inode *pipe, size_t len, static long do_splice(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct inode *pipe; + struct pipe_inode_info *pipe; - pipe = in->f_dentry->d_inode; - if (pipe->i_pipe) + pipe = in->f_dentry->d_inode->i_pipe; + if (pipe) return do_splice_from(pipe, out, len, flags); - pipe = out->f_dentry->d_inode; - if (pipe->i_pipe) + pipe = out->f_dentry->d_inode->i_pipe; + if (pipe) return do_splice_to(in, pipe, len, flags); return -EINVAL; -- cgit v1.2.3 From 529565dcb1581c9a1e3f6df1c1763ca3e0f0d512 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Apr 2006 15:18:58 +0200 Subject: [PATCH] splice: add optional input and output offsets add optional input and output offsets to sys_splice(), for seekable file descriptors: asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, int fd_out, loff_t __user *off_out, size_t len, unsigned int flags); semantics are straightforward: f_pos will be updated with the offset provided by user-space, before the splice transfer is about to begin. Providing a NULL offset pointer means the existing f_pos will be used (and updated in situ). Providing an offset for a pipe results in -ESPIPE. Providing an invalid offset pointer results in -EFAULT. Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/splice.c | 54 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 13 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index ed91a62402e..a5326127aad 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -680,7 +680,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t __user *off_out, size_t len, + unsigned int flags) { loff_t pos; int ret; @@ -691,7 +692,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (!(out->f_mode & FMODE_WRITE)) return -EBADF; + if (off_out && copy_from_user(&out->f_pos, off_out, sizeof(loff_t))) + return -EFAULT; + pos = out->f_pos; + ret = rw_verify_area(WRITE, out, &pos, len); if (unlikely(ret < 0)) return ret; @@ -702,8 +707,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +static long do_splice_to(struct file *in, loff_t __user *off_in, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { loff_t pos, isize, left; int ret; @@ -714,7 +720,11 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, if (!(in->f_mode & FMODE_READ)) return -EBADF; + if (off_in && copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + return -EFAULT; + pos = in->f_pos; + ret = rw_verify_area(READ, in, &pos, len); if (unlikely(ret < 0)) return ret; @@ -733,23 +743,39 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, /* * Determine where to splice to/from. */ -static long do_splice(struct file *in, struct file *out, size_t len, - unsigned int flags) +static long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) { struct pipe_inode_info *pipe; + if (off_out && out->f_op->llseek == no_llseek) + return -EINVAL; + if (off_in && in->f_op->llseek == no_llseek) + return -EINVAL; + pipe = in->f_dentry->d_inode->i_pipe; - if (pipe) - return do_splice_from(pipe, out, len, flags); + if (pipe) { + if (off_in) + return -ESPIPE; + + return do_splice_from(pipe, out, off_out, len, flags); + } pipe = out->f_dentry->d_inode->i_pipe; - if (pipe) - return do_splice_to(in, pipe, len, flags); + if (pipe) { + if (off_out) + return -ESPIPE; + + return do_splice_to(in, off_in, pipe, len, flags); + } return -EINVAL; } -asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) +asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, + int fd_out, loff_t __user *off_out, + size_t len, unsigned int flags) { long error; struct file *in, *out; @@ -759,13 +785,15 @@ asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) return 0; error = -EBADF; - in = fget_light(fdin, &fput_in); + in = fget_light(fd_in, &fput_in); if (in) { if (in->f_mode & FMODE_READ) { - out = fget_light(fdout, &fput_out); + out = fget_light(fd_out, &fput_out); if (out) { if (out->f_mode & FMODE_WRITE) - error = do_splice(in, out, len, flags); + error = do_splice(in, off_in, + out, off_out, + len, flags); fput_light(out, fput_out); } } -- cgit v1.2.3 From b92ce55893745e011edae70830b8bc863be881f9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:52:07 +0200 Subject: [PATCH] splice: add direct fd <-> fd splicing support It's more efficient for sendfile() emulation. Basically we cache an internal private pipe and just use that as the intermediate area for pages. Direct splicing is not available from sys_splice(), it is only meant to be used for sendfile() emulation. Additional patch from Ingo Molnar to avoid the PIPE_BUFFERS loop at exit for the normal fast path. Signed-off-by: Jens Axboe --- fs/splice.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 130 insertions(+), 18 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index a5326127aad..c47b561edac 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -680,8 +680,7 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t __user *off_out, size_t len, - unsigned int flags) + size_t len, unsigned int flags) { loff_t pos; int ret; @@ -692,9 +691,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (!(out->f_mode & FMODE_WRITE)) return -EBADF; - if (off_out && copy_from_user(&out->f_pos, off_out, sizeof(loff_t))) - return -EFAULT; - pos = out->f_pos; ret = rw_verify_area(WRITE, out, &pos, len); @@ -707,9 +703,8 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, loff_t __user *off_in, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { loff_t pos, isize, left; int ret; @@ -720,9 +715,6 @@ static long do_splice_to(struct file *in, loff_t __user *off_in, if (!(in->f_mode & FMODE_READ)) return -EBADF; - if (off_in && copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) - return -EFAULT; - pos = in->f_pos; ret = rw_verify_area(READ, in, &pos, len); @@ -740,6 +732,118 @@ static long do_splice_to(struct file *in, loff_t __user *off_in, return in->f_op->splice_read(in, pipe, len, flags); } +long do_splice_direct(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *pipe; + long ret, bytes; + umode_t i_mode; + int i; + + /* + * We require the input being a regular file, as we don't want to + * randomly drop data for eg socket -> socket splicing. Use the + * piped splicing for that! + */ + i_mode = in->f_dentry->d_inode->i_mode; + if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + return -EINVAL; + + /* + * neither in nor out is a pipe, setup an internal pipe attached to + * 'out' and transfer the wanted data from 'in' to 'out' through that + */ + pipe = current->splice_pipe; + if (!pipe) { + pipe = alloc_pipe_info(NULL); + if (!pipe) + return -ENOMEM; + + /* + * We don't have an immediate reader, but we'll read the stuff + * out of the pipe right after the move_to_pipe(). So set + * PIPE_READERS appropriately. + */ + pipe->readers = 1; + + current->splice_pipe = pipe; + } + + /* + * do the splice + */ + ret = 0; + bytes = 0; + + while (len) { + size_t read_len, max_read_len; + + /* + * Do at most PIPE_BUFFERS pages worth of transfer: + */ + max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); + + ret = do_splice_to(in, pipe, max_read_len, flags); + if (unlikely(ret < 0)) + goto out_release; + + read_len = ret; + + /* + * NOTE: nonblocking mode only applies to the input. We + * must not do the output in nonblocking mode as then we + * could get stuck data in the internal pipe: + */ + ret = do_splice_from(pipe, out, read_len, + flags & ~SPLICE_F_NONBLOCK); + if (unlikely(ret < 0)) + goto out_release; + + bytes += ret; + len -= ret; + + /* + * In nonblocking mode, if we got back a short read then + * that was due to either an IO error or due to the + * pagecache entry not being there. In the IO error case + * the _next_ splice attempt will produce a clean IO error + * return value (not a short read), so in both cases it's + * correct to break out of the loop here: + */ + if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) + break; + } + + pipe->nrbufs = pipe->curbuf = 0; + + return bytes; + +out_release: + /* + * If we did an incomplete transfer we must release + * the pipe buffers in question: + */ + for (i = 0; i < PIPE_BUFFERS; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + + if (buf->ops) { + buf->ops->release(pipe, buf); + buf->ops = NULL; + } + } + pipe->nrbufs = pipe->curbuf = 0; + + /* + * If we transferred some data, return the number of bytes: + */ + if (bytes > 0) + return bytes; + + return ret; +} + +EXPORT_SYMBOL(do_splice_direct); + /* * Determine where to splice to/from. */ @@ -749,25 +853,33 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *pipe; - if (off_out && out->f_op->llseek == no_llseek) - return -EINVAL; - if (off_in && in->f_op->llseek == no_llseek) - return -EINVAL; - pipe = in->f_dentry->d_inode->i_pipe; if (pipe) { if (off_in) return -ESPIPE; + if (off_out) { + if (out->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&out->f_pos, off_out, + sizeof(loff_t))) + return -EFAULT; + } - return do_splice_from(pipe, out, off_out, len, flags); + return do_splice_from(pipe, out, len, flags); } pipe = out->f_dentry->d_inode->i_pipe; if (pipe) { if (off_out) return -ESPIPE; + if (off_in) { + if (in->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + return -EFAULT; + } - return do_splice_to(in, off_in, pipe, len, flags); + return do_splice_to(in, pipe, len, flags); } return -EINVAL; -- cgit v1.2.3 From 7480a90435673b4c717b6caf1350ec577d5f1adf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:52:47 +0200 Subject: [PATCH] splice: speedup __generic_file_splice_read Using find_get_page() is a lot faster than find_or_create_page(). This gets splice a lot closer to sendfile() for fd -> socket transfers. Signed-off-by: Jens Axboe --- fs/splice.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 11 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index c47b561edac..e30743c2c06 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -240,7 +240,7 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, struct page *pages[PIPE_BUFFERS]; struct page *page; pgoff_t index; - int i; + int i, error; index = in->f_pos >> PAGE_CACHE_SHIFT; offset = in->f_pos & ~PAGE_CACHE_MASK; @@ -260,32 +260,84 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, /* * now fill in the holes */ + error = 0; for (i = 0; i < nr_pages; i++, index++) { +find_page: /* - * no page there, look one up / create it + * lookup the page for this index */ - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - if (!page) - break; + page = find_get_page(mapping, index); + if (!page) { + /* + * If in nonblock mode then dont block on + * readpage (we've kicked readahead so there + * will be asynchronous progress): + */ + if (flags & SPLICE_F_NONBLOCK) + break; + + /* + * page didn't exist, allocate one + */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; + + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_mask(mapping)); + if (unlikely(error)) { + page_cache_release(page); + break; + } - if (PageUptodate(page)) - unlock_page(page); - else { - int error = mapping->a_ops->readpage(in, page); + goto readpage; + } + + /* + * If the page isn't uptodate, we may need to start io on it + */ + if (!PageUptodate(page)) { + lock_page(page); + + /* + * page was truncated, stop here. if this isn't the + * first page, we'll just complete what we already + * added + */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + break; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } + +readpage: + /* + * need to read in the page + */ + error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { page_cache_release(page); + if (error == AOP_TRUNCATED_PAGE) + goto find_page; break; } } +fill_it: pages[i] = page; } if (i) return move_to_pipe(pipe, pages, i, offset, len, flags); - return 0; + return error; } /** -- cgit v1.2.3 From 6f767b0425f5902e4817648632230b512e81c963 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:53:56 +0200 Subject: [PATCH] splice: speedups and optimizations - Kill the local variables that cache ->nrbufs, they just take up space. - Only set do_wakeup for a real pipe. This is a big win for direct splicing. - Kill i_mutex lock around ->f_pos update, regular io paths don't do this either. Signed-off-by: Jens Axboe --- fs/splice.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index e30743c2c06..36bc262dfbd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -150,8 +150,6 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, mutex_lock(&pipe->inode->i_mutex); for (;;) { - int bufs; - if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) @@ -159,9 +157,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; } - bufs = pipe->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS - 1); + if (pipe->nrbufs < PIPE_BUFFERS) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pages[i++]; unsigned long this_len; @@ -174,8 +171,9 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, buf->offset = offset; buf->len = this_len; buf->ops = &page_cache_pipe_buf_ops; - pipe->nrbufs = ++bufs; - do_wakeup = 1; + pipe->nrbufs++; + if (pipe->inode) + do_wakeup = 1; ret += this_len; len -= this_len; @@ -184,7 +182,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; if (!len) break; - if (bufs < PIPE_BUFFERS) + if (pipe->nrbufs < PIPE_BUFFERS) continue; break; @@ -581,11 +579,8 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, mutex_lock(&pipe->inode->i_mutex); for (;;) { - int bufs = pipe->nrbufs; - - if (bufs) { - int curbuf = pipe->curbuf; - struct pipe_buffer *buf = pipe->bufs + curbuf; + if (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; @@ -606,10 +601,10 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1); - pipe->curbuf = curbuf; - pipe->nrbufs = --bufs; - do_wakeup = 1; + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->nrbufs--; + if (pipe->inode) + do_wakeup = 1; } sd.pos += sd.len; @@ -618,7 +613,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, break; } - if (bufs) + if (pipe->nrbufs) continue; if (!pipe->writers) break; @@ -660,9 +655,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_lock(&out->f_mapping->host->i_mutex); out->f_pos = sd.pos; - mutex_unlock(&out->f_mapping->host->i_mutex); return ret; } -- cgit v1.2.3 From 49570e9b29a3d78950b5eba6b73bdcca955f0877 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:56:09 +0200 Subject: [PATCH] splice: unlikely() optimizations Also corrects a few comments. Patch mainly from Ingo, changes by me. Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/splice.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 36bc262dfbd..77b026baff7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -346,7 +346,6 @@ fill_it: * @flags: splice modifier flags * * Will read pages from given file and fill them into a pipe. - * */ ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -690,7 +689,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, mutex_lock(&inode->i_mutex); err = generic_osync_inode(mapping->host, mapping, - OSYNC_METADATA|OSYNC_DATA); + OSYNC_METADATA|OSYNC_DATA); mutex_unlock(&inode->i_mutex); if (err) @@ -730,10 +729,10 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t pos; int ret; - if (!out->f_op || !out->f_op->splice_write) + if (unlikely(!out->f_op || !out->f_op->splice_write)) return -EINVAL; - if (!(out->f_mode & FMODE_WRITE)) + if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; pos = out->f_pos; @@ -754,10 +753,10 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, loff_t pos, isize, left; int ret; - if (!in->f_op || !in->f_op->splice_read) + if (unlikely(!in->f_op || !in->f_op->splice_read)) return -EINVAL; - if (!(in->f_mode & FMODE_READ)) + if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; pos = in->f_pos; @@ -771,7 +770,7 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, return 0; left = isize - in->f_pos; - if (left < len) + if (unlikely(left < len)) len = left; return in->f_op->splice_read(in, pipe, len, flags); @@ -799,7 +798,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, * 'out' and transfer the wanted data from 'in' to 'out' through that */ pipe = current->splice_pipe; - if (!pipe) { + if (unlikely(!pipe)) { pipe = alloc_pipe_info(NULL); if (!pipe) return -ENOMEM; -- cgit v1.2.3 From c2058e0611c2fb4a00e63db61ef3a638cf760a98 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:56:34 +0200 Subject: [PATCH] splice: add Ingo as addition copyright holder The comment is also somewhat out of date, correct that as well. Signed-off-by: Jens Axboe --- fs/splice.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 77b026baff7..48b4382cc11 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -9,11 +9,12 @@ * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by - * Jens to support splicing to files and fixing the initial implementation - * bugs. + * Jens to support splicing to files, network, direct splicing, etc and + * fixing lots of bugs. * - * Copyright (C) 2005 Jens Axboe - * Copyright (C) 2005 Linus Torvalds + * Copyright (C) 2005-2006 Jens Axboe + * Copyright (C) 2005-2006 Linus Torvalds + * Copyright (C) 2006 Ingo Molnar * */ #include -- cgit v1.2.3 From 73d62d83ec3627782ba6f55defc76f3ffbef46ee Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:57:21 +0200 Subject: [PATCH] splice: comment styles - capitalize consistently - end sentences in one way or another - update comment text to match the implementation Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/splice.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 48b4382cc11..e50a460239d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -92,7 +92,7 @@ static void *page_cache_pipe_buf_map(struct file *file, /* * Page got truncated/unhashed. This will cause a 0-byte - * splice, if this is the first page + * splice, if this is the first page. */ if (!page->mapping) { err = -ENODATA; @@ -100,7 +100,7 @@ static void *page_cache_pipe_buf_map(struct file *file, } /* - * uh oh, read-error from disk + * Uh oh, read-error from disk. */ if (!PageUptodate(page)) { err = -EIO; @@ -108,7 +108,7 @@ static void *page_cache_pipe_buf_map(struct file *file, } /* - * page is ok afterall, fall through to mapping + * Page is ok afterall, fall through to mapping. */ unlock_page(page); } @@ -249,7 +249,7 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, nr_pages = PIPE_BUFFERS; /* - * initiate read-ahead on this page range. however, don't call into + * Initiate read-ahead on this page range. however, don't call into * read-ahead if this is a non-zero offset (we are likely doing small * chunk splice and the page is already there) for a single page. */ @@ -257,7 +257,7 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, do_page_cache_readahead(mapping, in, index, nr_pages); /* - * now fill in the holes + * Now fill in the holes: */ error = 0; for (i = 0; i < nr_pages; i++, index++) { @@ -396,10 +396,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, int more; /* - * sub-optimal, but we are limited by the pipe ->map. we don't + * Sub-optimal, but we are limited by the pipe ->map. We don't * need a kmap'ed buffer here, we just want to make sure we * have the page pinned if the pipe page originates from the - * page cache + * page cache. */ ptr = buf->ops->map(file, info, buf); if (IS_ERR(ptr)) @@ -460,7 +460,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, offset = sd->pos & ~PAGE_CACHE_MASK; /* - * reuse buf page, if SPLICE_F_MOVE is set + * Reuse buf page, if SPLICE_F_MOVE is set. */ if (sd->flags & SPLICE_F_MOVE) { /* @@ -501,7 +501,7 @@ find_page: if (!PageUptodate(page)) { /* - * page got invalidated, repeat + * Page got invalidated, repeat. */ if (!page->mapping) { unlock_page(page); @@ -598,6 +598,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, ret += sd.len; buf->offset += sd.len; buf->len -= sd.len; + if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); @@ -681,7 +682,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); /* - * if file or inode is SYNC and we actually wrote some data, sync it + * If file or inode is SYNC and we actually wrote some data, sync it. */ if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) && ret > 0) { @@ -815,7 +816,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, } /* - * do the splice + * Do the splice. */ ret = 0; bytes = 0; -- cgit v1.2.3 From cbb7e577e732f576b9f399bc2600bdc0626c68dc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 14:57:50 +0200 Subject: [PATCH] splice: pass offset around for ->splice_read() and ->splice_write() We need not use ->f_pos as the offset for the file input/output. If the user passed an offset pointer in through sys_splice(), just use that and leave ->f_pos alone. Signed-off-by: Jens Axboe --- fs/splice.c | 86 +++++++++++++++++++++++++++++++------------------------------ 1 file changed, 44 insertions(+), 42 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index e50a460239d..5d3eda64703 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -231,8 +231,9 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, } static int -__generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +__generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; @@ -241,8 +242,8 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, pgoff_t index; int i, error; - index = in->f_pos >> PAGE_CACHE_SHIFT; - offset = in->f_pos & ~PAGE_CACHE_MASK; + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) @@ -348,8 +349,9 @@ fill_it: * * Will read pages from given file and fill them into a pipe. */ -ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { ssize_t spliced; int ret; @@ -358,12 +360,12 @@ ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, spliced = 0; while (len) { - ret = __generic_file_splice_read(in, pipe, len, flags); + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); if (ret <= 0) break; - in->f_pos += ret; + *ppos += ret; len -= ret; spliced += ret; @@ -561,7 +563,7 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. */ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags, + loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) { int ret, do_wakeup, err; @@ -573,7 +575,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, sd.total_len = len; sd.flags = flags; sd.file = out; - sd.pos = out->f_pos; + sd.pos = *ppos; if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); @@ -656,9 +658,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - out->f_pos = sd.pos; return ret; - } /** @@ -674,12 +674,12 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, */ ssize_t generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; ssize_t ret; - ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); + ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); /* * If file or inode is SYNC and we actually wrote some data, sync it. @@ -715,9 +715,9 @@ EXPORT_SYMBOL(generic_file_splice_write); * */ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { - return move_from_pipe(pipe, out, len, flags, pipe_to_sendpage); + return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -726,9 +726,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { - loff_t pos; int ret; if (unlikely(!out->f_op || !out->f_op->splice_write)) @@ -737,22 +736,21 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; - pos = out->f_pos; - - ret = rw_verify_area(WRITE, out, &pos, len); + ret = rw_verify_area(WRITE, out, ppos, len); if (unlikely(ret < 0)) return ret; - return out->f_op->splice_write(pipe, out, len, flags); + return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { - loff_t pos, isize, left; + loff_t isize, left; int ret; if (unlikely(!in->f_op || !in->f_op->splice_read)) @@ -761,28 +759,27 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; - pos = in->f_pos; - - ret = rw_verify_area(READ, in, &pos, len); + ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; isize = i_size_read(in->f_mapping->host); - if (unlikely(in->f_pos >= isize)) + if (unlikely(*ppos >= isize)) return 0; - left = isize - in->f_pos; + left = isize - *ppos; if (unlikely(left < len)) len = left; - return in->f_op->splice_read(in, pipe, len, flags); + return in->f_op->splice_read(in, ppos, pipe, len, flags); } -long do_splice_direct(struct file *in, struct file *out, size_t len, - unsigned int flags) +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + size_t len, unsigned int flags) { struct pipe_inode_info *pipe; long ret, bytes; + loff_t out_off; umode_t i_mode; int i; @@ -820,6 +817,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, */ ret = 0; bytes = 0; + out_off = 0; while (len) { size_t read_len, max_read_len; @@ -829,7 +827,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, */ max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); - ret = do_splice_to(in, pipe, max_read_len, flags); + ret = do_splice_to(in, ppos, pipe, max_read_len, flags); if (unlikely(ret < 0)) goto out_release; @@ -840,7 +838,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ - ret = do_splice_from(pipe, out, read_len, + ret = do_splice_from(pipe, out, &out_off, read_len, flags & ~SPLICE_F_NONBLOCK); if (unlikely(ret < 0)) goto out_release; @@ -898,6 +896,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, size_t len, unsigned int flags) { struct pipe_inode_info *pipe; + loff_t offset, *off; pipe = in->f_dentry->d_inode->i_pipe; if (pipe) { @@ -906,12 +905,13 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_out) { if (out->f_op->llseek == no_llseek) return -EINVAL; - if (copy_from_user(&out->f_pos, off_out, - sizeof(loff_t))) + if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; - } + off = &offset; + } else + off = &out->f_pos; - return do_splice_from(pipe, out, len, flags); + return do_splice_from(pipe, out, off, len, flags); } pipe = out->f_dentry->d_inode->i_pipe; @@ -921,11 +921,13 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_in) { if (in->f_op->llseek == no_llseek) return -EINVAL; - if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; - } + off = &offset; + } else + off = &in->f_pos; - return do_splice_to(in, pipe, len, flags); + return do_splice_to(in, off, pipe, len, flags); } return -EINVAL; -- cgit v1.2.3 From 70524490ee2ea1bbf6cee6c106597b3ac25a3fc2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 15:51:17 +0200 Subject: [PATCH] splice: add support for sys_tee() Basically an in-kernel implementation of tee, which uses splice and the pipe buffers as an intelligent way to pass data around by reference. Where the user space tee consumes the input and produces a stdout and file output, this syscall merely duplicates the data inside a pipe to another pipe. No data is copied, the output just grabs a reference to the input pipe data. Signed-off-by: Jens Axboe --- fs/splice.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 5d3eda64703..8d57e89924a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -125,12 +125,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, kunmap(buf->page); } +static void page_cache_pipe_buf_get(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + static struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = page_cache_pipe_buf_map, .unmap = page_cache_pipe_buf_unmap, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, + .get = page_cache_pipe_buf_get, }; /* @@ -963,3 +970,182 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, return error; } + +/* + * Link contents of ipipe to opipe. + */ +static int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, do_wakeup = 0, i; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by inode address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + if (ipipe->inode < opipe->inode) { + mutex_lock(&ipipe->inode->i_mutex); + mutex_lock(&opipe->inode->i_mutex); + } else { + mutex_lock(&opipe->inode->i_mutex); + mutex_lock(&ipipe->inode->i_mutex); + } + + for (i = 0;; i++) { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + if (ipipe->nrbufs - i) { + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); + + /* + * If we have room, fill this buffer + */ + if (opipe->nrbufs < PIPE_BUFFERS) { + int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + + if (obuf->len > len) + obuf->len = len; + + opipe->nrbufs++; + do_wakeup = 1; + ret += obuf->len; + len -= obuf->len; + + if (!len) + break; + if (opipe->nrbufs < PIPE_BUFFERS) + continue; + } + + /* + * We have input available, but no output room. + * If we already copied data, return that. + */ + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + opipe->waiting_writers++; + pipe_wait(opipe); + opipe->waiting_writers--; + continue; + } + + /* + * No input buffers, do the usual checks for available + * writers and blocking and wait if necessary + */ + if (!ipipe->writers) + break; + if (!ipipe->waiting_writers) { + if (ret) + break; + } + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (waitqueue_active(&ipipe->wait)) + wake_up_interruptible_sync(&ipipe->wait); + kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + + pipe_wait(ipipe); + } + + mutex_unlock(&ipipe->inode->i_mutex); + mutex_unlock(&opipe->inode->i_mutex); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + } + + return ret; +} + +/* + * This is a tee(1) implementation that works on pipes. It doesn't copy + * any data, it simply references the 'in' pages on the 'out' pipe. + * The 'flags' used are the SPLICE_F_* variants, currently the only + * applicable one is SPLICE_F_NONBLOCK. + */ +static long do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; + struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + + /* + * Link ipipe to the two output pipes, consuming as we go along. + */ + if (ipipe && opipe) + return link_pipe(ipipe, opipe, len, flags); + + return -EINVAL; +} + +asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) +{ + struct file *in; + int error, fput_in; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fdin, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + int fput_out; + struct file *out = fget_light(fdout, &fput_out); + + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_tee(in, out, len, flags); + fput_light(out, fput_out); + } + } + fput_light(in, fput_in); + } + + return error; +} -- cgit v1.2.3 From 91ad66ef4469cb631ec0ccd131b07f16770773f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:55:10 +0200 Subject: [PATCH] splice: close i_size truncate races on read We need to check i_size after doing a blocking readpage. Signed-off-by: Jens Axboe --- fs/splice.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 8d57e89924a..7e858557472 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -145,8 +145,8 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, - int nr_pages, unsigned long offset, - unsigned long len, unsigned int flags) + int nr_pages, unsigned long len, + unsigned int offset, unsigned int flags) { int ret, do_wakeup, i; @@ -243,14 +243,16 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int offset, nr_pages; + unsigned int loff, offset, nr_pages; struct page *pages[PIPE_BUFFERS]; struct page *page; - pgoff_t index; + pgoff_t index, end_index; + loff_t isize; + size_t bytes; int i, error; index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + loff = offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) @@ -268,6 +270,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * Now fill in the holes: */ error = 0; + bytes = 0; for (i = 0; i < nr_pages; i++, index++) { find_page: /* @@ -336,13 +339,41 @@ readpage: goto find_page; break; } + + /* + * i_size must be checked after ->readpage(). + */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + break; + } + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); + if (bytes + loff > isize) { + page_cache_release(page); + break; + } + /* + * force quit after adding this page + */ + nr_pages = i; + } } fill_it: pages[i] = page; + bytes += PAGE_CACHE_SIZE - loff; + loff = 0; } if (i) - return move_to_pipe(pipe, pages, i, offset, len, flags); + return move_to_pipe(pipe, pages, i, bytes, offset, flags); return error; } -- cgit v1.2.3 From c4f895cbe1e95aab633207fb19c650b7c984c01a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:56:12 +0200 Subject: [PATCH] splice: cleanup the SPLICE_F_NONBLOCK handling - generic_file_splice_read() more readable and correct - Don't bail on page allocation with NONBLOCK set, just don't allow direct blocking on IO (eg lock_page). Signed-off-by: Jens Axboe --- fs/splice.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 7e858557472..78cd264340f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -278,14 +278,6 @@ find_page: */ page = find_get_page(mapping, index); if (!page) { - /* - * If in nonblock mode then dont block on - * readpage (we've kicked readahead so there - * will be asynchronous progress): - */ - if (flags & SPLICE_F_NONBLOCK) - break; - /* * page didn't exist, allocate one */ @@ -307,6 +299,13 @@ find_page: * If the page isn't uptodate, we may need to start io on it */ if (!PageUptodate(page)) { + /* + * If in nonblock mode then dont block on waiting + * for an in-flight io page + */ + if (flags & SPLICE_F_NONBLOCK) + break; + lock_page(page); /* @@ -400,17 +399,20 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, while (len) { ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - if (ret <= 0) + if (ret < 0) break; + else if (!ret) { + if (spliced) + break; + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } *ppos += ret; len -= ret; spliced += ret; - - if (!(flags & SPLICE_F_NONBLOCK)) - continue; - ret = -EAGAIN; - break; } if (spliced) -- cgit v1.2.3 From 2a27250e6cf47ca1ea3bea0a55e4b7889c097627 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:56:40 +0200 Subject: [PATCH] tee: link_pipe() must be careful when dropping one of the pipe locks We need to ensure that we only drop a lock that is ordered last, to avoid ABBA deadlocks with competing processes. Signed-off-by: Jens Axboe --- fs/splice.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 78cd264340f..4f5e6b09fb2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1012,7 +1012,9 @@ static int link_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret = 0, do_wakeup = 0, i; + int ret, do_wakeup, i, ipipe_first; + + ret = do_wakeup = ipipe_first = 0; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1020,6 +1022,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, * could deadlock (one doing tee from A -> B, the other from B -> A). */ if (ipipe->inode < opipe->inode) { + ipipe_first = 1; mutex_lock(&ipipe->inode->i_mutex); mutex_lock(&opipe->inode->i_mutex); } else { @@ -1068,9 +1071,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, /* * We have input available, but no output room. - * If we already copied data, return that. + * If we already copied data, return that. If we + * need to drop the opipe lock, it must be ordered + * last to avoid deadlocks. */ - if (flags & SPLICE_F_NONBLOCK) { + if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { if (!ret) ret = -EAGAIN; break; @@ -1104,7 +1109,12 @@ static int link_pipe(struct pipe_inode_info *ipipe, if (ret) break; } - if (flags & SPLICE_F_NONBLOCK) { + /* + * pipe_wait() drops the ipipe mutex. To avoid deadlocks + * with another process, we can only safely do that if + * the ipipe lock is ordered last. + */ + if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { if (!ret) ret = -EAGAIN; break; -- cgit v1.2.3 From a4514ebd8e12c63c09ab02be518db545bd1d24af Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:57:05 +0200 Subject: [PATCH] splice: offset fixes - We need to adjust *ppos for writes as well. - Copy back modified offset value if one was passed in, similar to what sendfile does. Signed-off-by: Jens Axboe --- fs/splice.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 4f5e6b09fb2..27d6408ff49 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -720,22 +720,26 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ssize_t ret; ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); - - /* - * If file or inode is SYNC and we actually wrote some data, sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) - && ret > 0) { + if (ret > 0) { struct inode *inode = mapping->host; - int err; - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(mapping->host, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); + *ppos += ret; + + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + mutex_lock(&inode->i_mutex); + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + mutex_unlock(&inode->i_mutex); - if (err) - ret = err; + if (err) + ret = err; + } } return ret; @@ -937,6 +941,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *pipe; loff_t offset, *off; + long ret; pipe = in->f_dentry->d_inode->i_pipe; if (pipe) { @@ -951,7 +956,12 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &out->f_pos; - return do_splice_from(pipe, out, off, len, flags); + ret = do_splice_from(pipe, out, off, len, flags); + + if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; } pipe = out->f_dentry->d_inode->i_pipe; @@ -967,7 +977,12 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &in->f_pos; - return do_splice_to(in, off, pipe, len, flags); + ret = do_splice_to(in, off, pipe, len, flags); + + if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; } return -EINVAL; -- cgit v1.2.3 From 9e0267c26e237f84f608a68e579bf4eb89dad819 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:57:31 +0200 Subject: [PATCH] splice: fixup writeout path after ->map changes Since ->map() no longer locks the page, we need to adjust the handling of those pages (and stealing) a little. This now passes full regressions again. Signed-off-by: Jens Axboe --- fs/splice.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 27d6408ff49..22fac87e90b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -50,7 +50,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, struct page *page = buf->page; struct address_space *mapping = page_mapping(page); - WARN_ON(!PageLocked(page)); + lock_page(page); + WARN_ON(!PageUptodate(page)); /* @@ -65,8 +66,10 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, if (PagePrivate(page)) try_to_release_page(page, mapping_gfp_mask(mapping)); - if (!remove_mapping(mapping, page)) + if (!remove_mapping(mapping, page)) { + unlock_page(page); return 1; + } buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; return 0; @@ -507,14 +510,12 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, if (sd->flags & SPLICE_F_MOVE) { /* * If steal succeeds, buf->page is now pruned from the vm - * side (LRU and page cache) and we can reuse it. + * side (LRU and page cache) and we can reuse it. The page + * will also be looked on successful return. */ if (buf->ops->steal(info, buf)) goto find_page; - /* - * this will also set the page locked - */ page = buf->page; if (add_to_page_cache(page, mapping, index, gfp_mask)) goto find_page; @@ -523,15 +524,27 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, lru_cache_add(page); } else { find_page: - ret = -ENOMEM; - page = find_or_create_page(mapping, index, gfp_mask); - if (!page) - goto out_nomem; + page = find_lock_page(mapping, index); + if (!page) { + ret = -ENOMEM; + page = page_cache_alloc_cold(mapping); + if (unlikely(!page)) + goto out_nomem; + + /* + * This will also lock the page + */ + ret = add_to_page_cache_lru(page, mapping, index, + gfp_mask); + if (unlikely(ret)) + goto out; + } /* - * If the page is uptodate, it is also locked. If it isn't - * uptodate, we can mark it uptodate if we are filling the - * full page. Otherwise we need to read it in first... + * We get here with the page locked. If the page is also + * uptodate, we don't need to do more. If it isn't, we + * may need to bring it in if we are not going to overwrite + * the full page. */ if (!PageUptodate(page)) { if (sd->len < PAGE_CACHE_SIZE) { @@ -553,10 +566,8 @@ find_page: ret = -EIO; goto out; } - } else { - WARN_ON(!PageLocked(page)); + } else SetPageUptodate(page); - } } } @@ -585,10 +596,10 @@ find_page: mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { + if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) page_cache_release(page); - unlock_page(page); - } + + unlock_page(page); out_nomem: buf->ops->unmap(info, buf); return ret; -- cgit v1.2.3 From 82aa5d6183667aa2a5f3c61e390934b0273d2ad7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Apr 2006 13:05:48 +0200 Subject: [PATCH] splice: fix smaller sized splice reads Signed-off-by: Jens Axboe --- fs/splice.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 22fac87e90b..0559e7577a0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -275,6 +275,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = 0; bytes = 0; for (i = 0; i < nr_pages; i++, index++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min(len, PAGE_CACHE_SIZE - loff); find_page: /* * lookup the page for this index @@ -366,11 +375,13 @@ readpage: * force quit after adding this page */ nr_pages = i; + this_len = min(this_len, loff); } } fill_it: pages[i] = page; - bytes += PAGE_CACHE_SIZE - loff; + bytes += this_len; + len -= this_len; loff = 0; } -- cgit v1.2.3 From ba5f5d90c45a30e4e9a1bd136acf1b3973c905c8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 25 Apr 2006 15:33:34 +0200 Subject: [PATCH] splice: fix min() warning Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 0559e7577a0..4aa67254740 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -283,7 +283,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, /* * this_len is the max we'll use from this page */ - this_len = min(len, PAGE_CACHE_SIZE - loff); + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); find_page: /* * lookup the page for this index -- cgit v1.2.3 From 016b661e2f717168e600f3c85f29e1a49f88e004 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Apr 2006 15:42:00 +0200 Subject: [PATCH] splice: fix offset problems Make the move_from_pipe() actors return number of bytes processed, then move_from_pipe() can decide more cleverly when to move on to the next buffer. This fixes problems with pipe offset and differing file offset. Signed-off-by: Jens Axboe --- fs/splice.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 4aa67254740..8c6030c762e 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -439,14 +439,13 @@ EXPORT_SYMBOL(generic_file_splice_read); /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' - * using sendpage(). + * using sendpage(). Return the number of bytes sent. */ static int pipe_to_sendpage(struct pipe_inode_info *info, struct pipe_buffer *buf, struct splice_desc *sd) { struct file *file = sd->file; loff_t pos = sd->pos; - unsigned int offset; ssize_t ret; void *ptr; int more; @@ -461,16 +460,13 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, if (IS_ERR(ptr)) return PTR_ERR(ptr); - offset = pos & ~PAGE_CACHE_MASK; more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more); + ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len, + &pos, more); buf->ops->unmap(info, buf); - if (ret == sd->len) - return 0; - - return -EIO; + return ret; } /* @@ -499,7 +495,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, struct file *file = sd->file; struct address_space *mapping = file->f_mapping; gfp_t gfp_mask = mapping_gfp_mask(mapping); - unsigned int offset; + unsigned int offset, this_len; struct page *page; pgoff_t index; char *src; @@ -515,6 +511,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; + this_len = sd->len; + if (this_len + offset > PAGE_CACHE_SIZE) + this_len = PAGE_CACHE_SIZE - offset; + /* * Reuse buf page, if SPLICE_F_MOVE is set. */ @@ -558,7 +558,7 @@ find_page: * the full page. */ if (!PageUptodate(page)) { - if (sd->len < PAGE_CACHE_SIZE) { + if (this_len < PAGE_CACHE_SIZE) { ret = mapping->a_ops->readpage(file, page); if (unlikely(ret)) goto out; @@ -582,7 +582,7 @@ find_page: } } - ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); + ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); if (ret == AOP_TRUNCATED_PAGE) { page_cache_release(page); goto find_page; @@ -592,18 +592,22 @@ find_page: if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { char *dst = kmap_atomic(page, KM_USER0); - memcpy(dst + offset, src + buf->offset, sd->len); + memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); kunmap_atomic(dst, KM_USER0); } - ret = mapping->a_ops->commit_write(file, page, 0, sd->len); + ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); if (ret == AOP_TRUNCATED_PAGE) { page_cache_release(page); goto find_page; } else if (ret) goto out; + /* + * Return the number of bytes written. + */ + ret = this_len; mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: @@ -652,16 +656,22 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, sd.len = sd.total_len; err = actor(pipe, buf, &sd); - if (err) { + if (err <= 0) { if (!ret && err != -ENODATA) ret = err; break; } - ret += sd.len; - buf->offset += sd.len; - buf->len -= sd.len; + ret += err; + buf->offset += err; + buf->len -= err; + + sd.len -= err; + sd.pos += err; + sd.total_len -= err; + if (sd.len) + continue; if (!buf->len) { buf->ops = NULL; @@ -672,8 +682,6 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, do_wakeup = 1; } - sd.pos += sd.len; - sd.total_len -= sd.len; if (!sd.total_len) break; } -- cgit v1.2.3 From 912d35f86781e64d73be1ef358f703c08905ac37 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 Apr 2006 10:59:21 +0200 Subject: [PATCH] Add support for the sys_vmsplice syscall sys_splice() moves data to/from pipes with a file input/output. sys_vmsplice() moves data to a pipe, with the input being a user address range instead. This uses an approach suggested by Linus, where we can hold partial ranges inside the pages[] map. Hopefully this will be useful for network receive support as well. Signed-off-by: Jens Axboe --- fs/splice.c | 292 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 253 insertions(+), 39 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 8c6030c762e..0b2c1f060ca 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * Passed to the actors @@ -38,6 +39,22 @@ struct splice_desc { loff_t pos; /* file position */ }; +struct partial_page { + unsigned int offset; + unsigned int len; +}; + +/* + * Passed to move_to_pipe + */ +struct splice_pipe_desc { + struct page **pages; /* page map */ + struct partial_page *partial; /* pages[] may not be contig */ + int nr_pages; /* number of pages in map */ + unsigned int flags; /* splice flags */ + struct pipe_buf_operations *ops;/* ops associated with output pipe */ +}; + /* * Attempt to steal a page from a pipe buffer. This should perhaps go into * a vm helper function, it's already simplified quite a bit by the @@ -128,6 +145,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, kunmap(buf->page); } +static void *user_page_pipe_buf_map(struct file *file, + struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return kmap(buf->page); +} + +static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + kunmap(buf->page); +} + static void page_cache_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) { @@ -143,19 +173,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { .get = page_cache_pipe_buf_get, }; +static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static struct pipe_buf_operations user_page_pipe_buf_ops = { + .can_merge = 0, + .map = user_page_pipe_buf_map, + .unmap = user_page_pipe_buf_unmap, + .release = page_cache_pipe_buf_release, + .steal = user_page_pipe_buf_steal, + .get = page_cache_pipe_buf_get, +}; + /* * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ -static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, - int nr_pages, unsigned long len, - unsigned int offset, unsigned int flags) +static ssize_t move_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) { - int ret, do_wakeup, i; + int ret, do_wakeup, page_nr; ret = 0; do_wakeup = 0; - i = 0; + page_nr = 0; if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); @@ -171,27 +215,19 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, if (pipe->nrbufs < PIPE_BUFFERS) { int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; - struct page *page = pages[i++]; - unsigned long this_len; - this_len = PAGE_CACHE_SIZE - offset; - if (this_len > len) - this_len = len; - - buf->page = page; - buf->offset = offset; - buf->len = this_len; - buf->ops = &page_cache_pipe_buf_ops; + buf->page = spd->pages[page_nr]; + buf->offset = spd->partial[page_nr].offset; + buf->len = spd->partial[page_nr].len; + buf->ops = spd->ops; pipe->nrbufs++; + page_nr++; + ret += buf->len; + if (pipe->inode) do_wakeup = 1; - ret += this_len; - len -= this_len; - offset = 0; - if (!--nr_pages) - break; - if (!len) + if (!--spd->nr_pages) break; if (pipe->nrbufs < PIPE_BUFFERS) continue; @@ -199,7 +235,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; } - if (flags & SPLICE_F_NONBLOCK) { + if (spd->flags & SPLICE_F_NONBLOCK) { if (!ret) ret = -EAGAIN; break; @@ -234,8 +270,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } - while (i < nr_pages) - page_cache_release(pages[i++]); + while (page_nr < spd->nr_pages) + page_cache_release(spd->pages[page_nr++]); return ret; } @@ -246,17 +282,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int loff, offset, nr_pages; + unsigned int loff, nr_pages; struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; - size_t bytes; - int i, error; + size_t total_len; + int error; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + }; index = *ppos >> PAGE_CACHE_SHIFT; - loff = offset = *ppos & ~PAGE_CACHE_MASK; - nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) nr_pages = PIPE_BUFFERS; @@ -266,15 +309,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * read-ahead if this is a non-zero offset (we are likely doing small * chunk splice and the page is already there) for a single page. */ - if (!offset || nr_pages > 1) - do_page_cache_readahead(mapping, in, index, nr_pages); + if (!loff || spd.nr_pages > 1) + do_page_cache_readahead(mapping, in, index, spd.nr_pages); /* * Now fill in the holes: */ error = 0; - bytes = 0; - for (i = 0; i < nr_pages; i++, index++) { + total_len = 0; + for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) { unsigned int this_len; if (!len) @@ -367,26 +410,29 @@ readpage: */ if (end_index == index) { loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); - if (bytes + loff > isize) { + if (total_len + loff > isize) { page_cache_release(page); break; } /* * force quit after adding this page */ - nr_pages = i; + nr_pages = spd.nr_pages; this_len = min(this_len, loff); + loff = 0; } } fill_it: - pages[i] = page; - bytes += this_len; + pages[spd.nr_pages] = page; + partial[spd.nr_pages].offset = loff; + partial[spd.nr_pages].len = this_len; len -= this_len; + total_len += this_len; loff = 0; } - if (i) - return move_to_pipe(pipe, pages, i, bytes, offset, flags); + if (spd.nr_pages) + return move_to_pipe(pipe, &spd); return error; } @@ -1018,6 +1064,174 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; } +/* + * Map an iov into an array of pages and offset/length tupples. With the + * partial_page structure, we can map several non-contiguous ranges into + * our ones pages[] map instead of splitting that operation into pieces. + * Could easily be exported as a generic helper for other users, in which + * case one would probably want to add a 'max_nr_pages' parameter as well. + */ +static int get_iovec_page_array(const struct iovec __user *iov, + unsigned int nr_vecs, struct page **pages, + struct partial_page *partial) +{ + int buffers = 0, error = 0; + + /* + * It's ok to take the mmap_sem for reading, even + * across a "get_user()". + */ + down_read(¤t->mm->mmap_sem); + + while (nr_vecs) { + unsigned long off, npages; + void __user *base; + size_t len; + int i; + + /* + * Get user address base and length for this iovec. + */ + error = get_user(base, &iov->iov_base); + if (unlikely(error)) + break; + error = get_user(len, &iov->iov_len); + if (unlikely(error)) + break; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + if (unlikely(!len)) + break; + error = -EFAULT; + if (unlikely(!base)) + break; + + /* + * Get this base offset and number of pages, then map + * in the user pages. + */ + off = (unsigned long) base & ~PAGE_MASK; + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages > PIPE_BUFFERS - buffers) + npages = PIPE_BUFFERS - buffers; + + error = get_user_pages(current, current->mm, + (unsigned long) base, npages, 0, 0, + &pages[buffers], NULL); + + if (unlikely(error <= 0)) + break; + + /* + * Fill this contiguous range into the partial page map. + */ + for (i = 0; i < error; i++) { + const int plen = min_t(size_t, len, PAGE_SIZE) - off; + + partial[buffers].offset = off; + partial[buffers].len = plen; + + off = 0; + len -= plen; + buffers++; + } + + /* + * We didn't complete this iov, stop here since it probably + * means we have to move some of this into a pipe to + * be able to continue. + */ + if (len) + break; + + /* + * Don't continue if we mapped fewer pages than we asked for, + * or if we mapped the max number of pages that we have + * room for. + */ + if (error < npages || buffers == PIPE_BUFFERS) + break; + + nr_vecs--; + iov++; + } + + up_read(¤t->mm->mmap_sem); + + if (buffers) + return buffers; + + return error; +} + +/* + * vmsplice splices a user address range into a pipe. It can be thought of + * as splice-from-memory, where the regular splice is splice-from-file (or + * to file). In both cases the output is a pipe, naturally. + * + * Note that vmsplice only supports splicing _from_ user memory to a pipe, + * not the other way around. Splicing from user memory is a simple operation + * that can be supported without any funky alignment restrictions or nasty + * vm tricks. We simply map in the user memory and fill them into a pipe. + * The reverse isn't quite as easy, though. There are two possible solutions + * for that: + * + * - memcpy() the data internally, at which point we might as well just + * do a regular read() on the buffer anyway. + * - Lots of nasty vm tricks, that are neither fast nor flexible (it + * has restriction limitations on both ends of the pipe). + * + * Alas, it isn't here. + * + */ +static long do_vmsplice(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe; + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &user_page_pipe_buf_ops, + }; + + if (unlikely(!pipe)) + return -EBADF; + if (unlikely(nr_segs > UIO_MAXIOV)) + return -EINVAL; + else if (unlikely(!nr_segs)) + return 0; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial); + if (spd.nr_pages <= 0) + return spd.nr_pages; + + return move_to_pipe(pipe, &spd); +} + +asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct file *file; + long error; + int fput; + + error = -EBADF; + file = fget_light(fd, &fput); + if (file) { + if (file->f_mode & FMODE_WRITE) + error = do_vmsplice(file, iov, nr_segs, flags); + + fput_light(file, fput); + } + + return error; +} + asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, int fd_out, loff_t __user *off_out, size_t len, unsigned int flags) -- cgit v1.2.3 From 00522fb41a2a9bf0f98a007c0e2b516a3873148c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 Apr 2006 14:39:29 +0200 Subject: [PATCH] splice: rearrange moving to/from pipe helpers We need these for people writing their own ->splice_read/write hooks. Signed-off-by: Jens Axboe --- fs/splice.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 0b2c1f060ca..447ebc0a37f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -29,23 +29,13 @@ #include #include -/* - * Passed to the actors - */ -struct splice_desc { - unsigned int len, total_len; /* current and remaining length */ - unsigned int flags; /* splice flags */ - struct file *file; /* file to read/write */ - loff_t pos; /* file position */ -}; - struct partial_page { unsigned int offset; unsigned int len; }; /* - * Passed to move_to_pipe + * Passed to splice_to_pipe */ struct splice_pipe_desc { struct page **pages; /* page map */ @@ -192,8 +182,8 @@ static struct pipe_buf_operations user_page_pipe_buf_ops = { * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ -static ssize_t move_to_pipe(struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) +static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) { int ret, do_wakeup, page_nr; @@ -432,7 +422,7 @@ fill_it: } if (spd.nr_pages) - return move_to_pipe(pipe, &spd); + return splice_to_pipe(pipe, &spd); return error; } @@ -666,17 +656,14 @@ out_nomem: return ret; } -typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, - struct splice_desc *); - /* * Pipe input worker. Most of this logic works like a regular pipe, the * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. */ -static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags, - splice_actor *actor) +ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor) { int ret, do_wakeup, err; struct splice_desc sd; @@ -795,7 +782,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, struct address_space *mapping = out->f_mapping; ssize_t ret; - ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); + ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); if (ret > 0) { struct inode *inode = mapping->host; @@ -837,7 +824,7 @@ EXPORT_SYMBOL(generic_file_splice_write); ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); + return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -924,7 +911,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, /* * We don't have an immediate reader, but we'll read the stuff - * out of the pipe right after the move_to_pipe(). So set + * out of the pipe right after the splice_to_pipe(). So set * PIPE_READERS appropriately. */ pipe->readers = 1; @@ -1210,7 +1197,7 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, if (spd.nr_pages <= 0) return spd.nr_pages; - return move_to_pipe(pipe, &spd); + return splice_to_pipe(pipe, &spd); } asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, -- cgit v1.2.3 From eb645a24de82496434cc81171d7f350edb327399 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Apr 2006 08:44:27 +0200 Subject: [PATCH] splice: switch to using page_cache_readahead() Avoids doing useless work, when the file is fully cached. Signed-off-by: Jens Axboe --- fs/splice.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index 447ebc0a37f..dc205f64309 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -299,8 +299,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * read-ahead if this is a non-zero offset (we are likely doing small * chunk splice and the page is already there) for a single page. */ - if (!loff || spd.nr_pages > 1) - do_page_cache_readahead(mapping, in, index, spd.nr_pages); + if (!loff || nr_pages > 1) + page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); /* * Now fill in the holes: -- cgit v1.2.3 From eb20796bf6fdb95ccf51440ba2a827149bdc037f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Apr 2006 11:05:22 +0200 Subject: [PATCH] splice: make the read-side do batched page lookups Use the new find_get_pages_contig() to potentially look up the entire splice range in one single call. This speeds up generic_file_splice_read() quite a bit. Signed-off-by: Jens Axboe --- fs/splice.c | 95 ++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 30 deletions(-) (limited to 'fs/splice.c') diff --git a/fs/splice.c b/fs/splice.c index dc205f64309..a46ddd28561 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -279,7 +279,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, pgoff_t index, end_index; loff_t isize; size_t total_len; - int error; + int error, page_nr; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, @@ -307,39 +307,67 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, */ error = 0; total_len = 0; - for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) { - unsigned int this_len; - if (!len) - break; + /* + * Lookup the (hopefully) full range of pages we need. + */ + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); + /* + * If find_get_pages_contig() returned fewer pages than we needed, + * allocate the rest. + */ + index += spd.nr_pages; + while (spd.nr_pages < nr_pages) { /* - * this_len is the max we'll use from this page - */ - this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); -find_page: - /* - * lookup the page for this index + * Page could be there, find_get_pages_contig() breaks on + * the first hole. */ page = find_get_page(mapping, index); if (!page) { /* - * page didn't exist, allocate one + * page didn't exist, allocate one. */ page = page_cache_alloc_cold(mapping); if (!page) break; error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_mask(mapping)); + mapping_gfp_mask(mapping)); if (unlikely(error)) { page_cache_release(page); break; } - - goto readpage; + /* + * add_to_page_cache() locks the page, unlock it + * to avoid convoluting the logic below even more. + */ + unlock_page(page); } + pages[spd.nr_pages++] = page; + index++; + } + + /* + * Now loop over the map and see if we need to start IO on any + * pages, fill in the partial map, etc. + */ + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = pages[page_nr]; + /* * If the page isn't uptodate, we may need to start io on it */ @@ -360,7 +388,6 @@ find_page: */ if (!page->mapping) { unlock_page(page); - page_cache_release(page); break; } /* @@ -371,16 +398,20 @@ find_page: goto fill_it; } -readpage: /* * need to read in the page */ error = mapping->a_ops->readpage(in, page); - if (unlikely(error)) { - page_cache_release(page); + /* + * We really should re-lookup the page here, + * but it complicates things a lot. Instead + * lets just do what we already stored, and + * we'll get it the next time we are called. + */ if (error == AOP_TRUNCATED_PAGE) - goto find_page; + error = 0; + break; } @@ -389,10 +420,8 @@ readpage: */ isize = i_size_read(mapping->host); end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!isize || index > end_index)) { - page_cache_release(page); + if (unlikely(!isize || index > end_index)) break; - } /* * if this is the last page, see if we need to shrink @@ -400,27 +429,33 @@ readpage: */ if (end_index == index) { loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); - if (total_len + loff > isize) { - page_cache_release(page); + if (total_len + loff > isize) break; - } /* * force quit after adding this page */ - nr_pages = spd.nr_pages; + len = this_len; this_len = min(this_len, loff); loff = 0; } } fill_it: - pages[spd.nr_pages] = page; - partial[spd.nr_pages].offset = loff; - partial[spd.nr_pages].len = this_len; + partial[page_nr].offset = loff; + partial[page_nr].len = this_len; len -= this_len; total_len += this_len; loff = 0; + spd.nr_pages++; + index++; } + /* + * Release any pages at the end, if we quit early. 'i' is how far + * we got, 'nr_pages' is how many pages are in the map. + */ + while (page_nr < nr_pages) + page_cache_release(pages[page_nr++]); + if (spd.nr_pages) return splice_to_pipe(pipe, &spd); -- cgit v1.2.3