From 48ce8b056c88920c8ac187781048f5dae33c81b9 Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Mon, 5 Jun 2006 08:21:03 -0500 Subject: JFS: commit_mutex cleanups I look at code, and see that 1)locks wasn't release in the opposite order in which they were taken 2)in jfs_rename we lock new_ip, and in "error path" we didn't unlock it 3)I see strange expression: "! !" May be this worth to fix? Signed-off-by: Evgeniy Dushistov Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_txnmgr.c | 2 +- fs/jfs/namei.c | 33 ++++++++++++++++----------------- 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index ac3d66948e8..49618dd94f9 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2944,7 +2944,7 @@ int jfs_sync(void *arg) * Inode is being freed */ list_del_init(&jfs_ip->anon_inode_list); - } else if (! !mutex_trylock(&jfs_ip->commit_mutex)) { + } else if (mutex_trylock(&jfs_ip->commit_mutex)) { /* * inode will be removed from anonymous list * when it is committed diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 09ea03f6227..295268ad231 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -165,8 +165,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, out3: txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; @@ -300,8 +300,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) out3: txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; @@ -384,8 +384,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) if (rc == -EIO) txAbort(tid, 1); txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); goto out2; } @@ -422,8 +422,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); /* * Truncating the directory index table is not guaranteed. It @@ -503,8 +503,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) if (rc == -EIO) txAbort(tid, 1); /* Marks FS Dirty */ txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); IWRITE_UNLOCK(ip); goto out1; } @@ -527,8 +527,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) if ((new_size = commitZeroLink(tid, ip)) < 0) { txAbort(tid, 1); /* Marks FS Dirty */ txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); IWRITE_UNLOCK(ip); rc = new_size; goto out1; @@ -556,9 +556,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); - + mutex_unlock(&JFS_IP(dip)->commit_mutex); while (new_size && (rc == 0)) { tid = txBegin(dip->i_sb, 0); @@ -847,8 +846,8 @@ static int jfs_link(struct dentry *old_dentry, out: txEnd(tid); - mutex_unlock(&JFS_IP(dir)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dir)->commit_mutex); jfs_info("jfs_link: rc:%d", rc); return rc; @@ -1037,8 +1036,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, out3: txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; @@ -1160,10 +1159,11 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(new_ip->i_mode)) { new_ip->i_nlink--; if (new_ip->i_nlink) { - mutex_unlock(&JFS_IP(new_dir)->commit_mutex); - mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); if (old_dir != new_dir) mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); if (!S_ISDIR(old_ip->i_mode) && new_ip) IWRITE_UNLOCK(new_ip); jfs_error(new_ip->i_sb, @@ -1281,13 +1281,12 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, out4: txEnd(tid); - - mutex_unlock(&JFS_IP(new_dir)->commit_mutex); - mutex_unlock(&JFS_IP(old_ip)->commit_mutex); - if (old_dir != new_dir) - mutex_unlock(&JFS_IP(old_dir)->commit_mutex); if (new_ip) mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + if (old_dir != new_dir) + mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); while (new_size && (rc == 0)) { tid = txBegin(new_ip->i_sb, 0); -- cgit v1.2.3 From 8ba10ab128e88bfbe58f7164543827ef3c3a2c88 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 8 Jul 2006 02:17:40 +0000 Subject: [CIFS] CIFS_DEBUG2 depends on CIFS Signed-off-by: Steve French --- fs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 00aa3d5c5a8..7db05742a92 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1791,6 +1791,7 @@ config CIFS_POSIX config CIFS_DEBUG2 bool "Enable additional CIFS debugging routines" + depends on CIFS help Enabling this option adds a few more debugging routines to the cifs code which slightly increases the size of -- cgit v1.2.3 From aadd06e5c56b9ff5117ec77e59eada43dc46e2fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Jul 2006 11:00:01 +0200 Subject: [PATCH] splice: fix problems with sys_tee() Several issues noticed/fixed: - We cannot reliably block in link_pipe() while holding both input and output mutexes. So do preparatory checks before locking down both mutexes and doing the link. - The ipipe->nrbufs vs i check was bad, because we could have dropped the ipipe lock in-between. This causes us to potentially look at unknown buffers if we were racing with someone else reading this pipe. Signed-off-by: Jens Axboe --- fs/splice.c | 238 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 133 insertions(+), 105 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 05fd2787be9..684bca3d3a1 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1306,6 +1306,85 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, return error; } +/* + * Make sure there's data to read. Wait for input if we can, otherwise + * return an appropriate error. + */ +static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs) + return 0; + + ret = 0; + mutex_lock(&pipe->inode->i_mutex); + + while (!pipe->nrbufs) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } + pipe_wait(pipe); + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + +/* + * Make sure there's writeable room. Wait for room if we can, otherwise + * return an appropriate error. + */ +static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs < PIPE_BUFFERS) + return 0; + + ret = 0; + mutex_lock(&pipe->inode->i_mutex); + + while (pipe->nrbufs >= PIPE_BUFFERS) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + break; + } + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + /* * Link contents of ipipe to opipe. */ @@ -1314,9 +1393,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret, do_wakeup, i, ipipe_first; - - ret = do_wakeup = ipipe_first = 0; + int ret = 0, i = 0, nbuf; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1324,126 +1401,62 @@ static int link_pipe(struct pipe_inode_info *ipipe, * could deadlock (one doing tee from A -> B, the other from B -> A). */ if (ipipe->inode < opipe->inode) { - ipipe_first = 1; - mutex_lock(&ipipe->inode->i_mutex); - mutex_lock(&opipe->inode->i_mutex); + mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_CHILD); } else { - mutex_lock(&opipe->inode->i_mutex); - mutex_lock(&ipipe->inode->i_mutex); + mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_CHILD); } - for (i = 0;; i++) { + do { if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - if (ipipe->nrbufs - i) { - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); - /* - * If we have room, fill this buffer - */ - if (opipe->nrbufs < PIPE_BUFFERS) { - int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); - - /* - * Get a reference to this pipe buffer, - * so we can copy the contents over. - */ - ibuf->ops->get(ipipe, ibuf); - - obuf = opipe->bufs + nbuf; - *obuf = *ibuf; - - /* - * Don't inherit the gift flag, we need to - * prevent multiple steals of this page. - */ - obuf->flags &= ~PIPE_BUF_FLAG_GIFT; - - if (obuf->len > len) - obuf->len = len; - - opipe->nrbufs++; - do_wakeup = 1; - ret += obuf->len; - len -= obuf->len; - - if (!len) - break; - if (opipe->nrbufs < PIPE_BUFFERS) - continue; - } - - /* - * We have input available, but no output room. - * If we already copied data, return that. If we - * need to drop the opipe lock, it must be ordered - * last to avoid deadlocks. - */ - if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { - if (!ret) - ret = -EAGAIN; - break; - } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&opipe->wait)) - wake_up_interruptible(&opipe->wait); - kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); - do_wakeup = 0; - } + /* + * If we have iterated all input buffers or ran out of + * output room, break. + */ + if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) + break; - opipe->waiting_writers++; - pipe_wait(opipe); - opipe->waiting_writers--; - continue; - } + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); /* - * No input buffers, do the usual checks for available - * writers and blocking and wait if necessary + * Get a reference to this pipe buffer, + * so we can copy the contents over. */ - if (!ipipe->writers) - break; - if (!ipipe->waiting_writers) { - if (ret) - break; - } + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + /* - * pipe_wait() drops the ipipe mutex. To avoid deadlocks - * with another process, we can only safely do that if - * the ipipe lock is ordered last. + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. */ - if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { - if (!ret) - ret = -EAGAIN; - break; - } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; - if (waitqueue_active(&ipipe->wait)) - wake_up_interruptible_sync(&ipipe->wait); - kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + if (obuf->len > len) + obuf->len = len; - pipe_wait(ipipe); - } + opipe->nrbufs++; + ret += obuf->len; + len -= obuf->len; + i++; + } while (len); mutex_unlock(&ipipe->inode->i_mutex); mutex_unlock(&opipe->inode->i_mutex); - if (do_wakeup) { + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) { smp_mb(); if (waitqueue_active(&opipe->wait)) wake_up_interruptible(&opipe->wait); @@ -1464,14 +1477,29 @@ static long do_tee(struct file *in, struct file *out, size_t len, { struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + int ret = -EINVAL; /* - * Link ipipe to the two output pipes, consuming as we go along. + * Duplicate the contents of ipipe to opipe without actually + * copying the data. */ - if (ipipe && opipe) - return link_pipe(ipipe, opipe, len, flags); + if (ipipe && opipe && ipipe != opipe) { + /* + * Keep going, unless we encounter an error. The ipipe/opipe + * ordering doesn't really matter. + */ + ret = link_ipipe_prep(ipipe, flags); + if (!ret) { + ret = link_opipe_prep(opipe, flags); + if (!ret) { + ret = link_pipe(ipipe, opipe, len, flags); + if (!ret && (flags & SPLICE_F_NONBLOCK)) + ret = -EAGAIN; + } + } + } - return -EINVAL; + return ret; } asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) -- cgit v1.2.3 From 73ce5934e2d855db436566297f12966eb507a435 Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Mon, 10 Jul 2006 04:43:56 -0700 Subject: [PATCH] reiserfs: fix journaling issue regarding fsync() When write() extends a file(i_size is increased) and fsync() is called, change of inode must be written to journaling area through fsync(). But,currently the i_trans_id is not correctly updated when i_size is increased. So fsync() does not kick the journal writer. Reiserfs_file_write() already updates the transaction when blocks are allocated, but the case when i_size increases and new blocks are not added is not correctly treated. Following patch fix this bug. Signed-off-by: Hisashi Hifumi Cc: Jeff Mahoney Cc: Chris Mason Cc: Hans Reiser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 752cea12e30..f318b58510f 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -860,8 +860,12 @@ static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_han // this sets the proper flags for O_SYNC to trigger a commit mark_inode_dirty(inode); reiserfs_write_unlock(inode->i_sb); - } else + } else { + reiserfs_write_lock(inode->i_sb); + reiserfs_update_inode_transaction(inode); mark_inode_dirty(inode); + reiserfs_write_unlock(inode->i_sb); + } sd_update = 1; } -- cgit v1.2.3 From 25e206b54b9a20e63b6f5194aeebfa13d37e015c Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Mon, 10 Jul 2006 04:44:00 -0700 Subject: [PATCH] partitions: let partitions inherit policy from disk Change the partition code in fs/partitions/check.c to initialize a newly detected partition's policy field with that of the containing block device (see patch below). My reasoning is that function set_disk_ro() in block/genhd.c modifies the policy field (read-only indicator) of a disk and all contained partitions. When a partition is detected after the call to set_disk_ro(), the policy field of this partition will currently not inherit the disk's policy field. This behavior poses a problem in cases where a block device can be 'logically de- and reactivated' like e.g. the s390 DASD driver because partition detection may run after the policy field has been modified. Signed-off-by: Peter Oberparleiter Acked-by: Al Viro Makes-sense-to: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/partitions/check.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 839634026eb..51c6a748df4 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -339,6 +339,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) p->start_sect = start; p->nr_sects = len; p->partno = part; + p->policy = disk->policy; if (isdigit(disk->kobj.name[strlen(disk->kobj.name)-1])) snprintf(p->kobj.name,KOBJ_NAME_LEN,"%sp%d",disk->kobj.name,part); -- cgit v1.2.3 From 69c3a5b8fd8cfa67be22f6d7ae5c681c6777d817 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:23 -0700 Subject: [PATCH] fs/read_write.c: EXPORT_UNUSED_SYMBOL This patch marks an unused export as EXPORT_UNUSED_SYMBOL. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/read_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 5bc0e9234f9..d4cb3183c99 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -436,7 +436,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) return seg; } -EXPORT_SYMBOL(iov_shorten); +EXPORT_UNUSED_SYMBOL(iov_shorten); /* June 2006 */ /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) -- cgit v1.2.3 From b6174df5eec9cdfd598c03d6d0807e344e109213 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Mon, 10 Jul 2006 04:44:49 -0700 Subject: [PATCH] mmap zero-length hugetlb file with PROT_NONE to protect a hugetlb virtual area Sometimes, applications need below call to be successful although "/mnt/hugepages/file1" doesn't exist. fd = open("/mnt/hugepages/file1", O_CREAT|O_RDWR, 0755); *addr = mmap(NULL, 0x1024*1024*256, PROT_NONE, 0, fd, 0); As for regular pages (or files), above call does work, but as for huge pages, above call would fail because hugetlbfs_file_mmap would fail if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size). This capability on huge page is useful on ia64 when the process wants to protect one area on region 4, so other threads couldn't read/write this area. A famous JVM (Java Virtual Machine) implementation on IA64 needs the capability. Signed-off-by: Zhang Yanmin Cc: David Gibson Cc: Hugh Dickins [ Expand-on-mmap semantics again... this time matching normal fs's. wli ] Acked-by: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6449cb69796..c3920c96dad 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -83,8 +83,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) - goto out; if (vma->vm_flags & VM_MAYSHARE && hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), @@ -93,7 +91,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = 0; hugetlb_prefault_arch_hook(vma->vm_mm); - if (inode->i_size < len) + if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: mutex_unlock(&inode->i_mutex); -- cgit v1.2.3 From 1aeb21d626327ee909fef03f72aea6e8a60e6c0c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:50 -0700 Subject: [PATCH] FDPIC: Fix FDPIC compile errors Fix FDPIC compile errors. (akpm: we suspect it fixes a warning) Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index eba4e23b9ca..07624b95aee 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -459,6 +459,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, */ hwcap = ELF_HWCAP; k_platform = ELF_PLATFORM; + u_platform = NULL; if (k_platform) { platform_len = strlen(k_platform) + 1; -- cgit v1.2.3 From 21ff821630c0e64f5d2fab96ced72000d77fa90b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:52 -0700 Subject: [PATCH] NOMMU: Fix execution off of ramfs with mmap() Fix execution through the FDPIC binfmt of programs stored on ramfs by preventing the ramfs mmap() returning successfully on a private mapping of a ramfs file. This causes NOMMU mmap to make a copy of the mapped portion of the file and map that instead. This could be improved by granting direct mapping access to read-only private mappings for which the data is stored on a contiguous run of pages. However, this is only likely to be the case if the file was extended with truncate before being written. ramfs is left to map the file directly for shared mappings so that SYSV IPC and POSIX shared memory both still work. Signed-off-by: David Howells Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 99fffc9e1bf..677139b48e0 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -283,9 +283,9 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file, /*****************************************************************************/ /* - * set up a mapping + * set up a mapping for shared memory segments */ int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { - return 0; + return vma->vm_flags & VM_SHARED ? 0 : -ENOSYS; } -- cgit v1.2.3 From 8a2ab7f5df76b920d62b908919d987d3b8a82856 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:53 -0700 Subject: [PATCH] FDPIC: Adjust the ELF-FDPIC driver to conform more to the CodingStyle Adjust the ELF-FDPIC binfmt driver to conform much more to the CodingStyle, silly though it may be. Further changes: (*) Drop the casts to long for addresses in kdebug() statements (they're unsigned long already). (*) Use extra variables to avoid expressions longer than 80 chars by splitting the statement into multiple statements and letting the compiler optimise them back together. (*) Eliminate duplicate call of ksize() when working out how much space was actually allocated for the stack. (*) Discard the commented-out load_shlib prototype and op pointer as this will not be supported in ELF-FDPIC for the foreseeable future. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 305 +++++++++++++++++++++++++++----------------------- 1 file changed, 168 insertions(+), 137 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 07624b95aee..a4ff8738982 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1,6 +1,6 @@ /* binfmt_elf_fdpic.c: FDPIC ELF binary format * - * Copyright (C) 2003, 2004 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2003, 2004, 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * Derived from binfmt_elf.c * @@ -50,43 +50,45 @@ typedef char *elf_caddr_t; MODULE_LICENSE("GPL"); -static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs); -//static int load_elf_fdpic_library(struct file *); -static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *file); -static int elf_fdpic_map_file(struct elf_fdpic_params *params, - struct file *file, - struct mm_struct *mm, - const char *what); +static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); +static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); +static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, + struct mm_struct *, const char *); -static int create_elf_fdpic_tables(struct linux_binprm *bprm, - struct mm_struct *mm, - struct elf_fdpic_params *exec_params, - struct elf_fdpic_params *interp_params); +static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *, + struct elf_fdpic_params *, + struct elf_fdpic_params *); #ifndef CONFIG_MMU -static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *_sp); -static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *params, - struct file *file, - struct mm_struct *mm); +static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *, + unsigned long *); +static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, + struct file *, + struct mm_struct *); #endif -static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, - struct file *file, - struct mm_struct *mm); +static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, + struct file *, struct mm_struct *); static struct linux_binfmt elf_fdpic_format = { .module = THIS_MODULE, .load_binary = load_elf_fdpic_binary, -// .load_shlib = load_elf_fdpic_library, // .core_dump = elf_fdpic_core_dump, .min_coredump = ELF_EXEC_PAGESIZE, }; -static int __init init_elf_fdpic_binfmt(void) { return register_binfmt(&elf_fdpic_format); } -static void __exit exit_elf_fdpic_binfmt(void) { unregister_binfmt(&elf_fdpic_format); } +static int __init init_elf_fdpic_binfmt(void) +{ + return register_binfmt(&elf_fdpic_format); +} -module_init(init_elf_fdpic_binfmt) -module_exit(exit_elf_fdpic_binfmt) +static void __exit exit_elf_fdpic_binfmt(void) +{ + unregister_binfmt(&elf_fdpic_format); +} + +module_init(init_elf_fdpic_binfmt); +module_exit(exit_elf_fdpic_binfmt); static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) { @@ -105,7 +107,8 @@ static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) /* * read the program headers table into memory */ -static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *file) +static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, + struct file *file) { struct elf32_phdr *phdr; unsigned long size; @@ -121,7 +124,8 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *f if (!params->phdrs) return -ENOMEM; - retval = kernel_read(file, params->hdr.e_phoff, (char *) params->phdrs, size); + retval = kernel_read(file, params->hdr.e_phoff, + (char *) params->phdrs, size); if (retval < 0) return retval; @@ -141,17 +145,24 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *f } return 0; -} /* end elf_fdpic_fetch_phdrs() */ +} /*****************************************************************************/ /* * load an fdpic binary into various bits of memory */ -static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs) +static int load_elf_fdpic_binary(struct linux_binprm *bprm, + struct pt_regs *regs) { struct elf_fdpic_params exec_params, interp_params; struct elf_phdr *phdr; - unsigned long stack_size; + unsigned long stack_size, entryaddr; +#ifndef CONFIG_MMU + unsigned long fullsize; +#endif +#ifdef ELF_FDPIC_PLAT_INIT + unsigned long dynaddr; +#endif struct file *interpreter = NULL; /* to shut gcc up */ char *interpreter_name = NULL; int executable_stack; @@ -212,7 +223,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs goto error; } - retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE); + retval = kernel_read(interpreter, 0, bprm->buf, + BINPRM_BUF_SIZE); if (retval < 0) goto error; @@ -295,7 +307,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs ¤t->mm->start_stack, ¤t->mm->start_brk); - retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack); + retval = setup_arg_pages(bprm, current->mm->start_stack, + executable_stack); if (retval < 0) { send_sig(SIGKILL, current, 0); goto error_kill; @@ -303,7 +316,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs #endif /* load the executable and interpreter into memory */ - retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm, "executable"); + retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm, + "executable"); if (retval < 0) goto error_kill; @@ -324,7 +338,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs if (!current->mm->start_brk) current->mm->start_brk = current->mm->end_data; - current->mm->brk = current->mm->start_brk = PAGE_ALIGN(current->mm->start_brk); + current->mm->brk = current->mm->start_brk = + PAGE_ALIGN(current->mm->start_brk); #else /* create a stack and brk area big enough for everyone @@ -336,47 +351,45 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs stack_size = PAGE_SIZE * 2; down_write(¤t->mm->mmap_sem); - current->mm->start_brk = do_mmap(NULL, - 0, - stack_size, + current->mm->start_brk = do_mmap(NULL, 0, stack_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 0); - if (IS_ERR((void *) current->mm->start_brk)) { + if (IS_ERR_VALUE(current->mm->start_brk)) { up_write(¤t->mm->mmap_sem); retval = current->mm->start_brk; current->mm->start_brk = 0; goto error_kill; } - if (do_mremap(current->mm->start_brk, - stack_size, - ksize((char *) current->mm->start_brk), - 0, 0 - ) == current->mm->start_brk - ) - stack_size = ksize((char *) current->mm->start_brk); + /* expand the stack mapping to use up the entire allocation granule */ + fullsize = ksize((char *) current->mm->start_brk); + if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size, + fullsize, 0, 0))) + stack_size = fullsize; up_write(¤t->mm->mmap_sem); current->mm->brk = current->mm->start_brk; current->mm->context.end_brk = current->mm->start_brk; - current->mm->context.end_brk += (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0; + current->mm->context.end_brk += + (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0; current->mm->start_stack = current->mm->start_brk + stack_size; #endif compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; - if (create_elf_fdpic_tables(bprm, current->mm, &exec_params, &interp_params) < 0) + if (create_elf_fdpic_tables(bprm, current->mm, + &exec_params, &interp_params) < 0) goto error_kill; - kdebug("- start_code %lx", (long) current->mm->start_code); - kdebug("- end_code %lx", (long) current->mm->end_code); - kdebug("- start_data %lx", (long) current->mm->start_data); - kdebug("- end_data %lx", (long) current->mm->end_data); - kdebug("- start_brk %lx", (long) current->mm->start_brk); - kdebug("- brk %lx", (long) current->mm->brk); - kdebug("- start_stack %lx", (long) current->mm->start_stack); + kdebug("- start_code %lx", current->mm->start_code); + kdebug("- end_code %lx", current->mm->end_code); + kdebug("- start_data %lx", current->mm->start_data); + kdebug("- end_data %lx", current->mm->end_data); + kdebug("- start_brk %lx", current->mm->start_brk); + kdebug("- brk %lx", current->mm->brk); + kdebug("- start_stack %lx", current->mm->start_stack); #ifdef ELF_FDPIC_PLAT_INIT /* @@ -385,21 +398,18 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs * example. This macro performs whatever initialization to * the regs structure is required. */ - ELF_FDPIC_PLAT_INIT(regs, - exec_params.map_addr, - interp_params.map_addr, - interp_params.dynamic_addr ?: exec_params.dynamic_addr - ); + dynaddr = interp_params.dynamic_addr ?: exec_params.dynamic_addr; + ELF_FDPIC_PLAT_INIT(regs, exec_params.map_addr, interp_params.map_addr, + dynaddr); #endif /* everything is now ready... get the userspace context ready to roll */ - start_thread(regs, - interp_params.entry_addr ?: exec_params.entry_addr, - current->mm->start_stack); + entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; + start_thread(regs, entryaddr, current->mm->start_stack); if (unlikely(current->ptrace & PT_PTRACED)) { if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); + ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP); else send_sig(SIGTRAP, current, 0); } @@ -419,11 +429,11 @@ error: return retval; /* unrecoverable error - kill the process */ - error_kill: +error_kill: send_sig(SIGSEGV, current, 0); goto error; -} /* end load_elf_fdpic_binary() */ +} /*****************************************************************************/ /* @@ -471,11 +481,11 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, #if defined(__i386__) && defined(CONFIG_SMP) /* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions - * by the processes running on the same package. One thing we can do - * is to shuffle the initial stack for them. + * by the processes running on the same package. One thing we can do is + * to shuffle the initial stack for them. * - * the conditionals here are unneeded, but kept in to make the - * code behaviour the same as pre change unless we have hyperthreaded + * the conditionals here are unneeded, but kept in to make the code + * behaviour the same as pre change unless we have hyperthreaded * processors. This keeps Mr Marcelo Person happier but should be * removed for 2.5 */ @@ -498,11 +508,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, if (interp_params->loadmap) { len = sizeof(struct elf32_fdpic_loadmap); - len += sizeof(struct elf32_fdpic_loadseg) * interp_params->loadmap->nsegs; + len += sizeof(struct elf32_fdpic_loadseg) * + interp_params->loadmap->nsegs; sp = (sp - len) & ~7UL; interp_params->map_addr = sp; - if (copy_to_user((void __user *) sp, interp_params->loadmap, len) != 0) + if (copy_to_user((void __user *) sp, interp_params->loadmap, + len) != 0) return -EFAULT; current->mm->context.interp_fdpic_loadmap = (unsigned long) sp; @@ -526,34 +538,37 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, sp -= sp & 15UL; /* put the ELF interpreter info on the stack */ -#define NEW_AUX_ENT(nr, id, val) \ - do { \ - struct { unsigned long _id, _val; } __user *ent = (void __user *) csp; \ - __put_user((id), &ent[nr]._id); \ - __put_user((val), &ent[nr]._val); \ +#define NEW_AUX_ENT(nr, id, val) \ + do { \ + struct { unsigned long _id, _val; } __user *ent; \ + \ + ent = (void __user *) csp; \ + __put_user((id), &ent[nr]._id); \ + __put_user((val), &ent[nr]._val); \ } while (0) csp -= 2 * sizeof(unsigned long); NEW_AUX_ENT(0, AT_NULL, 0); if (k_platform) { csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(0, AT_PLATFORM, (elf_addr_t)(unsigned long) u_platform); + NEW_AUX_ENT(0, AT_PLATFORM, + (elf_addr_t) (unsigned long) u_platform); } csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); - NEW_AUX_ENT( 0, AT_HWCAP, hwcap); - NEW_AUX_ENT( 1, AT_PAGESZ, PAGE_SIZE); - NEW_AUX_ENT( 2, AT_CLKTCK, CLOCKS_PER_SEC); - NEW_AUX_ENT( 3, AT_PHDR, exec_params->ph_addr); - NEW_AUX_ENT( 4, AT_PHENT, sizeof(struct elf_phdr)); - NEW_AUX_ENT( 5, AT_PHNUM, exec_params->hdr.e_phnum); - NEW_AUX_ENT( 6, AT_BASE, interp_params->elfhdr_addr); - NEW_AUX_ENT( 7, AT_FLAGS, 0); - NEW_AUX_ENT( 8, AT_ENTRY, exec_params->entry_addr); - NEW_AUX_ENT( 9, AT_UID, (elf_addr_t) current->uid); - NEW_AUX_ENT(10, AT_EUID, (elf_addr_t) current->euid); - NEW_AUX_ENT(11, AT_GID, (elf_addr_t) current->gid); - NEW_AUX_ENT(12, AT_EGID, (elf_addr_t) current->egid); + NEW_AUX_ENT( 0, AT_HWCAP, hwcap); + NEW_AUX_ENT( 1, AT_PAGESZ, PAGE_SIZE); + NEW_AUX_ENT( 2, AT_CLKTCK, CLOCKS_PER_SEC); + NEW_AUX_ENT( 3, AT_PHDR, exec_params->ph_addr); + NEW_AUX_ENT( 4, AT_PHENT, sizeof(struct elf_phdr)); + NEW_AUX_ENT( 5, AT_PHNUM, exec_params->hdr.e_phnum); + NEW_AUX_ENT( 6, AT_BASE, interp_params->elfhdr_addr); + NEW_AUX_ENT( 7, AT_FLAGS, 0); + NEW_AUX_ENT( 8, AT_ENTRY, exec_params->entry_addr); + NEW_AUX_ENT( 9, AT_UID, (elf_addr_t) current->uid); + NEW_AUX_ENT(10, AT_EUID, (elf_addr_t) current->euid); + NEW_AUX_ENT(11, AT_GID, (elf_addr_t) current->gid); + NEW_AUX_ENT(12, AT_EGID, (elf_addr_t) current->egid); #ifdef ARCH_DLINFO /* ARCH_DLINFO must come last so platform specific code can enforce @@ -579,7 +594,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, #ifdef CONFIG_MMU current->mm->arg_start = bprm->p; #else - current->mm->arg_start = current->mm->start_stack - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p); + current->mm->arg_start = current->mm->start_stack - + (MAX_ARG_PAGES * PAGE_SIZE - bprm->p); #endif p = (char __user *) current->mm->arg_start; @@ -607,7 +623,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, mm->start_stack = (unsigned long) sp; return 0; -} /* end create_elf_fdpic_tables() */ +} /*****************************************************************************/ /* @@ -615,7 +631,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, * the stack */ #ifndef CONFIG_MMU -static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *_sp) +static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, + unsigned long *_sp) { unsigned long index, stop, sp; char *src; @@ -636,9 +653,9 @@ static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, unsigned *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15; - out: +out: return ret; -} /* end elf_fdpic_transfer_args_to_stack() */ +} #endif /*****************************************************************************/ @@ -713,17 +730,18 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, seg = loadmap->segs; for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { if (params->hdr.e_entry >= seg->p_vaddr && - params->hdr.e_entry < seg->p_vaddr + seg->p_memsz - ) { + params->hdr.e_entry < seg->p_vaddr + seg->p_memsz) { params->entry_addr = - (params->hdr.e_entry - seg->p_vaddr) + seg->addr; + (params->hdr.e_entry - seg->p_vaddr) + + seg->addr; break; } } } /* determine where the program header table has wound up if mapped */ - stop = params->hdr.e_phoff + params->hdr.e_phnum * sizeof (struct elf_phdr); + stop = params->hdr.e_phoff; + stop += params->hdr.e_phnum * sizeof (struct elf_phdr); phdr = params->phdrs; for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { @@ -737,9 +755,11 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, seg = loadmap->segs; for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { if (phdr->p_vaddr >= seg->p_vaddr && - phdr->p_vaddr + phdr->p_filesz <= seg->p_vaddr + seg->p_memsz - ) { - params->ph_addr = (phdr->p_vaddr - seg->p_vaddr) + seg->addr + + phdr->p_vaddr + phdr->p_filesz <= + seg->p_vaddr + seg->p_memsz) { + params->ph_addr = + (phdr->p_vaddr - seg->p_vaddr) + + seg->addr + params->hdr.e_phoff - phdr->p_offset; break; } @@ -756,18 +776,22 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, seg = loadmap->segs; for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { if (phdr->p_vaddr >= seg->p_vaddr && - phdr->p_vaddr + phdr->p_memsz <= seg->p_vaddr + seg->p_memsz - ) { - params->dynamic_addr = (phdr->p_vaddr - seg->p_vaddr) + seg->addr; - - /* check the dynamic section contains at least one item, and that - * the last item is a NULL entry */ + phdr->p_vaddr + phdr->p_memsz <= + seg->p_vaddr + seg->p_memsz) { + params->dynamic_addr = + (phdr->p_vaddr - seg->p_vaddr) + + seg->addr; + + /* check the dynamic section contains at least + * one item, and that the last item is a NULL + * entry */ if (phdr->p_memsz == 0 || phdr->p_memsz % sizeof(Elf32_Dyn) != 0) goto dynamic_error; tmp = phdr->p_memsz / sizeof(Elf32_Dyn); - if (((Elf32_Dyn *) params->dynamic_addr)[tmp - 1].d_tag != 0) + if (((Elf32_Dyn *) + params->dynamic_addr)[tmp - 1].d_tag != 0) goto dynamic_error; break; } @@ -776,8 +800,8 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, } /* now elide adjacent segments in the load map on MMU linux - * - on uClinux the holes between may actually be filled with system stuff or stuff from - * other processes + * - on uClinux the holes between may actually be filled with system + * stuff or stuff from other processes */ #ifdef CONFIG_MMU nloads = loadmap->nsegs; @@ -788,7 +812,9 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, if (seg->p_vaddr - mseg->p_vaddr == seg->addr - mseg->addr) { load_addr = PAGE_ALIGN(mseg->addr + mseg->p_memsz); if (load_addr == (seg->addr & PAGE_MASK)) { - mseg->p_memsz += load_addr - (mseg->addr + mseg->p_memsz); + mseg->p_memsz += + load_addr - + (mseg->addr + mseg->p_memsz); mseg->p_memsz += seg->addr & ~PAGE_MASK; mseg->p_memsz += seg->p_memsz; loadmap->nsegs--; @@ -816,20 +842,21 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, return 0; - dynamic_error: +dynamic_error: printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n", what, file->f_dentry->d_inode->i_ino); return -ELIBBAD; -} /* end elf_fdpic_map_file() */ +} /*****************************************************************************/ /* * map a file with constant displacement under uClinux */ #ifndef CONFIG_MMU -static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *params, - struct file *file, - struct mm_struct *mm) +static int elf_fdpic_map_file_constdisp_on_uclinux( + struct elf_fdpic_params *params, + struct file *file, + struct mm_struct *mm) { struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; @@ -840,7 +867,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para load_addr = params->load_addr; seg = params->loadmap->segs; - /* determine the bounds of the contiguous overall allocation we must make */ + /* determine the bounds of the contiguous overall allocation we must + * make */ phdr = params->phdrs; for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { if (params->phdrs[loop].p_type != PT_LOAD) @@ -861,7 +889,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para maddr = do_mmap(NULL, load_addr, top - base, PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); up_write(&mm->mmap_sem); - if (IS_ERR((void *) maddr)) + if (IS_ERR_VALUE(maddr)) return (int) maddr; if (load_addr != 0) @@ -879,7 +907,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para seg->p_vaddr = phdr->p_vaddr; seg->p_memsz = phdr->p_memsz; - ret = file->f_op->read(file, (void *) seg->addr, phdr->p_filesz, &fpos); + ret = file->f_op->read(file, (void *) seg->addr, + phdr->p_filesz, &fpos); if (ret < 0) return ret; @@ -896,8 +925,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para if (phdr->p_flags & PF_X) { mm->start_code = seg->addr; mm->end_code = seg->addr + phdr->p_memsz; - } - else if (!mm->start_data) { + } else if (!mm->start_data) { mm->start_data = seg->addr; #ifndef CONFIG_MMU mm->end_data = seg->addr + phdr->p_memsz; @@ -914,7 +942,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para } return 0; -} /* end elf_fdpic_map_file_constdisp_on_uclinux() */ +} #endif /*****************************************************************************/ @@ -975,14 +1003,14 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, case ELF_FDPIC_FLAG_CONSTDISP: /* constant displacement - * - can be mapped anywhere, but must be mapped as a unit + * - can be mapped anywhere, but must be mapped as a + * unit */ if (!dvset) { maddr = load_addr; delta_vaddr = phdr->p_vaddr; dvset = 1; - } - else { + } else { maddr = load_addr + phdr->p_vaddr - delta_vaddr; flags |= MAP_FIXED; } @@ -1006,13 +1034,14 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, up_write(&mm->mmap_sem); kdebug("mmap[%d] sz=%lx pr=%x fl=%x of=%lx --> %08lx", - loop, phdr->p_memsz + disp, prot, flags, phdr->p_offset - disp, - maddr); + loop, phdr->p_memsz + disp, prot, flags, + phdr->p_offset - disp, maddr); - if (IS_ERR((void *) maddr)) + if (IS_ERR_VALUE(maddr)) return (int) maddr; - if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) == ELF_FDPIC_FLAG_CONTIGUOUS) + if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) == + ELF_FDPIC_FLAG_CONTIGUOUS) load_addr += PAGE_ALIGN(phdr->p_memsz + disp); seg->addr = maddr + disp; @@ -1023,7 +1052,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (phdr->p_offset == 0) params->elfhdr_addr = seg->addr; - /* clear the bit between beginning of mapping and beginning of PT_LOAD */ + /* clear the bit between beginning of mapping and beginning of + * PT_LOAD */ if (prot & PROT_WRITE && disp > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); clear_user((void __user *) maddr, disp); @@ -1039,19 +1069,20 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); #ifdef CONFIG_MMU - if (excess > excess1) { unsigned long xaddr = maddr + phdr->p_filesz + excess1; unsigned long xmaddr; flags |= MAP_FIXED | MAP_ANONYMOUS; down_write(&mm->mmap_sem); - xmaddr = do_mmap(NULL, xaddr, excess - excess1, prot, flags, 0); + xmaddr = do_mmap(NULL, xaddr, excess - excess1, + prot, flags, 0); up_write(&mm->mmap_sem); kdebug("mmap[%d] " " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx", - loop, xaddr, excess - excess1, prot, flags, xmaddr); + loop, xaddr, excess - excess1, prot, flags, + xmaddr); if (xmaddr != xaddr) return -ENOMEM; @@ -1060,7 +1091,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (prot & PROT_WRITE && excess1 > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr + phdr->p_filesz, excess1); - clear_user((void __user *) maddr + phdr->p_filesz, excess1); + clear_user((void __user *) maddr + phdr->p_filesz, + excess1); } #else @@ -1075,8 +1107,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (phdr->p_flags & PF_X) { mm->start_code = maddr; mm->end_code = maddr + phdr->p_memsz; - } - else if (!mm->start_data) { + } else if (!mm->start_data) { mm->start_data = maddr; mm->end_data = maddr + phdr->p_memsz; } @@ -1086,4 +1117,4 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, } return 0; -} /* end elf_fdpic_map_file_by_direct_mmap() */ +} -- cgit v1.2.3 From b4cac1a0227a6f84be0381cd350a3c8730a4a671 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:54 -0700 Subject: [PATCH] FDPIC: Move roundup() into linux/kernel.h Move the roundup() macro from binfmt_elf.c into linux/kernel.h as it's generally useful. [akpm@osdl.org: nuke all the other implementations] Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 -- fs/proc/kcore.c | 2 -- fs/xfs/linux-2.6/xfs_linux.h | 1 - 3 files changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f42e64210ee..672a3b90bc5 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1185,8 +1185,6 @@ static int maydump(struct vm_area_struct *vma) return 1; } -#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) - /* An ELF note in memory */ struct memelfnote { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 036d14d8362..8d6d85d7400 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -42,8 +42,6 @@ const struct file_operations proc_kcore_operations = { #define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) #endif -#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) - /* An ELF note in memory */ struct memelfnote { diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 8c021dc57d1..a13f75c1a93 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -215,7 +215,6 @@ BUFFER_FNS(PrivateStart, unwritten); #define MIN(a,b) (min(a,b)) #define MAX(a,b) (max(a,b)) #define howmany(x, y) (((x)+((y)-1))/(y)) -#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* * Various platform dependent calls that don't fit anywhere else -- cgit v1.2.3 From 6d8c4e3b0150ff537902477ed62f8a8e9e70007b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:55 -0700 Subject: [PATCH] FDPIC: Add coredump capability for the ELF-FDPIC binfmt Add coredump capability for the ELF-FDPIC binfmt. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 676 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 674 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index a4ff8738982..2f336582922 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -24,7 +24,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -48,6 +50,12 @@ typedef char *elf_caddr_t; #define kdebug(fmt, ...) do {} while(0) #endif +#if 0 +#define kdcore(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) +#else +#define kdcore(fmt, ...) do {} while(0) +#endif + MODULE_LICENSE("GPL"); static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); @@ -70,10 +78,16 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, struct file *, struct mm_struct *); +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) +static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *); +#endif + static struct linux_binfmt elf_fdpic_format = { .module = THIS_MODULE, .load_binary = load_elf_fdpic_binary, -// .core_dump = elf_fdpic_core_dump, +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) + .core_dump = elf_fdpic_core_dump, +#endif .min_coredump = ELF_EXEC_PAGESIZE, }; @@ -87,7 +101,7 @@ static void __exit exit_elf_fdpic_binfmt(void) unregister_binfmt(&elf_fdpic_format); } -module_init(init_elf_fdpic_binfmt); +core_initcall(init_elf_fdpic_binfmt); module_exit(exit_elf_fdpic_binfmt); static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) @@ -1118,3 +1132,661 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, return 0; } + +/*****************************************************************************/ +/* + * ELF-FDPIC core dumper + * + * Modelled on fs/exec.c:aout_core_dump() + * Jeremy Fitzhardinge + * + * Modelled on fs/binfmt_elf.c core dumper + */ +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) + +/* + * These are the only things you should do on a core-file: use only these + * functions to write out all the necessary info. + */ +static int dump_write(struct file *file, const void *addr, int nr) +{ + return file->f_op->write(file, addr, nr, &file->f_pos) == nr; +} + +static int dump_seek(struct file *file, loff_t off) +{ + if (file->f_op->llseek) { + if (file->f_op->llseek(file, off, SEEK_SET) != off) + return 0; + } else { + file->f_pos = off; + } + return 1; +} + +/* + * Decide whether a segment is worth dumping; default is yes to be + * sure (missing info is worse than too much; etc). + * Personally I'd include everything, and use the coredump limit... + * + * I think we should skip something. But I am not sure how. H.J. + */ +static int maydump(struct vm_area_struct *vma) +{ + /* Do not dump I/O mapped devices or special mappings */ + if (vma->vm_flags & (VM_IO | VM_RESERVED)) { + kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); + return 0; + } + + /* If we may not read the contents, don't allow us to dump + * them either. "dump_write()" can't handle it anyway. + */ + if (!(vma->vm_flags & VM_READ)) { + kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); + return 0; + } + + /* Dump shared memory only if mapped from an anonymous file. */ + if (vma->vm_flags & VM_SHARED) { + if (vma->vm_file->f_dentry->d_inode->i_nlink == 0) { + kdcore("%08lx: %08lx: no (share)", vma->vm_start, vma->vm_flags); + return 1; + } + + kdcore("%08lx: %08lx: no (share)", vma->vm_start, vma->vm_flags); + return 0; + } + +#ifdef CONFIG_MMU + /* If it hasn't been written to, don't write it out */ + if (!vma->anon_vma) { + kdcore("%08lx: %08lx: no (!anon)", vma->vm_start, vma->vm_flags); + return 0; + } +#endif + + kdcore("%08lx: %08lx: yes", vma->vm_start, vma->vm_flags); + return 1; +} + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static int notesize(struct memelfnote *en) +{ + int sz; + + sz = sizeof(struct elf_note); + sz += roundup(strlen(en->name) + 1, 4); + sz += roundup(en->datasz, 4); + + return sz; +} + +/* #define DEBUG */ + +#define DUMP_WRITE(addr, nr) \ + do { if (!dump_write(file, (addr), (nr))) return 0; } while(0) +#define DUMP_SEEK(off) \ + do { if (!dump_seek(file, (off))) return 0; } while(0) + +static int writenote(struct memelfnote *men, struct file *file) +{ + struct elf_note en; + + en.n_namesz = strlen(men->name) + 1; + en.n_descsz = men->datasz; + en.n_type = men->type; + + DUMP_WRITE(&en, sizeof(en)); + DUMP_WRITE(men->name, en.n_namesz); + /* XXX - cast from long long to long to avoid need for libgcc.a */ + DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ + DUMP_WRITE(men->data, men->datasz); + DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ + + return 1; +} +#undef DUMP_WRITE +#undef DUMP_SEEK + +#define DUMP_WRITE(addr, nr) \ + if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ + goto end_coredump; +#define DUMP_SEEK(off) \ + if (!dump_seek(file, (off))) \ + goto end_coredump; + +static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) +{ + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + + elf->e_type = ET_CORE; + elf->e_machine = ELF_ARCH; + elf->e_version = EV_CURRENT; + elf->e_entry = 0; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_shoff = 0; + elf->e_flags = ELF_FDPIC_CORE_EFLAGS; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = segs; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + return; +} + +static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) +{ + phdr->p_type = PT_NOTE; + phdr->p_offset = offset; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = sz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + return; +} + +static inline void fill_note(struct memelfnote *note, const char *name, int type, + unsigned int sz, void *data) +{ + note->name = name; + note->type = type; + note->datasz = sz; + note->data = data; + return; +} + +/* + * fill up all the fields in prstatus from the given task struct, except + * registers which need to be filled up seperately. + */ +static void fill_prstatus(struct elf_prstatus *prstatus, + struct task_struct *p, long signr) +{ + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; + prstatus->pr_sigpend = p->pending.signal.sig[0]; + prstatus->pr_sighold = p->blocked.sig[0]; + prstatus->pr_pid = p->pid; + prstatus->pr_ppid = p->parent->pid; + prstatus->pr_pgrp = process_group(p); + prstatus->pr_sid = p->signal->session; + if (thread_group_leader(p)) { + /* + * This is the record for the group leader. Add in the + * cumulative times of previous dead threads. This total + * won't include the time of each live thread whose state + * is included in the core dump. The final total reported + * to our parent process when it calls wait4 will include + * those sums as well as the little bit more time it takes + * this and each other thread to finish dying after the + * core dump synchronization phase. + */ + cputime_to_timeval(cputime_add(p->utime, p->signal->utime), + &prstatus->pr_utime); + cputime_to_timeval(cputime_add(p->stime, p->signal->stime), + &prstatus->pr_stime); + } else { + cputime_to_timeval(p->utime, &prstatus->pr_utime); + cputime_to_timeval(p->stime, &prstatus->pr_stime); + } + cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); + cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); + + prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; + prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; +} + +static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, + struct mm_struct *mm) +{ + unsigned int i, len; + + /* first copy the parameters from user space */ + memset(psinfo, 0, sizeof(struct elf_prpsinfo)); + + len = mm->arg_end - mm->arg_start; + if (len >= ELF_PRARGSZ) + len = ELF_PRARGSZ - 1; + if (copy_from_user(&psinfo->pr_psargs, + (const char __user *) mm->arg_start, len)) + return -EFAULT; + for (i = 0; i < len; i++) + if (psinfo->pr_psargs[i] == 0) + psinfo->pr_psargs[i] = ' '; + psinfo->pr_psargs[len] = 0; + + psinfo->pr_pid = p->pid; + psinfo->pr_ppid = p->parent->pid; + psinfo->pr_pgrp = process_group(p); + psinfo->pr_sid = p->signal->session; + + i = p->state ? ffz(~p->state) + 1 : 0; + psinfo->pr_state = i; + psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; + psinfo->pr_zomb = psinfo->pr_sname == 'Z'; + psinfo->pr_nice = task_nice(p); + psinfo->pr_flag = p->flags; + SET_UID(psinfo->pr_uid, p->uid); + SET_GID(psinfo->pr_gid, p->gid); + strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + + return 0; +} + +/* Here is the structure in which status of each thread is captured. */ +struct elf_thread_status +{ + struct list_head list; + struct elf_prstatus prstatus; /* NT_PRSTATUS */ + elf_fpregset_t fpu; /* NT_PRFPREG */ + struct task_struct *thread; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t xfpu; /* NT_PRXFPREG */ +#endif + struct memelfnote notes[3]; + int num_notes; +}; + +/* + * In order to add the specific thread information for the elf file format, + * we need to keep a linked list of every thread's pr_status and then create + * a single section for them in the final core file. + */ +static int elf_dump_thread_status(long signr, struct elf_thread_status *t) +{ + struct task_struct *p = t->thread; + int sz = 0; + + t->num_notes = 0; + + fill_prstatus(&t->prstatus, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &t->prstatus); + t->num_notes++; + sz += notesize(&t->notes[0]); + + t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, &t->fpu); + if (t->prstatus.pr_fpvalid) { + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + &t->fpu); + t->num_notes++; + sz += notesize(&t->notes[1]); + } + +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(p, &t->xfpu)) { + fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu), + &t->xfpu); + t->num_notes++; + sz += notesize(&t->notes[2]); + } +#endif + return sz; +} + +/* + * dump the segments for an MMU process + */ +#ifdef CONFIG_MMU +static int elf_fdpic_dump_segments(struct file *file, struct mm_struct *mm, + size_t *size, unsigned long *limit) +{ + struct vm_area_struct *vma; + + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + unsigned long addr; + + if (!maydump(vma)) + continue; + + for (addr = vma->vm_start; + addr < vma->vm_end; + addr += PAGE_SIZE + ) { + struct vm_area_struct *vma; + struct page *page; + + if (get_user_pages(current, current->mm, addr, 1, 0, 1, + &page, &vma) <= 0) { + DUMP_SEEK(file->f_pos + PAGE_SIZE); + } + else if (page == ZERO_PAGE(addr)) { + DUMP_SEEK(file->f_pos + PAGE_SIZE); + page_cache_release(page); + } + else { + void *kaddr; + + flush_cache_page(vma, addr, page_to_pfn(page)); + kaddr = kmap(page); + if ((*size += PAGE_SIZE) > *limit || + !dump_write(file, kaddr, PAGE_SIZE) + ) { + kunmap(page); + page_cache_release(page); + return -EIO; + } + kunmap(page); + page_cache_release(page); + } + } + } + + return 0; + +end_coredump: + return -EFBIG; +} +#endif + +/* + * dump the segments for a NOMMU process + */ +#ifndef CONFIG_MMU +static int elf_fdpic_dump_segments(struct file *file, struct mm_struct *mm, + size_t *size, unsigned long *limit) +{ + struct vm_list_struct *vml; + + for (vml = current->mm->context.vmlist; vml; vml = vml->next) { + struct vm_area_struct *vma = vml->vma; + + if (!maydump(vma)) + continue; + + if ((*size += PAGE_SIZE) > *limit) + return -EFBIG; + + if (!dump_write(file, (void *) vma->vm_start, + vma->vm_end - vma->vm_start)) + return -EIO; + } + + return 0; +} +#endif + +/* + * Actual dumper + * + * This is a two-pass process; first we find the offsets of the bits, + * and then they are actually written out. If we run out of core limit + * we just truncate. + */ +static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, + struct file *file) +{ +#define NUM_NOTES 6 + int has_dumped = 0; + mm_segment_t fs; + int segs; + size_t size = 0; + int i; + struct vm_area_struct *vma; + struct elfhdr *elf = NULL; + loff_t offset = 0, dataoff; + unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; + int numnote; + struct memelfnote *notes = NULL; + struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ + struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ + struct task_struct *g, *p; + LIST_HEAD(thread_list); + struct list_head *t; + elf_fpregset_t *fpu = NULL; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t *xfpu = NULL; +#endif + int thread_status_size = 0; +#ifndef CONFIG_MMU + struct vm_list_struct *vml; +#endif + elf_addr_t *auxv; + + /* + * We no longer stop all VM operations. + * + * This is because those proceses that could possibly change map_count + * or the mmap / vma pages are now blocked in do_exit on current + * finishing this core dump. + * + * Only ptrace can touch these memory addresses, but it doesn't change + * the map_count or the pages allocated. So no possibility of crashing + * exists while dumping the mm->vm_next areas to the core file. + */ + + /* alloc memory for large data structures: too large to be on stack */ + elf = kmalloc(sizeof(*elf), GFP_KERNEL); + if (!elf) + goto cleanup; + prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL); + if (!prstatus) + goto cleanup; + psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); + if (!psinfo) + goto cleanup; + notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL); + if (!notes) + goto cleanup; + fpu = kmalloc(sizeof(*fpu), GFP_KERNEL); + if (!fpu) + goto cleanup; +#ifdef ELF_CORE_COPY_XFPREGS + xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL); + if (!xfpu) + goto cleanup; +#endif + + if (signr) { + struct elf_thread_status *tmp; + read_lock(&tasklist_lock); + do_each_thread(g,p) + if (current->mm == p->mm && current != p) { + tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); + if (!tmp) { + read_unlock(&tasklist_lock); + goto cleanup; + } + INIT_LIST_HEAD(&tmp->list); + tmp->thread = p; + list_add(&tmp->list, &thread_list); + } + while_each_thread(g,p); + read_unlock(&tasklist_lock); + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp; + int sz; + + tmp = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(signr, tmp); + thread_status_size += sz; + } + } + + /* now collect the dump for the current */ + fill_prstatus(prstatus, current, signr); + elf_core_copy_regs(&prstatus->pr_reg, regs); + +#ifdef CONFIG_MMU + segs = current->mm->map_count; +#else + segs = 0; + for (vml = current->mm->context.vmlist; vml; vml = vml->next) + segs++; +#endif +#ifdef ELF_CORE_EXTRA_PHDRS + segs += ELF_CORE_EXTRA_PHDRS; +#endif + + /* Set up header */ + fill_elf_fdpic_header(elf, segs + 1); /* including notes section */ + + has_dumped = 1; + current->flags |= PF_DUMPCORE; + + /* + * Set up the notes in similar form to SVR4 core dumps made + * with info from their /proc. + */ + + fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus); + fill_psinfo(psinfo, current->group_leader, current->mm); + fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + + numnote = 2; + + auxv = (elf_addr_t *) current->mm->saved_auxv; + + i = 0; + do + i += 2; + while (auxv[i - 2] != AT_NULL); + fill_note(¬es[numnote++], "CORE", NT_AUXV, + i * sizeof(elf_addr_t), auxv); + + /* Try to dump the FPU. */ + if ((prstatus->pr_fpvalid = + elf_core_copy_task_fpregs(current, regs, fpu))) + fill_note(notes + numnote++, + "CORE", NT_PRFPREG, sizeof(*fpu), fpu); +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(current, xfpu)) + fill_note(notes + numnote++, + "LINUX", NT_PRXFPREG, sizeof(*xfpu), xfpu); +#endif + + fs = get_fs(); + set_fs(KERNEL_DS); + + DUMP_WRITE(elf, sizeof(*elf)); + offset += sizeof(*elf); /* Elf header */ + offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ + + /* Write notes phdr entry */ + { + struct elf_phdr phdr; + int sz = 0; + + for (i = 0; i < numnote; i++) + sz += notesize(notes + i); + + sz += thread_status_size; + + fill_elf_note_phdr(&phdr, sz, offset); + offset += sz; + DUMP_WRITE(&phdr, sizeof(phdr)); + } + + /* Page-align dumped data */ + dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); + + /* write program headers for segments dump */ + for ( +#ifdef CONFIG_MMU + vma = current->mm->mmap; vma; vma = vma->vm_next +#else + vml = current->mm->context.vmlist; vml; vml = vml->next +#endif + ) { + struct elf_phdr phdr; + size_t sz; + +#ifndef CONFIG_MMU + vma = vml->vma; +#endif + + sz = vma->vm_end - vma->vm_start; + + phdr.p_type = PT_LOAD; + phdr.p_offset = offset; + phdr.p_vaddr = vma->vm_start; + phdr.p_paddr = 0; + phdr.p_filesz = maydump(vma) ? sz : 0; + phdr.p_memsz = sz; + offset += phdr.p_filesz; + phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; + if (vma->vm_flags & VM_WRITE) + phdr.p_flags |= PF_W; + if (vma->vm_flags & VM_EXEC) + phdr.p_flags |= PF_X; + phdr.p_align = ELF_EXEC_PAGESIZE; + + DUMP_WRITE(&phdr, sizeof(phdr)); + } + +#ifdef ELF_CORE_WRITE_EXTRA_PHDRS + ELF_CORE_WRITE_EXTRA_PHDRS; +#endif + + /* write out the notes section */ + for (i = 0; i < numnote; i++) + if (!writenote(notes + i, file)) + goto end_coredump; + + /* write out the thread status notes section */ + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp = + list_entry(t, struct elf_thread_status, list); + + for (i = 0; i < tmp->num_notes; i++) + if (!writenote(&tmp->notes[i], file)) + goto end_coredump; + } + + DUMP_SEEK(dataoff); + + if (elf_fdpic_dump_segments(file, current->mm, &size, &limit) < 0) + goto end_coredump; + +#ifdef ELF_CORE_WRITE_EXTRA_DATA + ELF_CORE_WRITE_EXTRA_DATA; +#endif + + if (file->f_pos != offset) { + /* Sanity check */ + printk(KERN_WARNING + "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", + file->f_pos, offset); + } + +end_coredump: + set_fs(fs); + +cleanup: + while (!list_empty(&thread_list)) { + struct list_head *tmp = thread_list.next; + list_del(tmp); + kfree(list_entry(tmp, struct elf_thread_status, list)); + } + + kfree(elf); + kfree(prstatus); + kfree(psinfo); + kfree(notes); + kfree(fpu); +#ifdef ELF_CORE_COPY_XFPREGS + kfree(xfpu); +#endif + return has_dumped; +#undef NUM_NOTES +} + +#endif /* USE_ELF_CORE_DUMP */ -- cgit v1.2.3 From 92eb7a2f28d551acedeb5752263267a64b1f5ddf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Jul 2006 04:45:31 -0700 Subject: [PATCH] fix weird logic in alloc_fdtable() There's a fairly obvious infinite loop in there. Also, use roundup_pow_of_two() rather than open-coding stuff. Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 55f4e702256..3f356086061 100644 --- a/fs/file.c +++ b/fs/file.c @@ -240,13 +240,9 @@ static struct fdtable *alloc_fdtable(int nr) if (!fdt) goto out; - nfds = 8 * L1_CACHE_BYTES; - /* Expand to the max in easy steps */ - while (nfds <= nr) { - nfds = nfds * 2; - if (nfds > NR_OPEN) - nfds = NR_OPEN; - } + nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nfds)); + if (nfds > NR_OPEN) + nfds = NR_OPEN; new_openset = alloc_fdset(nfds); new_execset = alloc_fdset(nfds); -- cgit v1.2.3 From 36cf96f5e7c098731a1ad9d79694d6f591b18e7f Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 10 Jul 2006 04:45:33 -0700 Subject: [PATCH] Remove leftover ext3 acl declarations These functions no longer exist; remove their declarations. Signed-off-by: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/acl.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 92d50b53a93..0d1e6279cbf 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -62,9 +62,6 @@ extern int ext3_permission (struct inode *, int, struct nameidata *); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); -extern int init_ext3_acl(void); -extern void exit_ext3_acl(void); - #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include #define ext3_permission NULL -- cgit v1.2.3 From e2b209509ca33743864846aef2e1b2afc21f7915 Mon Sep 17 00:00:00 2001 From: Shankar Anand Date: Mon, 10 Jul 2006 04:45:44 -0700 Subject: [PATCH] knfsd: nfsd4: add per-operation server stats Add an nfs4 operations count array to nfsd_stats structure. The count is incremented in nfsd4_proc_compound() where all the operations are handled by the nfsv4 server. This count of individual nfsv4 operations is also entered into /proc filesystem. Signed-off-by: Shankar Anand Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4proc.c | 8 ++++++++ fs/nfsd/stats.c | 10 ++++++++++ 2 files changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b0e095ea0c0..ee4eff27aed 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -721,6 +721,12 @@ nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) return nfs_ok; } +static inline void nfsd4_increment_op_stats(u32 opnum) +{ + if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) + nfsdstats.nfs4_opcount[opnum]++; +} + /* * COMPOUND call. @@ -930,6 +936,8 @@ encode_op: /* XXX Ugh, we need to get rid of this kind of special case: */ if (op->opnum == OP_READ && op->u.read.rd_filp) fput(op->u.read.rd_filp); + + nfsd4_increment_op_stats(op->opnum); } out: diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 57265d56380..71944cddf68 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -72,6 +72,16 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) /* show my rpc info */ svc_seq_show(seq, &nfsd_svcstats); +#ifdef CONFIG_NFSD_V4 + /* Show count for individual nfsv4 operations */ + /* Writing operation numbers 0 1 2 also for maintaining uniformity */ + seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); + for (i = 0; i <= LAST_NFS4_OP; i++) + seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]); + + seq_putc(seq, '\n'); +#endif + return 0; } -- cgit v1.2.3 From d579091b4385e9386e244622d593fe064aa8e8e7 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Wed, 12 Jul 2006 09:03:05 -0700 Subject: [PATCH] fix fdset leakage When found, it is obvious. nfds calculated when allocating fdsets is rewritten by calculation of size of fdtable, and when we are unlucky, we try to free fdsets of wrong size. Found due to OpenVZ resource management (User Beancounters). Signed-off-by: Alexey Kuznetsov Signed-off-by: Kirill Korotaev Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 3f356086061..c8f1b0af8e0 100644 --- a/fs/file.c +++ b/fs/file.c @@ -273,11 +273,13 @@ static struct fdtable *alloc_fdtable(int nr) } while (nfds <= nr); new_fds = alloc_fd_array(nfds); if (!new_fds) - goto out; + goto out2; fdt->fd = new_fds; fdt->max_fds = nfds; fdt->free_files = NULL; return fdt; +out2: + nfds = fdt->max_fdset; out: if (new_openset) free_fdset(new_openset, nfds); -- cgit v1.2.3 From 232ba9dbd68bb084d5d90c511f207d18eae614da Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 12 Jul 2006 09:03:06 -0700 Subject: [PATCH] lockdep: annotate the sysfs i_mutex to be a separate class sysfs has a different i_mutex lock order behavior for i_mutex than the other filesystems; sysfs i_mutex is called in many places with subsystem locks held. At the same time, many of the VFS locking rules do not apply to sysfs at all (cross directory rename for example). To untangle this mess (which gives false positives in lockdep), we're giving sysfs inodes their own class for i_mutex. Signed-off-by: Arjan van de Ven Cc: Ingo Molnar Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sysfs/inode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 5e0e31cc46f..9889e54e1f1 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -109,6 +109,17 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_ctime = iattr->ia_ctime; } + +/* + * sysfs has a different i_mutex lock order behavior for i_mutex than other + * filesystems; sysfs i_mutex is called in many places with subsystem locks + * held. At the same time, many of the VFS locking rules do not apply to + * sysfs at all (cross directory rename for example). To untangle this mess + * (which gives false positives in lockdep), we're giving sysfs inodes their + * own class for i_mutex. + */ +static struct lock_class_key sysfs_inode_imutex_key; + struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd) { struct inode * inode = new_inode(sysfs_sb); @@ -118,6 +129,7 @@ struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd) inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; inode->i_op = &sysfs_inode_operations; + lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); if (sd->s_iattr) { /* sysfs_dirent has non-default attributes -- cgit v1.2.3 From 0635170b544b01b46a81b4ac5cff5020ab59d1fc Mon Sep 17 00:00:00 2001 From: "Adam B. Jerome" Date: Wed, 12 Jul 2006 09:03:07 -0700 Subject: [PATCH] /fs/proc/: 'larger than buffer size' memory accessed by clear_user() Address a potential 'larger than buffer size' memory access by clear_user(). Without this patch, this call to clear_user() can attempt to clear too many (tsz) bytes resulting in a wrong (-EFAULT) return code by read_kcore(). Signed-off-by: Adam B. Jerome Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 8d6d85d7400..6a984f64edd 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -382,7 +382,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) */ if (n) { if (clear_user(buffer + tsz - n, - tsz - n)) + n)) return -EFAULT; } } else { -- cgit v1.2.3 From a29b0b74e73b66674d20a170e463fe9032f2272a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 12 Jul 2006 09:03:08 -0700 Subject: [PATCH] alloc_fdtable() expansion fix We're supposed to go the next power of two if nfds==nr. Of `nr', not of `nfsd'. Spotted by Rene Scharfe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index c8f1b0af8e0..b3c6b82e6a9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -240,7 +240,7 @@ static struct fdtable *alloc_fdtable(int nr) if (!fdt) goto out; - nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nfds)); + nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nr + 1)); if (nfds > NR_OPEN) nfds = NR_OPEN; -- cgit v1.2.3 From 18b0bbd8ca6d3cb90425aa0d77b99a762c6d6de3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 14 Jul 2006 16:51:34 -0700 Subject: Fix nasty /proc vulnerability We have a bad interaction with both the kernel and user space being able to change some of the /proc file status. This fixes the most obvious part of it, but I expect we'll also make it harder for users to modify even their "own" files in /proc. Signed-off-by: Linus Torvalds --- fs/proc/base.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 243a94af042..0cb8f20d000 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1338,6 +1338,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) } else { inode->i_uid = 0; inode->i_gid = 0; + inode->i_mode = 0; } security_task_to_inode(task, inode); put_task_struct(task); -- cgit v1.2.3 From 9ee8ab9fbf21e6b87ad227cd46c0a4be41ab749b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 14 Jul 2006 21:48:03 -0700 Subject: Relax /proc fix a bit Clearign all of i_mode was a bit draconian. We only really care about S_ISUID/ISGID, after all. Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0cb8f20d000..474eae34506 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1338,8 +1338,8 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) } else { inode->i_uid = 0; inode->i_gid = 0; - inode->i_mode = 0; } + inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); put_task_struct(task); return 1; @@ -1390,6 +1390,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_uid = 0; inode->i_gid = 0; } + inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); put_task_struct(task); return 1; -- cgit v1.2.3 From de45921535bfc3b1f63b426c2a9739635f864283 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Fri, 14 Jul 2006 00:23:49 -0700 Subject: [PATCH] struct file leakage 2.6.16 leaks like hell. While testing, I found massive leakage (reproduced in openvz) in: *filp *size-4096 And 1 object leaks in *size-32 *size-64 *size-128 It is the fix for the first one. filp leaks in the bowels of namei.c. Seems, size-4096 is file table leaking in expand_fdtables. I have no idea what are the rest and why they show only accompanying another leaks. Some debugging structs? [akpm@osdl.org, Trond: remove the IS_ERR() check] Signed-off-by: Alexey Kuznetsov Cc: Kirill Korotaev Cc: Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index c9750d755af..e01070d7bf5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1712,8 +1712,14 @@ do_link: if (error) goto exit_dput; error = __do_follow_link(&path, nd); - if (error) + if (error) { + /* Does someone understand code flow here? Or it is only + * me so stupid? Anathema to whoever designed this non-sense + * with "intent.open". + */ + release_open_intent(nd); return error; + } nd->flags &= ~LOOKUP_PARENT; if (nd->last_type == LAST_BIND) goto ok; -- cgit v1.2.3 From 6fbe82a952790c634ea6035c223a01a81377daf1 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 14 Jul 2006 00:24:22 -0700 Subject: [PATCH] reiserfs: fix handling of device names with /'s in them On systems with block devices containing a slash (virtual dasd, cciss, etc), reiserfs will fail to initialize /proc/fs/reiserfs/ due to it being interpreted as a subdirectory. The generic block device code changes the / to ! for use in the sysfs tree. This patch uses that convention. Tested by making dm devices use dm/ rather than dm- [akpm@osdl.org: name variables consistently] Signed-off-by: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/procfs.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 5d8a8cfebc7..c533ec1bcae 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -492,9 +492,17 @@ static void add_file(struct super_block *sb, char *name, int reiserfs_proc_info_init(struct super_block *sb) { + char b[BDEVNAME_SIZE]; + char *s; + + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; + spin_lock_init(&__PINFO(sb).lock); - REISERFS_SB(sb)->procdir = - proc_mkdir(reiserfs_bdevname(sb), proc_info_root); + REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root); if (REISERFS_SB(sb)->procdir) { REISERFS_SB(sb)->procdir->owner = THIS_MODULE; REISERFS_SB(sb)->procdir->data = sb; @@ -508,13 +516,22 @@ int reiserfs_proc_info_init(struct super_block *sb) return 0; } reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s", - proc_info_root_name, reiserfs_bdevname(sb)); + proc_info_root_name, b); return 1; } int reiserfs_proc_info_done(struct super_block *sb) { struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; + char b[BDEVNAME_SIZE]; + char *s; + + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; + if (de) { remove_proc_entry("journal", de); remove_proc_entry("oidmap", de); @@ -528,7 +545,7 @@ int reiserfs_proc_info_done(struct super_block *sb) __PINFO(sb).exiting = 1; spin_unlock(&__PINFO(sb).lock); if (proc_info_root) { - remove_proc_entry(reiserfs_bdevname(sb), proc_info_root); + remove_proc_entry(b, proc_info_root); REISERFS_SB(sb)->procdir = NULL; } return 0; -- cgit v1.2.3 From d247e2c661f28a21e5f9a8d672e1e88a7c1c5d4a Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Fri, 14 Jul 2006 00:24:23 -0700 Subject: [PATCH] add function documentation for register_chrdev() Documentation for register_chrdev() was missing completely. [akpm@osdl.org: kerneldocification] Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/char_dev.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/char_dev.c b/fs/char_dev.c index a4cbc6706ef..3483d3cf808 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -182,6 +182,28 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, return 0; } +/** + * register_chrdev() - Register a major number for character devices. + * @major: major device number or 0 for dynamic allocation + * @name: name of this range of devices + * @fops: file operations associated with this devices + * + * If @major == 0 this functions will dynamically allocate a major and return + * its number. + * + * If @major > 0 this function will attempt to reserve a device with the given + * major number and will return zero on success. + * + * Returns a -ve errno on failure. + * + * The name of this device has nothing to do with the name of the device in + * /dev. It only helps to keep track of the different owners of devices. If + * your module name has only one type of devices it's ok to use e.g. the name + * of the module here. + * + * This function registers a range of 256 minor numbers. The first minor number + * is 0. + */ int register_chrdev(unsigned int major, const char *name, const struct file_operations *fops) { -- cgit v1.2.3 From 25890454667b3295f67b3372352be90705f8667c Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:43 -0700 Subject: [PATCH] per-task-delay-accounting: /proc export of aggregated block I/O delays Export I/O delays seen by a task through /proc//stats for use in top etc. Note that delays for I/O done for swapping in pages (swapin I/O) is clubbed together with all other I/O here (this is not the case in the netlink interface where the swapin I/O is kept distinct) [akpm@osdl.org: printk warning fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 7495d3e2077..0b615d62a15 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -411,7 +412,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %llu\n", task->pid, tcomm, state, @@ -455,7 +456,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) task->exit_signal, task_cpu(task), task->rt_priority, - task->policy); + task->policy, + (unsigned long long)delayacct_blkio_ticks(task)); if(mm) mmput(mm); return res; -- cgit v1.2.3 From 92d032855e64834283de5acfb0463232e0ab128e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Jul 2006 12:20:05 -0700 Subject: Mark /proc MS_NOSUID and MS_NOEXEC Not that we really need this any more, but at the same time there's no reason not to do this. Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6dcef089e18..49dfb2ab783 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -192,7 +192,7 @@ int proc_fill_super(struct super_block *s, void *data, int silent) { struct inode * root_inode; - s->s_flags |= MS_NODIRATIME; + s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; -- cgit v1.2.3 From 6d76fa58b050044994fe25f8753b8023f2b36737 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Jul 2006 12:26:45 -0700 Subject: Don't allow chmod() on the /proc// files This just turns off chmod() on the /proc// files, since there is no good reason to allow it, and had we disallowed it originally, the nasty /proc race exploit wouldn't have been possible. The other patches already fixed the problem chmod() could cause, so this is really just some final mop-up.. This particular version is based off a patch by Eugene and Marcel which had much better naming than my original equivalent one. Signed-off-by: Eugene Teo Signed-off-by: Marcel Holtmann Signed-off-by: Linus Torvalds --- fs/proc/base.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 474eae34506..fe8d55fb17c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -551,6 +551,27 @@ static int proc_fd_access_allowed(struct inode *inode) return allowed; } +static int proc_setattr(struct dentry *dentry, struct iattr *attr) +{ + int error; + struct inode *inode = dentry->d_inode; + + if (attr->ia_valid & ATTR_MODE) + return -EPERM; + + error = inode_change_ok(inode, attr); + if (!error) { + error = security_inode_setattr(dentry, attr); + if (!error) + error = inode_setattr(inode, attr); + } + return error; +} + +static struct inode_operations proc_def_inode_operations = { + .setattr = proc_setattr, +}; + extern struct seq_operations mounts_op; struct proc_mounts { struct seq_file m; @@ -1111,7 +1132,8 @@ out: static struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, - .follow_link = proc_pid_follow_link + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, }; static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) @@ -1285,6 +1307,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st ei = PROC_I(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_ino = fake_ino(task->pid, ino); + inode->i_op = &proc_def_inode_operations; /* * grab the reference to task. @@ -1529,11 +1552,13 @@ static struct file_operations proc_task_operations = { */ static struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, + .setattr = proc_setattr, }; static struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, + .setattr = proc_setattr, }; #ifdef CONFIG_SECURITY @@ -1847,11 +1872,13 @@ static struct file_operations proc_tid_base_operations = { static struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, + .setattr = proc_setattr, }; static struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, .getattr = pid_getattr, + .setattr = proc_setattr, }; #ifdef CONFIG_SECURITY @@ -1894,11 +1921,13 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir, static struct inode_operations proc_tgid_attr_inode_operations = { .lookup = proc_tgid_attr_lookup, .getattr = pid_getattr, + .setattr = proc_setattr, }; static struct inode_operations proc_tid_attr_inode_operations = { .lookup = proc_tid_attr_lookup, .getattr = pid_getattr, + .setattr = proc_setattr, }; #endif -- cgit v1.2.3