diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 5 | ||||
-rw-r--r-- | kernel/acct.c | 30 | ||||
-rw-r--r-- | kernel/compat.c | 2 | ||||
-rw-r--r-- | kernel/cpuset.c | 8 | ||||
-rw-r--r-- | kernel/exit.c | 12 | ||||
-rw-r--r-- | kernel/fork.c | 82 | ||||
-rw-r--r-- | kernel/futex.c | 2 | ||||
-rw-r--r-- | kernel/kallsyms.c | 1 | ||||
-rw-r--r-- | kernel/kmod.c | 62 | ||||
-rw-r--r-- | kernel/kprobes.c | 53 | ||||
-rw-r--r-- | kernel/latency.c | 279 | ||||
-rw-r--r-- | kernel/lockdep.c | 6 | ||||
-rw-r--r-- | kernel/module.c | 37 | ||||
-rw-r--r-- | kernel/nsproxy.c | 139 | ||||
-rw-r--r-- | kernel/pid.c | 111 | ||||
-rw-r--r-- | kernel/power/snapshot.c | 10 | ||||
-rw-r--r-- | kernel/sched.c | 5 | ||||
-rw-r--r-- | kernel/signal.c | 65 | ||||
-rw-r--r-- | kernel/spinlock.c | 4 | ||||
-rw-r--r-- | kernel/sys.c | 102 | ||||
-rw-r--r-- | kernel/sysctl.c | 352 | ||||
-rw-r--r-- | kernel/taskstats.c | 10 | ||||
-rw-r--r-- | kernel/time.c | 173 | ||||
-rw-r--r-- | kernel/time/Makefile | 2 | ||||
-rw-r--r-- | kernel/time/ntp.c | 350 | ||||
-rw-r--r-- | kernel/timer.c | 230 | ||||
-rw-r--r-- | kernel/tsacct.c | 124 | ||||
-rw-r--r-- | kernel/utsname.c | 95 |
28 files changed, 1685 insertions, 666 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index d62ec66c1af..d948ca12acf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o + hrtimer.o rwsem.o latency.o nsproxy.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ @@ -48,8 +48,9 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o -obj-$(CONFIG_TASKSTATS) += taskstats.o +obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index f4330acead4..0aad5ca36a8 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -602,33 +602,3 @@ void acct_process(void) do_acct_process(file); fput(file); } - - -/** - * acct_update_integrals - update mm integral fields in task_struct - * @tsk: task_struct for accounting - */ -void acct_update_integrals(struct task_struct *tsk) -{ - if (likely(tsk->mm)) { - long delta = - cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd; - - if (delta == 0) - return; - tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - } -} - -/** - * acct_clear_integrals - clear the mm integral fields in task_struct - * @tsk: task_struct whose accounting fields are cleared - */ -void acct_clear_integrals(struct task_struct *tsk) -{ - tsk->acct_stimexpd = 0; - tsk->acct_rss_mem1 = 0; - tsk->acct_vm_mem1 = 0; -} diff --git a/kernel/compat.c b/kernel/compat.c index b4fbd838cd7..75573e5d27b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,8 +26,6 @@ #include <asm/uaccess.h> -extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); - int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8c3c400cce9..9d850ae13b1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -377,7 +377,7 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directories start off with i_nlink == 2 (for "." entry) */ - inode->i_nlink++; + inc_nlink(inode); } else { return -ENOMEM; } @@ -1565,7 +1565,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode) inode->i_fop = &simple_dir_operations; /* start off with i_nlink == 2 (for "." entry) */ - inode->i_nlink++; + inc_nlink(inode); } else if (S_ISREG(mode)) { inode->i_size = 0; inode->i_fop = &cpuset_file_operations; @@ -1598,7 +1598,7 @@ static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode) error = cpuset_create_file(dentry, S_IFDIR | mode); if (!error) { dentry->d_fsdata = cs; - parent->d_inode->i_nlink++; + inc_nlink(parent->d_inode); cs->dentry = dentry; } dput(dentry); @@ -2033,7 +2033,7 @@ int __init cpuset_init(void) } root = cpuset_mount->mnt_sb->s_root; root->d_fsdata = &top_cpuset; - root->d_inode->i_nlink++; + inc_nlink(root->d_inode); top_cpuset.dentry = root; root->d_inode->i_op = &cpuset_dir_inode_operations; number_of_cpusets = 1; diff --git a/kernel/exit.c b/kernel/exit.c index c189de2927a..f250a5e3e28 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -18,8 +18,10 @@ #include <linux/security.h> #include <linux/cpu.h> #include <linux/acct.h> +#include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/binfmts.h> +#include <linux/nsproxy.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> @@ -396,9 +398,11 @@ void daemonize(const char *name, ...) fs = init_task.fs; current->fs = fs; atomic_inc(&fs->count); - exit_namespace(current); - current->namespace = init_task.namespace; - get_namespace(current->namespace); + + exit_task_namespaces(current); + current->nsproxy = init_task.nsproxy; + get_task_namespaces(current); + exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); @@ -916,7 +920,6 @@ fastcall NORET_TYPE void do_exit(long code) exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); - exit_namespace(tsk); exit_thread(); cpuset_exit(tsk); exit_keys(tsk); @@ -931,6 +934,7 @@ fastcall NORET_TYPE void do_exit(long code) tsk->exit_code = code; proc_exit_connector(tsk); exit_notify(tsk); + exit_task_namespaces(tsk); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; diff --git a/kernel/fork.c b/kernel/fork.c index 1c999f3e0b4..7dc6140baac 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -27,6 +27,7 @@ #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/fs.h> +#include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -42,6 +43,7 @@ #include <linux/profile.h> #include <linux/rmap.h> #include <linux/acct.h> +#include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> @@ -1115,11 +1117,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_signal; if ((retval = copy_keys(clone_flags, p))) goto bad_fork_cleanup_mm; - if ((retval = copy_namespace(clone_flags, p))) + if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* @@ -1211,7 +1213,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; } if (clone_flags & CLONE_THREAD) { @@ -1259,8 +1261,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, proc_fork_connector(p); return p; -bad_fork_cleanup_namespace: - exit_namespace(p); +bad_fork_cleanup_namespaces: + exit_task_namespaces(p); bad_fork_cleanup_keys: exit_keys(p); bad_fork_cleanup_mm: @@ -1513,10 +1515,9 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) */ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) { - struct namespace *ns = current->namespace; + struct namespace *ns = current->nsproxy->namespace; - if ((unshare_flags & CLONE_NEWNS) && - (ns && atomic_read(&ns->count) > 1)) { + if ((unshare_flags & CLONE_NEWNS) && ns) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1588,6 +1589,16 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n return 0; } +#ifndef CONFIG_IPC_NS +static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) +{ + if (flags & CLONE_NEWIPC) + return -EINVAL; + + return 0; +} +#endif + /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* @@ -1605,13 +1616,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; + struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; + struct uts_namespace *uts, *new_uts = NULL; + struct ipc_namespace *ipc, *new_ipc = NULL; check_unshare_flags(&unshare_flags); /* Return -EINVAL for all unsupported flags */ err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM)) + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC)) goto bad_unshare_out; if ((err = unshare_thread(unshare_flags))) @@ -1628,11 +1643,30 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_cleanup_vm; if ((err = unshare_semundo(unshare_flags, &new_ulist))) goto bad_unshare_cleanup_fd; + if ((err = unshare_utsname(unshare_flags, &new_uts))) + goto bad_unshare_cleanup_semundo; + if ((err = unshare_ipcs(unshare_flags, &new_ipc))) + goto bad_unshare_cleanup_uts; + + if (new_ns || new_uts || new_ipc) { + old_nsproxy = current->nsproxy; + new_nsproxy = dup_namespaces(old_nsproxy); + if (!new_nsproxy) { + err = -ENOMEM; + goto bad_unshare_cleanup_ipc; + } + } - if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) { + if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || + new_uts || new_ipc) { task_lock(current); + if (new_nsproxy) { + current->nsproxy = new_nsproxy; + new_nsproxy = old_nsproxy; + } + if (new_fs) { fs = current->fs; current->fs = new_fs; @@ -1640,8 +1674,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } if (new_ns) { - ns = current->namespace; - current->namespace = new_ns; + ns = current->nsproxy->namespace; + current->nsproxy->namespace = new_ns; new_ns = ns; } @@ -1666,9 +1700,33 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fd = fd; } + if (new_uts) { + uts = current->nsproxy->uts_ns; + current->nsproxy->uts_ns = new_uts; + new_uts = uts; + } + + if (new_ipc) { + ipc = current->nsproxy->ipc_ns; + current->nsproxy->ipc_ns = new_ipc; + new_ipc = ipc; + } + task_unlock(current); } + if (new_nsproxy) + put_nsproxy(new_nsproxy); + +bad_unshare_cleanup_ipc: + if (new_ipc) + put_ipc_ns(new_ipc); + +bad_unshare_cleanup_uts: + if (new_uts) + put_uts_ns(new_uts); + +bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/kernel/futex.c b/kernel/futex.c index 4b6770e9806..4aaf91951a4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1527,7 +1527,7 @@ static int futex_fd(u32 __user *uaddr, int signal) filp->f_mapping = filp->f_dentry->d_inode->i_mapping; if (signal) { - err = f_setown(filp, current->pid, 1); + err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); if (err < 0) { goto error; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index ab16a5a4cfe..342bca62c49 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -154,7 +154,6 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } -EXPORT_SYMBOL_GPL(kallsyms_lookup_name); /* * Lookup an address diff --git a/kernel/kmod.c b/kernel/kmod.c index 842f8015d7f..bb4e29d924e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -18,8 +18,6 @@ call_usermodehelper wait flag, and remove exec_usermodehelper. Rusty Russell <rusty@rustcorp.com.au> Jan 2003 */ -#define __KERNEL_SYSCALLS__ - #include <linux/module.h> #include <linux/sched.h> #include <linux/syscalls.h> @@ -35,6 +33,7 @@ #include <linux/mount.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/resource.h> #include <asm/uaccess.h> extern int max_threads; @@ -122,6 +121,7 @@ struct subprocess_info { struct key *ring; int wait; int retval; + struct file *stdin; }; /* @@ -145,12 +145,30 @@ static int ____call_usermodehelper(void *data) key_put(old_session); + /* Install input pipe when needed */ + if (sub_info->stdin) { + struct files_struct *f = current->files; + struct fdtable *fdt; + /* no races because files should be private here */ + sys_close(0); + fd_install(0, sub_info->stdin); + spin_lock(&f->file_lock); + fdt = files_fdtable(f); + FD_SET(0, fdt->open_fds); + FD_CLR(0, fdt->close_on_exec); + spin_unlock(&f->file_lock); + + /* and disallow core files too */ + current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0}; + } + /* We can run anywhere, unlike our parent keventd(). */ set_cpus_allowed(current, CPU_MASK_ALL); retval = -EPERM; if (current->fs->root) - retval = execve(sub_info->path, sub_info->argv,sub_info->envp); + retval = kernel_execve(sub_info->path, + sub_info->argv, sub_info->envp); /* Exec failed? */ sub_info->retval = retval; @@ -268,6 +286,44 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, } EXPORT_SYMBOL(call_usermodehelper_keys); +int call_usermodehelper_pipe(char *path, char **argv, char **envp, + struct file **filp) +{ + DECLARE_COMPLETION(done); + struct subprocess_info sub_info = { + .complete = &done, + .path = path, + .argv = argv, + .envp = envp, + .retval = 0, + }; + struct file *f; + DECLARE_WORK(work, __call_usermodehelper, &sub_info); + + if (!khelper_wq) + return -EBUSY; + + if (path[0] == '\0') + return 0; + + f = create_write_pipe(); + if (!f) + return -ENOMEM; + *filp = f; + + f = create_read_pipe(f); + if (!f) { + free_write_pipe(*filp); + return -ENOMEM; + } + sub_info.stdin = f; + + queue_work(khelper_wq, &work); + wait_for_completion(&done); + return sub_info.retval; +} +EXPORT_SYMBOL(call_usermodehelper_pipe); + void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3f57dfdc8f9..610c837ad9e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -37,6 +37,7 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/moduleloader.h> +#include <linux/kallsyms.h> #include <asm-generic/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> @@ -45,6 +46,16 @@ #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) + +/* + * Some oddball architectures like 64bit powerpc have function descriptors + * so this must be overridable. + */ +#ifndef kprobe_lookup_name +#define kprobe_lookup_name(name, addr) \ + addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) +#endif + static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; static atomic_t kprobe_count; @@ -308,7 +319,8 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri) } /* Called with kretprobe_lock held */ -void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) +void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, + struct hlist_head *head) { /* remove rp inst off the rprobe_inst_table */ hlist_del(&ri->hlist); @@ -320,7 +332,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) hlist_add_head(&ri->uflist, &ri->rp->free_instances); } else /* Unregistering */ - kfree(ri); + hlist_add_head(&ri->hlist, head); } struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) @@ -336,18 +348,24 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) */ void __kprobes kprobe_flush_task(struct task_struct *tk) { - struct kretprobe_instance *ri; - struct hlist_head *head; + struct kretprobe_instance *ri; + struct hlist_head *head, empty_rp; struct hlist_node *node, *tmp; unsigned long flags = 0; + INIT_HLIST_HEAD(&empty_rp); spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(tk); - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task == tk) - recycle_rp_inst(ri); - } + head = kretprobe_inst_table_head(tk); + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task == tk) + recycle_rp_inst(ri, &empty_rp); + } spin_unlock_irqrestore(&kretprobe_lock, flags); + + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } } static inline void free_rp_inst(struct kretprobe *rp) @@ -447,6 +465,21 @@ static int __kprobes __register_kprobe(struct kprobe *p, struct kprobe *old_p; struct module *probed_mod; + /* + * If we have a symbol_name argument look it up, + * and add it to the address. That way the addr + * field can either be global or relative to a symbol. + */ + if (p->symbol_name) { + if (p->addr) + return -EINVAL; + kprobe_lookup_name(p->symbol_name, p->addr); + } + + if (!p->addr) + return -EINVAL; + p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); + if ((!kernel_text_address((unsigned long) p->addr)) || in_kprobes_functions((unsigned long) p->addr)) return -EINVAL; @@ -488,7 +521,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, (ARCH_INACTIVE_KPROBE_COUNT + 1)) register_page_fault_notifier(&kprobe_page_fault_nb); - arch_arm_kprobe(p); + arch_arm_kprobe(p); out: mutex_unlock(&kprobe_mutex); diff --git a/kernel/latency.c b/kernel/latency.c new file mode 100644 index 00000000000..258f2555abb --- /dev/null +++ b/kernel/latency.c @@ -0,0 +1,279 @@ +/* + * latency.c: Explicit system-wide latency-expectation infrastructure + * + * The purpose of this infrastructure is to allow device drivers to set + * latency constraint they have and to collect and summarize these + * expectations globally. The cummulated result can then be used by + * power management and similar users to make decisions that have + * tradoffs with a latency component. + * + * An example user of this are the x86 C-states; each higher C state saves + * more power, but has a higher exit latency. For the idle loop power + * code to make a good decision which C-state to use, information about + * acceptable latencies is required. + * + * An example announcer of latency is an audio driver that knowns it + * will get an interrupt when the hardware has 200 usec of samples + * left in the DMA buffer; in that case the driver can set a latency + * constraint of, say, 150 usec. + * + * Multiple drivers can each announce their maximum accepted latency, + * to keep these appart, a string based identifier is used. + * + * + * (C) Copyright 2006 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/latency.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <asm/atomic.h> + +struct latency_info { + struct list_head list; + int usecs; + char *identifier; +}; + +/* + * locking rule: all modifications to current_max_latency and + * latency_list need to be done while holding the latency_lock. + * latency_lock needs to be taken _irqsave. + */ +static atomic_t current_max_latency; +static DEFINE_SPINLOCK(latency_lock); + +static LIST_HEAD(latency_list); +static BLOCKING_NOTIFIER_HEAD(latency_notifier); + +/* + * This function returns the maximum latency allowed, which + * happens to be the minimum of all maximum latencies on the + * list. + */ +static int __find_max_latency(void) +{ + int min = INFINITE_LATENCY; + struct latency_info *info; + + list_for_each_entry(info, &latency_list, list) { + if (info->usecs < min) + min = info->usecs; + } + return min; +} + +/** + * set_acceptable_latency - sets the maximum latency acceptable + * @identifier: string that identifies this driver + * @usecs: maximum acceptable latency for this driver + * + * This function informs the kernel that this device(driver) + * can accept at most usecs latency. This setting is used for + * power management and similar tradeoffs. + * + * This function sleeps and can only be called from process + * context. + * Calling this function with an existing identifier is valid + * and will cause the existing latency setting to be changed. + */ +void set_acceptable_latency(char *identifier, int usecs) +{ + struct latency_info *info, *iter; + unsigned long flags; + int found_old = 0; + + info = kzalloc(sizeof(struct latency_info), GFP_KERNEL); + if (!info) + return; + info->usecs = usecs; + info->identifier = kstrdup(identifier, GFP_KERNEL); + if (!info->identifier) + goto free_info; + + spin_lock_irqsave(&latency_lock, flags); + list_for_each_entry(iter, &latency_list, list) { + if (strcmp(iter->identifier, identifier)==0) { + found_old = 1; + iter->usecs = usecs; + break; + } + } + if (!found_old) + list_add(&info->list, &latency_list); + + if (usecs < atomic_read(¤t_max_latency)) + atomic_set(¤t_max_latency, usecs); + + spin_unlock_irqrestore(&latency_lock, flags); + + blocking_notifier_call_chain(&latency_notifier, + atomic_read(¤t_max_latency), NULL); + + /* + * if we inserted the new one, we're done; otherwise there was + * an existing one so we need to free the redundant data + */ + if (!found_old) + return; + + kfree(info->identifier); +free_info: + kfree(info); +} +EXPORT_SYMBOL_GPL(set_acceptable_latency); + +/** + * modify_acceptable_latency - changes the maximum latency acceptable + * @identifier: string that identifies this driver + * @usecs: maximum acceptable latency for this driver + * + * This function informs the kernel that this device(driver) + * can accept at most usecs latency. This setting is used for + * power management and similar tradeoffs. + * + * This function does not sleep and can be called in any context. + * Trying to use a non-existing identifier silently gets ignored. + * + * Due to the atomic nature of this function, the modified latency + * value will only be used for future decisions; past decisions + * can still lead to longer latencies in the near future. + */ +void modify_acceptable_latency(char *identifier, int usecs) +{ + struct latency_info *iter; + unsigned long flags; + + spin_lock_irqsave(&latency_lock, flags); + list_for_each_entry(iter, &latency_list, list) { + if (strcmp(iter->identifier, identifier) == 0) { + iter->usecs = usecs; + break; + } + } + if (usecs < atomic_read(¤t_max_latency)) + atomic_set(¤t_max_latency, usecs); + spin_unlock_irqrestore(&latency_lock, flags); +} +EXPORT_SYMBOL_GPL(modify_acceptable_latency); + +/** + * remove_acceptable_latency - removes the maximum latency acceptable + * @identifier: string that identifies this driver + * + * This function removes a previously set maximum latency setting + * for the driver and frees up any resources associated with the + * bookkeeping needed for this. + * + * This function does not sleep and can be called in any context. + * Trying to use a non-existing identifier silently gets ignored. + */ +void remove_acceptable_latency(char *identifier) +{ + unsigned long flags; + int newmax = 0; + struct latency_info *iter, *temp; + + spin_lock_irqsave(&latency_lock, flags); + + list_for_each_entry_safe(iter, temp, &latency_list, list) { + if (strcmp(iter->identifier, identifier) == 0) { + list_del(&iter->list); + newmax = iter->usecs; + kfree(iter->identifier); + kfree(iter); + break; + } + } + + /* If we just deleted the system wide value, we need to + * recalculate with a full search + */ + if (newmax == atomic_read(¤t_max_latency)) { + newmax = __find_max_latency(); + atomic_set(¤t_max_latency, newmax); + } + spin_unlock_irqrestore(&latency_lock, flags); +} +EXPORT_SYMBOL_GPL(remove_acceptable_latency); + +/** + * system_latency_constraint - queries the system wide latency maximum + * + * This function returns the system wide maximum latency in + * microseconds. + * + * This function does not sleep and can be called in any context. + */ +int system_latency_constraint(void) +{ + return atomic_read(¤t_max_latency); +} +EXPORT_SYMBOL_GPL(system_latency_constraint); + +/** + * synchronize_acceptable_latency - recalculates all latency decisions + * + * This function will cause a callback to various kernel pieces that + * will make those pieces rethink their latency decisions. This implies + * that if there are overlong latencies in hardware state already, those + * latencies get taken right now. When this call completes no overlong + * latency decisions should be active anymore. + * + * Typical usecase of this is after a modify_acceptable_latency() call, + * which in itself is non-blocking and non-synchronizing. + * + * This function blocks and should not be called with locks held. + */ + +void synchronize_acceptable_latency(void) +{ + blocking_notifier_call_chain(&latency_notifier, + atomic_read(¤t_max_latency), NULL); +} +EXPORT_SYMBOL_GPL(synchronize_acceptable_latency); + +/* + * Latency notifier: this notifier gets called when a non-atomic new + * latency value gets set. The expectation nof the caller of the + * non-atomic set is that when the call returns, future latencies + * are within bounds, so the functions on the notifier list are + * expected to take the overlong latencies immediately, inside the + * callback, and not make a overlong latency decision anymore. + * + * The callback gets called when the new latency value is made + * active so system_latency_constraint() returns the new latency. + */ +int register_latency_notifier(struct notifier_block * nb) +{ + return blocking_notifier_chain_register(&latency_notifier, nb); +} +EXPORT_SYMBOL_GPL(register_latency_notifier); + +int unregister_latency_notifier(struct notifier_block * nb) +{ + return blocking_notifier_chain_unregister(&latency_notifier, nb); +} +EXPORT_SYMBOL_GPL(unregister_latency_notifier); + +static __init int latency_init(void) +{ + atomic_set(¤t_max_latency, INFINITE_LATENCY); + /* + * we don't want by default to have longer latencies than 2 ticks, + * since that would cause lost ticks + */ + set_acceptable_latency("kernel", 2*1000000/HZ); + return 0; +} + +module_init(latency_init); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e596525669e..4c055346100 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -518,9 +518,9 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) static void print_kernel_version(void) { - printk("%s %.*s\n", system_utsname.release, - (int)strcspn(system_utsname.version, " "), - system_utsname.version); + printk("%s %.*s\n", init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); } /* diff --git a/kernel/module.c b/kernel/module.c index 05625d5dc75..7c77a0a9275 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -851,6 +851,7 @@ static int check_version(Elf_Shdr *sechdrs, printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); add_taint(TAINT_FORCED_MODULE); + mod->taints |= TAINT_FORCED_MODULE; } return 1; } @@ -1339,6 +1340,7 @@ static void set_license(struct module *mod, const char *license) printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", mod->name, license); add_taint(TAINT_PROPRIETARY_MODULE); + mod->taints |= TAINT_PROPRIETARY_MODULE; } } @@ -1618,6 +1620,7 @@ static struct module *load_module(void __user *umod, /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { add_taint(TAINT_FORCED_MODULE); + mod->taints |= TAINT_FORCED_MODULE; printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", mod->name); } else if (!same_magic(modmagic, vermagic)) { @@ -1711,10 +1714,14 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); - if (strcmp(mod->name, "ndiswrapper") == 0) + if (strcmp(mod->name, "ndiswrapper") == 0) { add_taint(TAINT_PROPRIETARY_MODULE); - if (strcmp(mod->name, "driverloader") == 0) + mod->taints |= TAINT_PROPRIETARY_MODULE; + } + if (strcmp(mod->name, "driverloader") == 0) { add_taint(TAINT_PROPRIETARY_MODULE); + mod->taints |= TAINT_PROPRIETARY_MODULE; + } /* Set up MODINFO_ATTR fields */ setup_modinfo(mod, sechdrs, infoindex); @@ -1760,6 +1767,7 @@ static struct module *load_module(void __user *umod, printk(KERN_WARNING "%s: No versions for exported symbols." " Tainting kernel.\n", mod->name); add_taint(TAINT_FORCED_MODULE); + mod->taints |= TAINT_FORCED_MODULE; } #endif @@ -2226,14 +2234,37 @@ struct module *module_text_address(unsigned long addr) return mod; } +static char *taint_flags(unsigned int taints, char *buf) +{ + *buf = '\0'; + if (taints) { + int bx; + + buf[0] = '('; + bx = 1; + if (taints & TAINT_PROPRIETARY_MODULE) + buf[bx++] = 'P'; + if (taints & TAINT_FORCED_MODULE) + buf[bx++] = 'F'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + buf[bx] = ')'; + } + return buf; +} + /* Don't grab lock, we're oopsing. */ void print_modules(void) { struct module *mod; + char buf[8]; printk("Modules linked in:"); list_for_each_entry(mod, &modules, list) - printk(" %s", mod->name); + printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); printk("\n"); } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c new file mode 100644 index 00000000000..6ebdb82a0ce --- /dev/null +++ b/kernel/nsproxy.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2006 IBM Corporation + * + * Author: Serge Hallyn <serue@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * Jun 2006 - namespaces support + * OpenVZ, SWsoft Inc. + * Pavel Emelianov <xemul@openvz.org> + */ + +#include <linux/module.h> +#include <linux/version.h> +#include <linux/nsproxy.h> +#include <linux/init_task.h> +#include <linux/namespace.h> +#include <linux/utsname.h> + +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); + +static inline void get_nsproxy(struct nsproxy *ns) +{ + atomic_inc(&ns->count); +} + +void get_task_namespaces(struct task_struct *tsk) +{ + struct nsproxy *ns = tsk->nsproxy; + if (ns) { + get_nsproxy(ns); + } +} + +/* + * creates a copy of "orig" with refcount 1. + * This does not grab references to the contained namespaces, + * so that needs to be done by dup_namespaces. + */ +static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) +{ + struct nsproxy *ns; + + ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL); + if (ns) { + memcpy(ns, orig, sizeof(struct nsproxy)); + atomic_set(&ns->count, 1); + } + return ns; +} + +/* + * copies the nsproxy, setting refcount to 1, and grabbing a + * reference to all contained namespaces. Called from + * sys_unshare() + */ +struct nsproxy *dup_namespaces(struct nsproxy *orig) +{ + struct nsproxy *ns = clone_namespaces(orig); + + if (ns) { + if (ns->namespace) + get_namespace(ns->namespace); + if (ns->uts_ns) + get_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + get_ipc_ns(ns->ipc_ns); + } + + return ns; +} + +/* + * called from clone. This now handles copy for nsproxy and all + * namespaces therein. + */ +int copy_namespaces(int flags, struct task_struct *tsk) +{ + struct nsproxy *old_ns = tsk->nsproxy; + struct nsproxy *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_nsproxy(old_ns); + + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + return 0; + + new_ns = clone_namespaces(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + + tsk->nsproxy = new_ns; + + err = copy_namespace(flags, tsk); + if (err) + goto out_ns; + + err = copy_utsname(flags, tsk); + if (err) + goto out_uts; + + err = copy_ipcs(flags, tsk); + if (err) + goto out_ipc; + +out: + put_nsproxy(old_ns); + return err; + +out_ipc: + if (new_ns->uts_ns) + put_uts_ns(new_ns->uts_ns); +out_uts: + if (new_ns->namespace) + put_namespace(new_ns->namespace); +out_ns: + tsk->nsproxy = old_ns; + kfree(new_ns); + goto out; +} + +void free_nsproxy(struct nsproxy *ns) +{ + if (ns->namespace) + put_namespace(ns->namespace); + if (ns->uts_ns) + put_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + put_ipc_ns(ns->ipc_ns); + kfree(ns); +} diff --git a/kernel/pid.c b/kernel/pid.c index 8387e8c6819..b914392085f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -26,6 +26,7 @@ #include <linux/init.h> #include <linux/bootmem.h> #include <linux/hash.h> +#include <linux/pspace.h> #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) static struct hlist_head *pid_hash; @@ -33,17 +34,20 @@ static int pidhash_shift; static kmem_cache_t *pid_cachep; int pid_max = PID_MAX_DEFAULT; -int last_pid; #define RESERVED_PIDS 300 int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8) #define BITS_PER_PAGE (PAGE_SIZE*8) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) -#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off)) + +static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) +{ + return (map - pspace->pidmap)*BITS_PER_PAGE + off; +} + #define find_next_offset(map, off) \ find_next_zero_bit((map)->page, BITS_PER_PAGE, off) @@ -53,13 +57,12 @@ int pid_max_max = PID_MAX_LIMIT; * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ -typedef struct pidmap { - atomic_t nr_free; - void *page; -} pidmap_t; - -static pidmap_t pidmap_array[PIDMAP_ENTRIES] = - { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; +struct pspace init_pspace = { + .pidmap = { + [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } + }, + .last_pid = 0 +}; /* * Note: disable interrupts while the pidmap_lock is held as an @@ -74,40 +77,41 @@ static pidmap_t pidmap_array[PIDMAP_ENTRIES] = * irq handlers that take it we can leave the interrupts enabled. * For now it is easier to be safe than to prove it can't happen. */ + static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static fastcall void free_pidmap(int pid) +static fastcall void free_pidmap(struct pspace *pspace, int pid) { - pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; + struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; clear_bit(offset, map->page); atomic_inc(&map->nr_free); } -static int alloc_pidmap(void) +static int alloc_pidmap(struct pspace *pspace) { - int i, offset, max_scan, pid, last = last_pid; - pidmap_t *map; + int i, offset, max_scan, pid, last = pspace->last_pid; + struct pidmap *map; pid = last + 1; if (pid >= pid_max) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; - map = &pidmap_array[pid/BITS_PER_PAGE]; + map = &pspace->pidmap[pid/BITS_PER_PAGE]; max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; for (i = 0; i <= max_scan; ++i) { if (unlikely(!map->page)) { - unsigned long page = get_zeroed_page(GFP_KERNEL); + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* * Free the page if someone raced with us * installing it: */ spin_lock_irq(&pidmap_lock); if (map->page) - free_page(page); + kfree(page); else - map->page = (void *)page; + map->page = page; spin_unlock_irq(&pidmap_lock); if (unlikely(!map->page)) break; @@ -116,11 +120,11 @@ static int alloc_pidmap(void) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); - last_pid = pid; + pspace->last_pid = pid; return pid; } offset = find_next_offset(map, offset); - pid = mk_pid(map, offset); + pid = mk_pid(pspace, map, offset); /* * find_next_offset() found a bit, the pid from it * is in-bounds, and if we fell back to the last @@ -131,16 +135,34 @@ static int alloc_pidmap(void) (i != max_scan || pid < last || !((last+1) & BITS_PER_PAGE_MASK))); } - if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) { + if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; offset = 0; } else { - map = &pidmap_array[0]; + map = &pspace->pidmap[0]; offset = RESERVED_PIDS; if (unlikely(last == offset)) break; } - pid = mk_pid(map, offset); + pid = mk_pid(pspace, map, offset); + } + return -1; +} + +static int next_pidmap(struct pspace *pspace, int last) +{ + int offset; + struct pidmap *map, *end; + + offset = (last + 1) & BITS_PER_PAGE_MASK; + map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; + end = &pspace->pidmap[PIDMAP_ENTRIES]; + for (; map < end; map++, offset = 0) { + if (unlikely(!map->page)) + continue; + offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); + if (offset < BITS_PER_PAGE) + return mk_pid(pspace, map, offset); } return -1; } @@ -153,6 +175,7 @@ fastcall void put_pid(struct pid *pid) atomic_dec_and_test(&pid->count)) kmem_cache_free(pid_cachep, pid); } +EXPORT_SYMBOL_GPL(put_pid); static void delayed_put_pid(struct rcu_head *rhp) { @@ -169,7 +192,7 @@ fastcall void free_pid(struct pid *pid) hlist_del_rcu(&pid->pid_chain); spin_unlock_irqrestore(&pidmap_lock, flags); - free_pidmap(pid->nr); + free_pidmap(&init_pspace, pid->nr); call_rcu(&pid->rcu, delayed_put_pid); } @@ -183,7 +206,7 @@ struct pid *alloc_pid(void) if (!pid) goto out; - nr = alloc_pidmap(); + nr = alloc_pidmap(&init_pspace); if (nr < 0) goto out_free; @@ -217,6 +240,7 @@ struct pid * fastcall find_pid(int nr) } return NULL; } +EXPORT_SYMBOL_GPL(find_pid); int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) { @@ -280,6 +304,15 @@ struct task_struct *find_task_by_pid_type(int type, int nr) EXPORT_SYMBOL(find_task_by_pid_type); +struct pid *get_task_pid(struct task_struct *task, enum pid_type type) +{ + struct pid *pid; + rcu_read_lock(); + pid = get_pid(task->pids[type].pid); + rcu_read_unlock(); + return pid; +} + struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; @@ -303,6 +336,26 @@ struct pid *find_get_pid(pid_t nr) } /* + * Used by proc to find the first pid that is greater then or equal to nr. + * + * If there is a pid at nr this function is exactly the same as find_pid. + */ +struct pid *find_ge_pid(int nr) +{ + struct pid *pid; + + do { + pid = find_pid(nr); + if (pid) + break; + nr = next_pidmap(&init_pspace, nr); + } while (nr > 0); + + return pid; +} +EXPORT_SYMBOL_GPL(find_get_pid); + +/* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or * more. @@ -329,10 +382,10 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); + init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, pidmap_array->page); - atomic_dec(&pidmap_array->nr_free); + set_bit(0, init_pspace.pidmap[0].page); + atomic_dec(&init_pspace.pidmap[0].nr_free); pid_cachep = kmem_cache_create("pid", sizeof(struct pid), __alignof__(struct pid), diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1b84313cbab..99f9b7d177d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -906,7 +906,7 @@ static void init_header(struct swsusp_info *info) memset(info, 0, sizeof(struct swsusp_info)); info->version_code = LINUX_VERSION_CODE; info->num_physpages = num_physpages; - memcpy(&info->uts, &system_utsname, sizeof(system_utsname)); + memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); info->cpus = num_online_cpus(); info->image_pages = nr_copy_pages; info->pages = nr_copy_pages + nr_meta_pages + 1; @@ -1050,13 +1050,13 @@ static inline int check_header(struct swsusp_info *info) reason = "kernel version"; if (info->num_physpages != num_physpages) reason = "memory size"; - if (strcmp(info->uts.sysname,system_utsname.sysname)) + if (strcmp(info->uts.sysname,init_utsname()->sysname)) reason = "system type"; - if (strcmp(info->uts.release,system_utsname.release)) + if (strcmp(info->uts.release,init_utsname()->release)) reason = "kernel release"; - if (strcmp(info->uts.version,system_utsname.version)) + if (strcmp(info->uts.version,init_utsname()->version)) reason = "version"; - if (strcmp(info->uts.machine,system_utsname.machine)) + if (strcmp(info->uts.machine,init_utsname()->machine)) reason = "machine"; if (reason) { printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); diff --git a/kernel/sched.c b/kernel/sched.c index 74f169ac077..e4e54e86f4a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -49,7 +49,7 @@ #include <linux/seq_file.h> #include <linux/syscalls.h> #include <linux/times.h> -#include <linux/acct.h> +#include <linux/tsacct_kern.h> #include <linux/kprobes.h> #include <linux/delayacct.h> #include <asm/tlb.h> @@ -4384,7 +4384,10 @@ EXPORT_SYMBOL(cpu_present_map); #ifndef CONFIG_SMP cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_online_map); + cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_possible_map); #endif long sched_getaffinity(pid_t pid, cpumask_t *mask) diff --git a/kernel/signal.c b/kernel/signal.c index fb5da6d19f1..7ed8d5304be 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1055,28 +1055,44 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) } /* - * kill_pg_info() sends a signal to a process group: this is what the tty + * kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) */ -int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) +int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; int retval, success; - if (pgrp <= 0) - return -EINVAL; - success = 0; retval = -ESRCH; - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p); success |= !err; retval = err; - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return success ? 0 : retval; } +int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) +{ + int retval; + + read_lock(&tasklist_lock); + retval = __kill_pgrp_info(sig, info, pgrp); + read_unlock(&tasklist_lock); + + return retval; +} + +int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) +{ + if (pgrp <= 0) + return -EINVAL; + + return __kill_pgrp_info(sig, info, find_pid(pgrp)); +} + int kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) { @@ -1089,8 +1105,7 @@ kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) return retval; } -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) +int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) { int error; int acquired_tasklist_lock = 0; @@ -1101,7 +1116,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) read_lock(&tasklist_lock); acquired_tasklist_lock = 1; } - p = find_task_by_pid(pid); + p = pid_task(pid, PIDTYPE_PID); error = -ESRCH; if (p) error = group_send_sig_info(sig, info, p); @@ -1111,8 +1126,18 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } -/* like kill_proc_info(), but doesn't use uid/euid of "current" */ -int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, +int +kill_proc_info(int sig, struct siginfo *info, pid_t pid) +{ + int error; + rcu_read_lock(); + error = kill_pid_info(sig, info, find_pid(pid)); + rcu_read_unlock(); + return error; +} + +/* like kill_pid_info(), but doesn't use uid/euid of "current" */ +int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, uid_t uid, uid_t euid, u32 secid) { int ret = -EINVAL; @@ -1122,7 +1147,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, return ret; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = pid_task(pid, PIDTYPE_PID); if (!p) { ret = -ESRCH; goto out_unlock; @@ -1146,7 +1171,7 @@ out_unlock: read_unlock(&tasklist_lock); return ret; } -EXPORT_SYMBOL_GPL(kill_proc_info_as_uid); +EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); /* * kill_something_info() interprets pid in interesting ways just like kill(2). @@ -1264,6 +1289,18 @@ force_sigsegv(int sig, struct task_struct *p) return 0; } +int kill_pgrp(struct pid *pid, int sig, int priv) +{ + return kill_pgrp_info(sig, __si_special(priv), pid); +} +EXPORT_SYMBOL(kill_pgrp); + +int kill_pid(struct pid *pid, int sig, int priv) +{ + return kill_pid_info(sig, __si_special(priv), pid); +} +EXPORT_SYMBOL(kill_pid); + int kill_pg(pid_t pgrp, int sig, int priv) { diff --git a/kernel/spinlock.c b/kernel/spinlock.c index d48143eafbf..476c3741511 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -215,7 +215,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock) \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ - cpu_relax(); \ + _raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ } \ @@ -237,7 +237,7 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ - cpu_relax(); \ + _raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ diff --git a/kernel/sys.c b/kernel/sys.c index b88806c6624..2314867ae34 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -92,7 +92,8 @@ EXPORT_SYMBOL(fs_overflowgid); */ int C_A_D = 1; -int cad_pid = 1; +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); /* * Notifier list for kernel code which wants to be called @@ -221,7 +222,7 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * of the last notifier function called. */ -int atomic_notifier_call_chain(struct atomic_notifier_head *nh, +int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v) { int ret; @@ -607,11 +608,10 @@ static void kernel_restart_prepare(char *cmd) void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); - if (!cmd) { + if (!cmd) printk(KERN_EMERG "Restarting system.\n"); - } else { + else printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - } machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -627,9 +627,8 @@ static void kernel_kexec(void) #ifdef CONFIG_KEXEC struct kimage *image; image = xchg(&kexec_image, NULL); - if (!image) { + if (!image) return; - } kernel_restart_prepare(NULL); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); @@ -775,10 +774,9 @@ void ctrl_alt_del(void) if (C_A_D) schedule_work(&cad_work); else - kill_proc(cad_pid, SIGINT, 1); + kill_cad_pid(SIGINT, 1); } - /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) @@ -823,12 +821,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) (current->sgid == egid) || capable(CAP_SETGID)) new_egid = egid; - else { + else return -EPERM; - } } - if (new_egid != old_egid) - { + if (new_egid != old_egid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -857,19 +853,14 @@ asmlinkage long sys_setgid(gid_t gid) if (retval) return retval; - if (capable(CAP_SETGID)) - { - if(old_egid != gid) - { + if (capable(CAP_SETGID)) { + if (old_egid != gid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } current->gid = current->egid = current->sgid = current->fsgid = gid; - } - else if ((gid == current->gid) || (gid == current->sgid)) - { - if(old_egid != gid) - { + } else if ((gid == current->gid) || (gid == current->sgid)) { + if (old_egid != gid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -900,8 +891,7 @@ static int set_user(uid_t new_ruid, int dumpclear) switch_uid(new_user); - if(dumpclear) - { + if (dumpclear) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -957,8 +947,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) return -EAGAIN; - if (new_euid != old_euid) - { + if (new_euid != old_euid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1008,8 +997,7 @@ asmlinkage long sys_setuid(uid_t uid) } else if ((uid != current->uid) && (uid != new_suid)) return -EPERM; - if (old_euid != uid) - { + if (old_euid != uid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1054,8 +1042,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) return -EAGAIN; } if (euid != (uid_t) -1) { - if (euid != current->euid) - { + if (euid != current->euid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1105,8 +1092,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) return -EPERM; } if (egid != (gid_t) -1) { - if (egid != current->egid) - { + if (egid != current->egid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1151,10 +1137,8 @@ asmlinkage long sys_setfsuid(uid_t uid) if (uid == current->uid || uid == current->euid || uid == current->suid || uid == current->fsuid || - capable(CAP_SETUID)) - { - if (uid != old_fsuid) - { + capable(CAP_SETUID)) { + if (uid != old_fsuid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1182,10 +1166,8 @@ asmlinkage long sys_setfsgid(gid_t gid) if (gid == current->gid || gid == current->egid || gid == current->sgid || gid == current->fsgid || - capable(CAP_SETGID)) - { - if (gid != old_fsgid) - { + capable(CAP_SETGID)) { + if (gid != old_fsgid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1321,9 +1303,9 @@ out: asmlinkage long sys_getpgid(pid_t pid) { - if (!pid) { + if (!pid) return process_group(current); - } else { + else { int retval; struct task_struct *p; @@ -1353,9 +1335,9 @@ asmlinkage long sys_getpgrp(void) asmlinkage long sys_getsid(pid_t pid) { - if (!pid) { + if (!pid) return current->signal->session; - } else { + else { int retval; struct task_struct *p; @@ -1363,7 +1345,7 @@ asmlinkage long sys_getsid(pid_t pid) p = find_task_by_pid(pid); retval = -ESRCH; - if(p) { + if (p) { retval = security_task_getsid(p); if (!retval) retval = p->signal->session; @@ -1431,9 +1413,9 @@ struct group_info *groups_alloc(int gidsetsize) group_info->nblocks = nblocks; atomic_set(&group_info->usage, 1); - if (gidsetsize <= NGROUPS_SMALL) { + if (gidsetsize <= NGROUPS_SMALL) group_info->blocks[0] = group_info->small_block; - } else { + else { for (i = 0; i < nblocks; i++) { gid_t *b; b = (void *)__get_free_page(GFP_USER); @@ -1489,7 +1471,7 @@ static int groups_to_user(gid_t __user *grouplist, /* fill a group_info from a user-space array - it must be allocated already */ static int groups_from_user(struct group_info *group_info, gid_t __user *grouplist) - { +{ int i; int count = group_info->ngroups; @@ -1647,9 +1629,8 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) int in_group_p(gid_t grp) { int retval = 1; - if (grp != current->fsgid) { + if (grp != current->fsgid) retval = groups_search(current->group_info, grp); - } return retval; } @@ -1658,9 +1639,8 @@ EXPORT_SYMBOL(in_group_p); int in_egroup_p(gid_t grp) { int retval = 1; - if (grp != current->egid) { + if (grp != current->egid) retval = groups_search(current->group_info, grp); - } return retval; } @@ -1675,7 +1655,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name) int errno = 0; down_read(&uts_sem); - if (copy_to_user(name,&system_utsname,sizeof *name)) + if (copy_to_user(name, utsname(), sizeof *name)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1693,8 +1673,8 @@ asmlinkage long sys_sethostname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(system_utsname.nodename, tmp, len); - system_utsname.nodename[len] = 0; + memcpy(utsname()->nodename, tmp, len); + utsname()->nodename[len] = 0; errno = 0; } up_write(&uts_sem); @@ -1710,11 +1690,11 @@ asmlinkage long sys_gethostname(char __user *name, int len) if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(system_utsname.nodename); + i = 1 + strlen(utsname()->nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, system_utsname.nodename, i)) + if (copy_to_user(name, utsname()->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1739,8 +1719,8 @@ asmlinkage long sys_setdomainname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(system_utsname.domainname, tmp, len); - system_utsname.domainname[len] = 0; + memcpy(utsname()->domainname, tmp, len); + utsname()->domainname[len] = 0; errno = 0; } up_write(&uts_sem); @@ -1775,9 +1755,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r task_lock(current->group_leader); x = current->signal->rlim[resource]; task_unlock(current->group_leader); - if(x.rlim_cur > 0x7FFFFFFF) + if (x.rlim_cur > 0x7FFFFFFF) x.rlim_cur = 0x7FFFFFFF; - if(x.rlim_max > 0x7FFFFFFF) + if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c57c4532e29..8020fb273c4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,7 +68,6 @@ extern int sysrq_enabled; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; -extern int cad_pid; extern int pid_max; extern int min_free_kbytes; extern int printk_ratelimit_jiffies; @@ -92,13 +91,8 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif #ifdef CONFIG_SYSVIPC -extern size_t shm_ctlmax; -extern size_t shm_ctlall; -extern int shm_ctlmni; -extern int msg_ctlmax; -extern int msg_ctlmnb; -extern int msg_ctlmni; -extern int sem_ctls[]; +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); #endif #ifdef __sparc__ @@ -139,7 +133,10 @@ static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); #endif -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + +static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); static ctl_table root_table[]; @@ -229,51 +226,100 @@ static ctl_table root_table[] = { }; static ctl_table kern_table[] = { +#ifndef CONFIG_UTS_NS + { + .ctl_name = KERN_OSTYPE, + .procname = "ostype", + .data = init_uts_ns.name.sysname, + .maxlen = sizeof(init_uts_ns.name.sysname), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_OSRELEASE, + .procname = "osrelease", + .data = init_uts_ns.name.release, + .maxlen = sizeof(init_uts_ns.name.release), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_VERSION, + .procname = "version", + .data = init_uts_ns.name.version, + .maxlen = sizeof(init_uts_ns.name.version), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_NODENAME, + .procname = "hostname", + .data = init_uts_ns.name.nodename, + .maxlen = sizeof(init_uts_ns.name.nodename), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_DOMAINNAME, + .procname = "domainname", + .data = init_uts_ns.name.domainname, + .maxlen = sizeof(init_uts_ns.name.domainname), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, +#else /* !CONFIG_UTS_NS */ { .ctl_name = KERN_OSTYPE, .procname = "ostype", - .data = system_utsname.sysname, - .maxlen = sizeof(system_utsname.sysname), + .data = NULL, + /* could maybe use __NEW_UTS_LEN here? */ + .maxlen = FIELD_SIZEOF(struct new_utsname, sysname), .mode = 0444, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_OSRELEASE, .procname = "osrelease", - .data = system_utsname.release, - .maxlen = sizeof(system_utsname.release), + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, release), .mode = 0444, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_VERSION, .procname = "version", - .data = system_utsname.version, - .maxlen = sizeof(system_utsname.version), + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, version), .mode = 0444, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_NODENAME, .procname = "hostname", - .data = system_utsname.nodename, - .maxlen = sizeof(system_utsname.nodename), + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, nodename), .mode = 0644, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_DOMAINNAME, .procname = "domainname", - .data = system_utsname.domainname, - .maxlen = sizeof(system_utsname.domainname), + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, domainname), .mode = 0644, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, +#endif /* !CONFIG_UTS_NS */ { .ctl_name = KERN_PANIC, .procname = "panic", @@ -294,7 +340,7 @@ static ctl_table kern_table[] = { .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", .data = core_pattern, - .maxlen = 64, + .maxlen = 128, .mode = 0644, .proc_handler = &proc_dostring, .strategy = &sysctl_string, @@ -432,58 +478,58 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_SHMMAX, .procname = "shmmax", - .data = &shm_ctlmax, + .data = NULL, .maxlen = sizeof (size_t), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_SHMALL, .procname = "shmall", - .data = &shm_ctlall, + .data = NULL, .maxlen = sizeof (size_t), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_SHMMNI, .procname = "shmmni", - .data = &shm_ctlmni, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_MSGMAX, .procname = "msgmax", - .data = &msg_ctlmax, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_MSGMNI, .procname = "msgmni", - .data = &msg_ctlmni, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_MSGMNB, .procname = "msgmnb", - .data = &msg_ctlmnb, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_SEM, .procname = "sem", - .data = &sem_ctls, + .data = NULL, .maxlen = 4*sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -499,10 +545,10 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_CADPID, .procname = "cad_pid", - .data = &cad_pid, + .data = NULL, .maxlen = sizeof (int), .mode = 0600, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_cad_pid, }, { .ctl_name = KERN_MAX_THREADS, @@ -1624,32 +1670,15 @@ static ssize_t proc_writesys(struct file * file, const char __user * buf, return do_rw_proc(1, file, (char __user *) buf, count, ppos); } -/** - * proc_dostring - read a string sysctl - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes a string from/to the user buffer. If the kernel - * buffer provided is not large enough to hold the string, the - * string is truncated. The copied string is %NULL-terminated. - * If the string is being read by the user process, it is copied - * and a newline '\n' is added. It is truncated if the buffer is - * not large enough. - * - * Returns 0 on success. - */ -int proc_dostring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int _proc_do_string(void* data, int maxlen, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) { size_t len; char __user *p; char c; - if (!table->data || !table->maxlen || !*lenp || + if (!data || !maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; @@ -1665,20 +1694,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, break; len++; } - if (len >= table->maxlen) - len = table->maxlen-1; - if(copy_from_user(table->data, buffer, len)) + if (len >= maxlen) + len = maxlen-1; + if(copy_from_user(data, buffer, len)) return -EFAULT; - ((char *) table->data)[len] = 0; + ((char *) data)[len] = 0; *ppos += *lenp; } else { - len = strlen(table->data); - if (len > table->maxlen) - len = table->maxlen; + len = strlen(data); + if (len > maxlen) + len = maxlen; if (len > *lenp) len = *lenp; if (len) - if(copy_to_user(buffer, table->data, len)) + if(copy_to_user(buffer, data, len)) return -EFAULT; if (len < *lenp) { if(put_user('\n', ((char __user *) buffer) + len)) @@ -1691,12 +1720,38 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, return 0; } +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes a string from/to the user buffer. If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. + */ +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return _proc_do_string(table->data, table->maxlen, write, filp, + buffer, lenp, ppos); +} + /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, +#ifndef CONFIG_UTS_NS +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int r; @@ -1712,6 +1767,48 @@ static int proc_doutsstring(ctl_table *table, int write, struct file *filp, } return r; } +#else /* !CONFIG_UTS_NS */ +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int r; + struct uts_namespace* uts_ns = current->nsproxy->uts_ns; + char* which; + + switch (table->ctl_name) { + case KERN_OSTYPE: + which = uts_ns->name.sysname; + break; + case KERN_NODENAME: + which = uts_ns->name.nodename; + break; + case KERN_OSRELEASE: + which = uts_ns->name.release; + break; + case KERN_VERSION: + which = uts_ns->name.version; + break; + case KERN_DOMAINNAME: + which = uts_ns->name.domainname; + break; + default: + r = -EINVAL; + goto out; + } + + if (!write) { + down_read(&uts_sem); + r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos); + up_read(&uts_sem); + } else { + down_write(&uts_sem); + r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos); + up_write(&uts_sem); + } + out: + return r; +} +#endif /* !CONFIG_UTS_NS */ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, int *valp, @@ -1732,8 +1829,9 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos, +static int __do_proc_dointvec(void *tbl_data, ctl_table *table, + int write, struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -1746,13 +1844,13 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, char buf[TMPBUFLEN], *p; char __user *s = buffer; - if (!table->data || !table->maxlen || !*lenp || + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - i = (int *) table->data; + i = (int *) tbl_data; vleft = table->maxlen / sizeof(*i); left = *lenp; @@ -1841,6 +1939,16 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, #undef TMPBUFLEN } +static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + return __do_proc_dointvec(table->data, table, write, filp, + buffer, lenp, ppos, conv, data); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -1974,7 +2082,7 @@ int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, do_proc_dointvec_minmax_conv, ¶m); } -static int do_proc_doulongvec_minmax(ctl_table *table, int write, +static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, @@ -1988,13 +2096,13 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write, char buf[TMPBUFLEN], *p; char __user *s = buffer; - if (!table->data || !table->maxlen || !*lenp || + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - i = (unsigned long *) table->data; + i = (unsigned long *) data; min = (unsigned long *) table->extra1; max = (unsigned long *) table->extra2; vleft = table->maxlen / sizeof(unsigned long); @@ -2079,6 +2187,17 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write, #undef TMPBUFLEN } +static int do_proc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) +{ + return __do_proc_doulongvec_minmax(table->data, table, write, + filp, buffer, lenp, ppos, convmul, convdiv); +} + /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table @@ -2267,6 +2386,71 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, do_proc_dointvec_ms_jiffies_conv, NULL); } +#ifdef CONFIG_SYSVIPC +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + void *data; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; + + switch (table->ctl_name) { + case KERN_SHMMAX: + data = &ns->shm_ctlmax; + goto proc_minmax; + case KERN_SHMALL: + data = &ns->shm_ctlall; + goto proc_minmax; + case KERN_SHMMNI: + data = &ns->shm_ctlmni; + break; + case KERN_MSGMAX: + data = &ns->msg_ctlmax; + break; + case KERN_MSGMNI: + data = &ns->msg_ctlmni; + break; + case KERN_MSGMNB: + data = &ns->msg_ctlmnb; + break; + case KERN_SEM: + data = &ns->sem_ctls; + break; + default: + return -EINVAL; + } + + return __do_proc_dointvec(data, table, write, filp, buffer, + lenp, ppos, NULL, NULL); +proc_minmax: + return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, + lenp, ppos, 1l, 1l); +} +#endif + +static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct pid *new_pid; + pid_t tmp; + int r; + + tmp = pid_nr(cad_pid); + + r = __do_proc_dointvec(&tmp, table, write, filp, buffer, + lenp, ppos, NULL, NULL); + if (r || !write) + return r; + + new_pid = find_get_pid(tmp); + if (!new_pid) + return -ESRCH; + + put_pid(xchg(&cad_pid, new_pid)); + return 0; +} + #else /* CONFIG_PROC_FS */ int proc_dostring(ctl_table *table, int write, struct file *filp, @@ -2275,12 +2459,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, return -ENOSYS; } -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } +#ifdef CONFIG_SYSVIPC +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif + int proc_dointvec(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 2ed4040d0dc..5d6a8c54ee8 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -18,7 +18,9 @@ #include <linux/kernel.h> #include <linux/taskstats_kern.h> +#include <linux/tsacct_kern.h> #include <linux/delayacct.h> +#include <linux/tsacct_kern.h> #include <linux/cpumask.h> #include <linux/percpu.h> #include <net/genetlink.h> @@ -75,7 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * If new attributes are added, please revisit this allocation */ - skb = nlmsg_new(size, GFP_KERNEL); + skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL); if (!skb) return -ENOMEM; @@ -198,7 +200,13 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, */ delayacct_add_tsk(stats, tsk); + + /* fill in basic acct fields */ stats->version = TASKSTATS_VERSION; + bacct_add_tsk(stats, tsk); + + /* fill in extended acct fields */ + xacct_add_tsk(stats, tsk); /* Define err: label here if needed */ put_task_struct(tsk); diff --git a/kernel/time.c b/kernel/time.c index 5bd48974764..0e017bff4c1 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -202,179 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv, return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } -/* we call this to notify the arch when the clock is being - * controlled. If no such arch routine, do nothing. - */ -void __attribute__ ((weak)) notify_arch_cmos_timer(void) -{ - return; -} - -/* adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. - */ -int do_adjtimex(struct timex *txc) -{ - long ltemp, mtemp, save_adjust; - int result; - - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* Now we validate the data before disabling interrupts */ - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) - /* singleshot must not be used with any other mode bits */ - if (txc->modes != ADJ_OFFSET_SINGLESHOT) - return -EINVAL; - - if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) - /* adjustment Offset limited to +- .512 seconds */ - if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) - return -EINVAL; - - /* if the quartz is off by more than 10% something is VERY wrong ! */ - if (txc->modes & ADJ_TICK) - if (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - result = time_state; /* mostly `TIME_OK' */ - - /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_next_adjust ? time_next_adjust : time_adjust; - -#if 0 /* STA_CLOCKERR is never set yet */ - time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ -#endif - /* If there are input parameters, then process them */ - if (txc->modes) - { - if (txc->modes & ADJ_STATUS) /* only set allowed bits */ - time_status = (txc->status & ~STA_RONLY) | - (time_status & STA_RONLY); - - if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ - if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { - result = -EINVAL; - goto leave; - } - time_freq = txc->freq; - } - - if (txc->modes & ADJ_MAXERROR) { - if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { - result = -EINVAL; - goto leave; - } - time_maxerror = txc->maxerror; - } - - if (txc->modes & ADJ_ESTERROR) { - if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { - result = -EINVAL; - goto leave; - } - time_esterror = txc->esterror; - } - - if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ - if (txc->constant < 0) { /* NTP v4 uses values > 6 */ - result = -EINVAL; - goto leave; - } - time_constant = txc->constant; - } - - if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ - if (txc->modes == ADJ_OFFSET_SINGLESHOT) { - /* adjtime() is independent from ntp_adjtime() */ - if ((time_next_adjust = txc->offset) == 0) - time_adjust = 0; - } - else if (time_status & STA_PLL) { - ltemp = txc->offset; - - /* - * Scale the phase adjustment and - * clamp to the operating range. - */ - if (ltemp > MAXPHASE) - time_offset = MAXPHASE << SHIFT_UPDATE; - else if (ltemp < -MAXPHASE) - time_offset = -(MAXPHASE << SHIFT_UPDATE); - else - time_offset = ltemp << SHIFT_UPDATE; - - /* - * Select whether the frequency is to be controlled - * and in which mode (PLL or FLL). Clamp to the operating - * range. Ugly multiply/divide should be replaced someday. - */ - - if (time_status & STA_FREQHOLD || time_reftime == 0) - time_reftime = xtime.tv_sec; - mtemp = xtime.tv_sec - time_reftime; - time_reftime = xtime.tv_sec; - if (time_status & STA_FLL) { - if (mtemp >= MINSEC) { - ltemp = (time_offset / mtemp) << (SHIFT_USEC - - SHIFT_UPDATE); - time_freq += shift_right(ltemp, SHIFT_KH); - } else /* calibration interval too short (p. 12) */ - result = TIME_ERROR; - } else { /* PLL mode */ - if (mtemp < MAXSEC) { - ltemp *= mtemp; - time_freq += shift_right(ltemp,(time_constant + - time_constant + - SHIFT_KF - SHIFT_USEC)); - } else /* calibration interval too long (p. 12) */ - result = TIME_ERROR; - } - time_freq = min(time_freq, time_tolerance); - time_freq = max(time_freq, -time_tolerance); - } /* STA_PLL */ - } /* txc->modes & ADJ_OFFSET */ - if (txc->modes & ADJ_TICK) { - tick_usec = txc->tick; - tick_nsec = TICK_USEC_TO_NSEC(tick_usec); - } - } /* txc->modes */ -leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) - result = TIME_ERROR; - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) - txc->offset = save_adjust; - else { - txc->offset = shift_right(time_offset, SHIFT_UPDATE); - } - txc->freq = time_freq; - txc->maxerror = time_maxerror; - txc->esterror = time_esterror; - txc->status = time_status; - txc->constant = time_constant; - txc->precision = time_precision; - txc->tolerance = time_tolerance; - txc->tick = tick_usec; - - /* PPS is not implemented, so these are zero */ - txc->ppsfreq = 0; - txc->jitter = 0; - txc->shift = 0; - txc->stabil = 0; - txc->jitcnt = 0; - txc->calcnt = 0; - txc->errcnt = 0; - txc->stbcnt = 0; - write_sequnlock_irq(&xtime_lock); - do_gettimeofday(&txc->time); - notify_arch_cmos_timer(); - return(result); -} - asmlinkage long sys_adjtimex(struct timex __user *txc_p) { struct timex txc; /* Local copy of parameter */ diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e1dfd8e86cc..61a3907d16f 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1 +1 @@ -obj-y += clocksource.o jiffies.o +obj-y += ntp.o clocksource.o jiffies.o diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c new file mode 100644 index 00000000000..47195fa0ec4 --- /dev/null +++ b/kernel/time/ntp.c @@ -0,0 +1,350 @@ +/* + * linux/kernel/time/ntp.c + * + * NTP state machine interfaces and logic. + * + * This code was mainly moved from kernel/timer.c and kernel/time.c + * Please see those files for relevant copyright info and historical + * changelogs. + */ + +#include <linux/mm.h> +#include <linux/time.h> +#include <linux/timex.h> + +#include <asm/div64.h> +#include <asm/timex.h> + +/* + * Timekeeping variables + */ +unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ +unsigned long tick_nsec; /* ACTHZ period (nsec) */ +static u64 tick_length, tick_length_base; + +#define MAX_TICKADJ 500 /* microsecs */ +#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ + TICK_LENGTH_SHIFT) / HZ) + +/* + * phase-lock loop variables + */ +/* TIME_ERROR prevents overwriting the CMOS clock */ +static int time_state = TIME_OK; /* clock synchronization status */ +int time_status = STA_UNSYNC; /* clock status bits */ +static long time_offset; /* time adjustment (ns) */ +static long time_constant = 2; /* pll time constant */ +long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ +long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ +long time_freq; /* frequency offset (scaled ppm)*/ +static long time_reftime; /* time at last adjustment (s) */ +long time_adjust; + +#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE) +#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \ + (s64)CLOCK_TICK_RATE) + +static void ntp_update_frequency(void) +{ + tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; + tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); + + do_div(tick_length_base, HZ); + + tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; +} + +/** + * ntp_clear - Clears the NTP state variables + * + * Must be called while holding a write on the xtime_lock + */ +void ntp_clear(void) +{ + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; + + ntp_update_frequency(); + + tick_length = tick_length_base; + time_offset = 0; +} + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + */ +void second_overflow(void) +{ + long time_adj; + + /* Bump the maxerror field */ + time_maxerror += MAXFREQ >> SHIFT_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; + } + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. The microtime() + * routine or external clock driver will insure that reported time is + * always monotonic. The ugly divides should be replaced. + */ + switch (time_state) { + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + wall_to_monotonic.tv_sec++; + /* + * The timer interpolator will make time change + * gradually instead of an immediate jump by one second + */ + time_interpolator_update(-NSEC_PER_SEC); + time_state = TIME_OOP; + clock_was_set(); + printk(KERN_NOTICE "Clock: inserting leap second " + "23:59:60 UTC\n"); + } + break; + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + wall_to_monotonic.tv_sec--; + /* + * Use of time interpolator for a gradual change of + * time + */ + time_interpolator_update(NSEC_PER_SEC); + time_state = TIME_WAIT; + clock_was_set(); + printk(KERN_NOTICE "Clock: deleting leap second " + "23:59:59 UTC\n"); + } + break; + case TIME_OOP: + time_state = TIME_WAIT; + break; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + + /* + * Compute the phase adjustment for the next second. The offset is + * reduced by a fixed factor times the time constant. + */ + tick_length = tick_length_base; + time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); + time_offset -= time_adj; + tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE); + + if (unlikely(time_adjust)) { + if (time_adjust > MAX_TICKADJ) { + time_adjust -= MAX_TICKADJ; + tick_length += MAX_TICKADJ_SCALED; + } else if (time_adjust < -MAX_TICKADJ) { + time_adjust += MAX_TICKADJ; + tick_length -= MAX_TICKADJ_SCALED; + } else { + time_adjust = 0; + tick_length += (s64)(time_adjust * NSEC_PER_USEC / + HZ) << TICK_LENGTH_SHIFT; + } + } +} + +/* + * Return how long ticks are at the moment, that is, how much time + * update_wall_time_one_tick will add to xtime next time we call it + * (assuming no calls to do_adjtimex in the meantime). + * The return value is in fixed-point nanoseconds shifted by the + * specified number of bits to the right of the binary point. + * This function has no side-effects. + */ +u64 current_tick_length(void) +{ + return tick_length; +} + + +void __attribute__ ((weak)) notify_arch_cmos_timer(void) +{ + return; +} + +/* adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int do_adjtimex(struct timex *txc) +{ + long ltemp, mtemp, save_adjust; + s64 freq_adj, temp64; + int result; + + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + + /* Now we validate the data before disabling interrupts */ + + if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) + /* singleshot must not be used with any other mode bits */ + if (txc->modes != ADJ_OFFSET_SINGLESHOT) + return -EINVAL; + + if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) + /* adjustment Offset limited to +- .512 seconds */ + if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) + return -EINVAL; + + /* if the quartz is off by more than 10% something is VERY wrong ! */ + if (txc->modes & ADJ_TICK) + if (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ) + return -EINVAL; + + write_seqlock_irq(&xtime_lock); + result = time_state; /* mostly `TIME_OK' */ + + /* Save for later - semantics of adjtime is to return old value */ + save_adjust = time_adjust; + +#if 0 /* STA_CLOCKERR is never set yet */ + time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ +#endif + /* If there are input parameters, then process them */ + if (txc->modes) + { + if (txc->modes & ADJ_STATUS) /* only set allowed bits */ + time_status = (txc->status & ~STA_RONLY) | + (time_status & STA_RONLY); + + if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ + if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { + result = -EINVAL; + goto leave; + } + time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); + } + + if (txc->modes & ADJ_MAXERROR) { + if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { + result = -EINVAL; + goto leave; + } + time_maxerror = txc->maxerror; + } + + if (txc->modes & ADJ_ESTERROR) { + if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { + result = -EINVAL; + goto leave; + } + time_esterror = txc->esterror; + } + + if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ + if (txc->constant < 0) { /* NTP v4 uses values > 6 */ + result = -EINVAL; + goto leave; + } + time_constant = min(txc->constant + 4, (long)MAXTC); + } + + if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ + if (txc->modes == ADJ_OFFSET_SINGLESHOT) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + } + else if (time_status & STA_PLL) { + ltemp = txc->offset * NSEC_PER_USEC; + + /* + * Scale the phase adjustment and + * clamp to the operating range. + */ + time_offset = min(ltemp, MAXPHASE * NSEC_PER_USEC); + time_offset = max(time_offset, -MAXPHASE * NSEC_PER_USEC); + + /* + * Select whether the frequency is to be controlled + * and in which mode (PLL or FLL). Clamp to the operating + * range. Ugly multiply/divide should be replaced someday. + */ + + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = xtime.tv_sec; + mtemp = xtime.tv_sec - time_reftime; + time_reftime = xtime.tv_sec; + + freq_adj = (s64)time_offset * mtemp; + freq_adj = shift_right(freq_adj, time_constant * 2 + + (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); + if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { + temp64 = (s64)time_offset << (SHIFT_NSEC - SHIFT_FLL); + if (time_offset < 0) { + temp64 = -temp64; + do_div(temp64, mtemp); + freq_adj -= temp64; + } else { + do_div(temp64, mtemp); + freq_adj += temp64; + } + } + freq_adj += time_freq; + freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); + time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); + time_offset = (time_offset / HZ) << SHIFT_UPDATE; + } /* STA_PLL */ + } /* txc->modes & ADJ_OFFSET */ + if (txc->modes & ADJ_TICK) + tick_usec = txc->tick; + + if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) + ntp_update_frequency(); + } /* txc->modes */ +leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) + result = TIME_ERROR; + + if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) + txc->offset = save_adjust; + else + txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; + txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); + txc->maxerror = time_maxerror; + txc->esterror = time_esterror; + txc->status = time_status; + txc->constant = time_constant; + txc->precision = 1; + txc->tolerance = MAXFREQ; + txc->tick = tick_usec; + + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; + write_sequnlock_irq(&xtime_lock); + do_gettimeofday(&txc->time); + notify_arch_cmos_timer(); + return(result); +} diff --git a/kernel/timer.c b/kernel/timer.c index 4f55622b0d3..c1c7fbcffec 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -41,12 +41,6 @@ #include <asm/timex.h> #include <asm/io.h> -#ifdef CONFIG_TIME_INTERPOLATION -static void time_interpolator_update(long delta_nsec); -#else -#define time_interpolator_update(x) -#endif - u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); @@ -568,12 +562,6 @@ found: /******************************************************************/ -/* - * Timekeeping variables - */ -unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ -unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ - /* * The current time * wall_to_monotonic is what we need to add to xtime (or xtime corrected @@ -587,209 +575,6 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16))); EXPORT_SYMBOL(xtime); -/* Don't completely fail for HZ > 500. */ -int tickadj = 500/HZ ? : 1; /* microsecs */ - - -/* - * phase-lock loop variables - */ -/* TIME_ERROR prevents overwriting the CMOS clock */ -int time_state = TIME_OK; /* clock synchronization status */ -int time_status = STA_UNSYNC; /* clock status bits */ -long time_offset; /* time adjustment (us) */ -long time_constant = 2; /* pll time constant */ -long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ -long time_precision = 1; /* clock precision (us) */ -long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ -long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; - /* frequency offset (scaled ppm)*/ -static long time_adj; /* tick adjust (scaled 1 / HZ) */ -long time_reftime; /* time at last adjustment (s) */ -long time_adjust; -long time_next_adjust; - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. - * - */ -static void second_overflow(void) -{ - long ltemp; - - /* Bump the maxerror field */ - time_maxerror += time_tolerance >> SHIFT_USEC; - if (time_maxerror > NTP_PHASE_LIMIT) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. The microtime() - * routine or external clock driver will insure that reported time is - * always monotonic. The ugly divides should be replaced. - */ - switch (time_state) { - case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; - break; - case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; - /* - * The timer interpolator will make time change - * gradually instead of an immediate jump by one second - */ - time_interpolator_update(-NSEC_PER_SEC); - time_state = TIME_OOP; - clock_was_set(); - printk(KERN_NOTICE "Clock: inserting leap second " - "23:59:60 UTC\n"); - } - break; - case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - wall_to_monotonic.tv_sec--; - /* - * Use of time interpolator for a gradual change of - * time - */ - time_interpolator_update(NSEC_PER_SEC); - time_state = TIME_WAIT; - clock_was_set(); - printk(KERN_NOTICE "Clock: deleting leap second " - "23:59:59 UTC\n"); - } - break; - case TIME_OOP: - time_state = TIME_WAIT; - break; - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - } - - /* - * Compute the phase adjustment for the next second. In PLL mode, the - * offset is reduced by a fixed factor times the time constant. In FLL - * mode the offset is used directly. In either mode, the maximum phase - * adjustment for each second is clamped so as to spread the adjustment - * over not more than the number of seconds between updates. - */ - ltemp = time_offset; - if (!(time_status & STA_FLL)) - ltemp = shift_right(ltemp, SHIFT_KG + time_constant); - ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); - ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); - time_offset -= ltemp; - time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - - /* - * Compute the frequency estimate and additional phase adjustment due - * to frequency error for the next second. - */ - ltemp = time_freq; - time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); - -#if HZ == 100 - /* - * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to - * get 128.125; => only 0.125% error (p. 14) - */ - time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); -#endif -#if HZ == 250 - /* - * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and - * 0.78125% to get 255.85938; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); -#endif -#if HZ == 1000 - /* - * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and - * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); -#endif -} - -/* - * Returns how many microseconds we need to add to xtime this tick - * in doing an adjustment requested with adjtime. - */ -static long adjtime_adjustment(void) -{ - long time_adjust_step; - - time_adjust_step = time_adjust; - if (time_adjust_step) { - /* - * We are doing an adjtime thing. Prepare time_adjust_step to - * be within bounds. Note that a positive time_adjust means we - * want the clock to run faster. - * - * Limit the amount of the step to be in the range - * -tickadj .. +tickadj - */ - time_adjust_step = min(time_adjust_step, (long)tickadj); - time_adjust_step = max(time_adjust_step, (long)-tickadj); - } - return time_adjust_step; -} - -/* in the NTP reference this is called "hardclock()" */ -static void update_ntp_one_tick(void) -{ - long time_adjust_step; - - time_adjust_step = adjtime_adjustment(); - if (time_adjust_step) - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; - - /* Changes by adjtime() do not take effect till next tick. */ - if (time_next_adjust != 0) { - time_adjust = time_next_adjust; - time_next_adjust = 0; - } -} - -/* - * Return how long ticks are at the moment, that is, how much time - * update_wall_time_one_tick will add to xtime next time we call it - * (assuming no calls to do_adjtimex in the meantime). - * The return value is in fixed-point nanoseconds shifted by the - * specified number of bits to the right of the binary point. - * This function has no side-effects. - */ -u64 current_tick_length(void) -{ - long delta_nsec; - u64 ret; - - /* calculate the finest interval NTP will allow. - * ie: nanosecond value shifted by (SHIFT_SCALE - 10) - */ - delta_nsec = tick_nsec + adjtime_adjustment() * 1000; - ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; - ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); - - return ret; -} /* XXX - all of this timekeeping code should be later moved to time.c */ #include <linux/clocksource.h> @@ -966,10 +751,13 @@ void __init timekeeping_init(void) unsigned long flags; write_seqlock_irqsave(&xtime_lock, flags); + + ntp_clear(); + clock = clocksource_get_next(); clocksource_calculate_interval(clock, tick_nsec); clock->cycle_last = clocksource_read(clock); - ntp_clear(); + write_sequnlock_irqrestore(&xtime_lock, flags); } @@ -980,7 +768,7 @@ static int timekeeping_suspended; * @dev: unused * * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are + * xtime/wall_to_monotonic/jiffies/etc are * still managed by arch specific suspend/resume code. */ static int timekeeping_resume(struct sys_device *dev) @@ -1149,8 +937,6 @@ static void update_wall_time(void) /* interpolator bits */ time_interpolator_update(clock->xtime_interval >> clock->shift); - /* increment the NTP state machine */ - update_ntp_one_tick(); /* accumulate error between NTP and clock interval */ clock->error += current_tick_length(); @@ -1230,9 +1016,6 @@ static inline void calc_load(unsigned long ticks) } } -/* jiffies at the most recent update of wall time */ -unsigned long wall_jiffies = INITIAL_JIFFIES; - /* * This read-write spinlock protects us from races in SMP while * playing with xtime and avenrun. @@ -1270,7 +1053,6 @@ void run_local_timers(void) */ static inline void update_times(unsigned long ticks) { - wall_jiffies += ticks; update_wall_time(); calc_load(ticks); } @@ -1775,7 +1557,7 @@ unsigned long time_interpolator_get_offset(void) #define INTERPOLATOR_ADJUST 65536 #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST -static void time_interpolator_update(long delta_nsec) +void time_interpolator_update(long delta_nsec) { u64 counter; unsigned long offset; diff --git a/kernel/tsacct.c b/kernel/tsacct.c new file mode 100644 index 00000000000..db443221ba5 --- /dev/null +++ b/kernel/tsacct.c @@ -0,0 +1,124 @@ +/* + * tsacct.c - System accounting over taskstats interface + * + * Copyright (C) Jay Lan, <jlan@sgi.com> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/tsacct_kern.h> +#include <linux/acct.h> +#include <linux/jiffies.h> + + +#define USEC_PER_TICK (USEC_PER_SEC/HZ) +/* + * fill in basic accounting fields + */ +void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) +{ + struct timespec uptime, ts; + s64 ac_etime; + + BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); + + /* calculate task elapsed time in timespec */ + do_posix_clock_monotonic_gettime(&uptime); + ts = timespec_sub(uptime, current->group_leader->start_time); + /* rebase elapsed time to usec */ + ac_etime = timespec_to_ns(&ts); + do_div(ac_etime, NSEC_PER_USEC); + stats->ac_etime = ac_etime; + stats->ac_btime = xtime.tv_sec - ts.tv_sec; + if (thread_group_leader(tsk)) { + stats->ac_exitcode = tsk->exit_code; + if (tsk->flags & PF_FORKNOEXEC) + stats->ac_flag |= AFORK; + } + if (tsk->flags & PF_SUPERPRIV) + stats->ac_flag |= ASU; + if (tsk->flags & PF_DUMPCORE) + stats->ac_flag |= ACORE; + if (tsk->flags & PF_SIGNALED) + stats->ac_flag |= AXSIG; + stats->ac_nice = task_nice(tsk); + stats->ac_sched = tsk->policy; + stats->ac_uid = tsk->uid; + stats->ac_gid = tsk->gid; + stats->ac_pid = tsk->pid; + stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0; + stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; + stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; + stats->ac_minflt = tsk->min_flt; + stats->ac_majflt = tsk->maj_flt; + + strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm)); +} + + +#ifdef CONFIG_TASK_XACCT + +#define KB 1024 +#define MB (1024*KB) +/* + * fill in extended accounting fields + */ +void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) +{ + /* convert pages-jiffies to Mbyte-usec */ + stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; + stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; + if (p->mm) { + /* adjust to KB unit */ + stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB; + stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB; + } + stats->read_char = p->rchar; + stats->write_char = p->wchar; + stats->read_syscalls = p->syscr; + stats->write_syscalls = p->syscw; +} +#undef KB +#undef MB + +/** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting + */ +void acct_update_integrals(struct task_struct *tsk) +{ + if (likely(tsk->mm)) { + long delta = cputime_to_jiffies( + cputime_sub(tsk->stime, tsk->acct_stimexpd)); + + if (delta == 0) + return; + tsk->acct_stimexpd = tsk->stime; + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + } +} + +/** + * acct_clear_integrals - clear the mm integral fields in task_struct + * @tsk: task_struct whose accounting fields are cleared + */ +void acct_clear_integrals(struct task_struct *tsk) +{ + tsk->acct_stimexpd = 0; + tsk->acct_rss_mem1 = 0; + tsk->acct_vm_mem1 = 0; +} +#endif diff --git a/kernel/utsname.c b/kernel/utsname.c new file mode 100644 index 00000000000..c859164a699 --- /dev/null +++ b/kernel/utsname.c @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2004 IBM Corporation + * + * Author: Serge Hallyn <serue@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include <linux/module.h> +#include <linux/uts.h> +#include <linux/utsname.h> +#include <linux/version.h> + +/* + * Clone a new ns copying an original utsname, setting refcount to 1 + * @old_ns: namespace to clone + * Return NULL on error (failure to kmalloc), new ns otherwise + */ +static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) +{ + struct uts_namespace *ns; + + ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + if (ns) { + memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + kref_init(&ns->kref); + } + return ns; +} + +/* + * unshare the current process' utsname namespace. + * called only in sys_unshare() + */ +int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts) +{ + if (unshare_flags & CLONE_NEWUTS) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + *new_uts = clone_uts_ns(current->nsproxy->uts_ns); + if (!*new_uts) + return -ENOMEM; + } + + return 0; +} + +/* + * Copy task tsk's utsname namespace, or clone it if flags + * specifies CLONE_NEWUTS. In latter case, changes to the + * utsname of this process won't be seen by parent, and vice + * versa. + */ +int copy_utsname(int flags, struct task_struct *tsk) +{ + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; + struct uts_namespace *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_uts_ns(old_ns); + + if (!(flags & CLONE_NEWUTS)) + return 0; + + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + new_ns = clone_uts_ns(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + tsk->nsproxy->uts_ns = new_ns; + +out: + put_uts_ns(old_ns); + return err; +} + +void free_uts_ns(struct kref *kref) +{ + struct uts_namespace *ns; + + ns = container_of(kref, struct uts_namespace, kref); + kfree(ns); +} |