aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorSteven Whitehouse <swhiteho@redhat.com>2006-10-02 08:45:08 -0400
committerSteven Whitehouse <swhiteho@redhat.com>2006-10-02 08:45:08 -0400
commit59458f40e25915a355d8b1d701425fe9f4f9ea23 (patch)
treef1c9a2934df686e36d75f759ab7313b6f0e0e5f9 /kernel
parent825f9075d74028d11d7f5932f04e1b5db3022b51 (diff)
parentd834c16516d1ebec4766fc58c059bf01311e6045 (diff)
Merge branch 'master' into gfs2
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/acct.c36
-rw-r--r--kernel/auditsc.c5
-rw-r--r--kernel/capability.c2
-rw-r--r--kernel/compat.c35
-rw-r--r--kernel/cpuset.c114
-rw-r--r--kernel/exit.c41
-rw-r--r--kernel/fork.c16
-rw-r--r--kernel/futex.c10
-rw-r--r--kernel/hrtimer.c20
-rw-r--r--kernel/irq/chip.c6
-rw-r--r--kernel/kexec.c8
-rw-r--r--kernel/kfifo.c28
-rw-r--r--kernel/kmod.c71
-rw-r--r--kernel/latency.c279
-rw-r--r--kernel/lockdep.c17
-rw-r--r--kernel/module.c26
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/params.c15
-rw-r--r--kernel/posix-cpu-timers.c101
-rw-r--r--kernel/posix-timers.c21
-rw-r--r--kernel/ptrace.c1
-rw-r--r--kernel/rcutorture.c8
-rw-r--r--kernel/relay.c36
-rw-r--r--kernel/rtmutex.c51
-rw-r--r--kernel/sched.c76
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/softlockup.c3
-rw-r--r--kernel/spinlock.c15
-rw-r--r--kernel/stop_machine.c3
-rw-r--r--kernel/sys.c89
-rw-r--r--kernel/sys_ni.c5
-rw-r--r--kernel/sysctl.c15
-rw-r--r--kernel/taskstats.c10
-rw-r--r--kernel/time.c173
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/ntp.c350
-rw-r--r--kernel/timer.c283
-rw-r--r--kernel/tsacct.c124
-rw-r--r--kernel/unwind.c4
41 files changed, 1380 insertions, 734 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66c1af..aacaafb28b9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o
+ hrtimer.o rwsem.o latency.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
@@ -49,7 +49,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
-obj-$(CONFIG_TASKSTATS) += taskstats.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 2a7c933651c..0aad5ca36a8 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -483,10 +483,14 @@ static void do_acct_process(struct file *file)
ac.ac_ppid = current->parent->tgid;
#endif
- read_lock(&tasklist_lock); /* pin current->signal */
+ mutex_lock(&tty_mutex);
+ /* FIXME: Whoever is responsible for current->signal locking needs
+ to use the same locking all over the kernel and document it */
+ read_lock(&tasklist_lock);
ac.ac_tty = current->signal->tty ?
old_encode_dev(tty_devnum(current->signal->tty)) : 0;
read_unlock(&tasklist_lock);
+ mutex_unlock(&tty_mutex);
spin_lock_irq(&current->sighand->siglock);
ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
@@ -598,33 +602,3 @@ void acct_process(void)
do_acct_process(file);
fput(file);
}
-
-
-/**
- * acct_update_integrals - update mm integral fields in task_struct
- * @tsk: task_struct for accounting
- */
-void acct_update_integrals(struct task_struct *tsk)
-{
- if (likely(tsk->mm)) {
- long delta =
- cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
-
- if (delta == 0)
- return;
- tsk->acct_stimexpd = tsk->stime;
- tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
- tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
- }
-}
-
-/**
- * acct_clear_integrals - clear the mm integral fields in task_struct
- * @tsk: task_struct whose accounting fields are cleared
- */
-void acct_clear_integrals(struct task_struct *tsk)
-{
- tsk->acct_stimexpd = 0;
- tsk->acct_rss_mem1 = 0;
- tsk->acct_vm_mem1 = 0;
-}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fb83c5cb8c3..10514763175 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -817,6 +817,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_format(ab, " success=%s exit=%ld",
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
+
+ mutex_lock(&tty_mutex);
if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
tty = tsk->signal->tty->name;
else
@@ -838,6 +840,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
context->gid,
context->euid, context->suid, context->fsuid,
context->egid, context->sgid, context->fsgid, tty);
+
+ mutex_unlock(&tty_mutex);
+
audit_log_task_info(ab, tsk);
if (context->filterkey) {
audit_log_format(ab, " key=");
diff --git a/kernel/capability.c b/kernel/capability.c
index c7685ad00a9..edb845a6e84 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -133,7 +133,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
int found = 0;
do_each_thread(g, target) {
- if (target == current || target->pid == 1)
+ if (target == current || is_init(target))
continue;
found = 1;
if (security_capset_check(target, effective, inheritable,
diff --git a/kernel/compat.c b/kernel/compat.c
index 126dee9530a..b4fbd838cd7 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -22,9 +22,12 @@
#include <linux/security.h>
#include <linux/timex.h>
#include <linux/migrate.h>
+#include <linux/posix-timers.h>
#include <asm/uaccess.h>
+extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
+
int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
@@ -601,6 +604,30 @@ long compat_sys_clock_getres(clockid_t which_clock,
return err;
}
+static long compat_clock_nanosleep_restart(struct restart_block *restart)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec tu;
+ struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1);
+
+ restart->arg1 = (unsigned long) &tu;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = clock_nanosleep_restart(restart);
+ set_fs(oldfs);
+
+ if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+ put_compat_timespec(&tu, rmtp))
+ return -EFAULT;
+
+ if (err == -ERESTART_RESTARTBLOCK) {
+ restart->fn = compat_clock_nanosleep_restart;
+ restart->arg1 = (unsigned long) rmtp;
+ }
+ return err;
+}
+
long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
struct compat_timespec __user *rqtp,
struct compat_timespec __user *rmtp)
@@ -608,6 +635,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
long err;
mm_segment_t oldfs;
struct timespec in, out;
+ struct restart_block *restart;
if (get_compat_timespec(&in, rqtp))
return -EFAULT;
@@ -618,9 +646,16 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
(struct timespec __user *) &in,
(struct timespec __user *) &out);
set_fs(oldfs);
+
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
put_compat_timespec(&out, rmtp))
return -EFAULT;
+
+ if (err == -ERESTART_RESTARTBLOCK) {
+ restart = &current_thread_info()->restart_block;
+ restart->fn = compat_clock_nanosleep_restart;
+ restart->arg1 = (unsigned long) rmtp;
+ }
return err;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1b32c2c04c1..9d850ae13b1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -240,7 +240,7 @@ static struct super_block *cpuset_sb;
* A cpuset can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cpusets is empty. Since all
* tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
* always has either children cpusets and/or using tasks. So we don't
* need a special hack to ensure that top_cpuset cannot be deleted.
*
@@ -377,7 +377,7 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directories start off with i_nlink == 2 (for "." entry) */
- inode->i_nlink++;
+ inc_nlink(inode);
} else {
return -ENOMEM;
}
@@ -912,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
int fudge;
int retval;
+ /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
trialcs = *cs;
retval = nodelist_parse(buf, trialcs.mems_allowed);
if (retval < 0)
@@ -1221,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
task_lock(tsk);
oldcs = tsk->cpuset;
- if (!oldcs) {
+ /*
+ * After getting 'oldcs' cpuset ptr, be sure still not exiting.
+ * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
+ * then fail this attach_task(), to avoid breaking top_cpuset.count.
+ */
+ if (tsk->flags & PF_EXITING) {
task_unlock(tsk);
mutex_unlock(&callback_mutex);
put_task_struct(tsk);
@@ -1556,7 +1565,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode)
inode->i_fop = &simple_dir_operations;
/* start off with i_nlink == 2 (for "." entry) */
- inode->i_nlink++;
+ inc_nlink(inode);
} else if (S_ISREG(mode)) {
inode->i_size = 0;
inode->i_fop = &cpuset_file_operations;
@@ -1589,7 +1598,7 @@ static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
error = cpuset_create_file(dentry, S_IFDIR | mode);
if (!error) {
dentry->d_fsdata = cs;
- parent->d_inode->i_nlink++;
+ inc_nlink(parent->d_inode);
cs->dentry = dentry;
}
dput(dentry);
@@ -2024,7 +2033,7 @@ int __init cpuset_init(void)
}
root = cpuset_mount->mnt_sb->s_root;
root->d_fsdata = &top_cpuset;
- root->d_inode->i_nlink++;
+ inc_nlink(root->d_inode);
top_cpuset.dentry = root;
root->d_inode->i_op = &cpuset_dir_inode_operations;
number_of_cpusets = 1;
@@ -2036,33 +2045,104 @@ out:
return err;
}
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
/*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period. This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * This handles CPU hotplug (cpuhp) events. If someday Memory
- * Nodes can be hotplugged (dynamically changing node_online_map)
- * then we should handle that too, perhaps in a similar way.
+ * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then the guarantee_online_cpus()
+ * or guarantee_online_mems() code will use that emptied cpusets
+ * parent online CPUs or nodes. Cpusets that were already empty of
+ * CPUs or nodes are left empty.
+ *
+ * This routine is intentionally inefficient in a couple of regards.
+ * It will check all cpusets in a subtree even if the top cpuset of
+ * the subtree has no offline CPUs or nodes. It checks both CPUs and
+ * nodes, even though the caller could have been coded to know that
+ * only one of CPUs or nodes needed to be checked on a given call.
+ * This was done to minimize text size rather than cpu cycles.
+ *
+ * Call with both manage_mutex and callback_mutex held.
+ *
+ * Recursive, on depth of cpuset subtree.
*/
-#ifdef CONFIG_HOTPLUG_CPU
-static int cpuset_handle_cpuhp(struct notifier_block *nb,
- unsigned long phase, void *cpu)
+static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+{
+ struct cpuset *c;
+
+ /* Each of our child cpusets mems must be online */
+ list_for_each_entry(c, &cur->children, sibling) {
+ guarantee_online_cpus_mems_in_subtree(c);
+ if (!cpus_empty(c->cpus_allowed))
+ guarantee_online_cpus(c, &c->cpus_allowed);
+ if (!nodes_empty(c->mems_allowed))
+ guarantee_online_mems(c, &c->mems_allowed);
+ }
+}
+
+/*
+ * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
+ * cpu_online_map and node_online_map. Force the top cpuset to track
+ * whats online after any CPU or memory node hotplug or unplug event.
+ *
+ * To ensure that we don't remove a CPU or node from the top cpuset
+ * that is currently in use by a child cpuset (which would violate
+ * the rule that cpusets must be subsets of their parent), we first
+ * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ *
+ * Since there are two callers of this routine, one for CPU hotplug
+ * events and one for memory node hotplug events, we could have coded
+ * two separate routines here. We code it as a single common routine
+ * in order to minimize text size.
+ */
+
+static void common_cpu_mem_hotplug_unplug(void)
{
mutex_lock(&manage_mutex);
mutex_lock(&callback_mutex);
+ guarantee_online_cpus_mems_in_subtree(&top_cpuset);
top_cpuset.cpus_allowed = cpu_online_map;
+ top_cpuset.mems_allowed = node_online_map;
mutex_unlock(&callback_mutex);
mutex_unlock(&manage_mutex);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period. This is necessary in order to make cpusets transparent
+ * (of no affect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
+ */
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+ unsigned long phase, void *cpu)
+{
+ common_cpu_mem_hotplug_unplug();
return 0;
}
#endif
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+void cpuset_track_online_nodes()
+{
+ common_cpu_mem_hotplug_unplug();
+}
+#endif
+
/**
* cpuset_init_smp - initialize cpus_allowed
*
diff --git a/kernel/exit.c b/kernel/exit.c
index d891883420f..3b47f26985f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -18,6 +18,7 @@
#include <linux/security.h>
#include <linux/cpu.h>
#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/binfmts.h>
#include <linux/ptrace.h>
@@ -38,6 +39,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
+#include <linux/blkdev.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -219,7 +221,7 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
if (p == ignored_task
|| p->exit_state
- || p->real_parent->pid == 1)
+ || is_init(p->real_parent))
continue;
if (process_group(p->real_parent) != pgrp
&& p->real_parent->signal->session == p->signal->session) {
@@ -249,17 +251,6 @@ static int has_stopped_jobs(int pgrp)
do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
if (p->state != TASK_STOPPED)
continue;
-
- /* If p is stopped by a debugger on a signal that won't
- stop it, then don't count p as stopped. This isn't
- perfect but it's a good approximation. */
- if (unlikely (p->ptrace)
- && p->exit_code != SIGSTOP
- && p->exit_code != SIGTSTP
- && p->exit_code != SIGTTOU
- && p->exit_code != SIGTTIN)
- continue;
-
retval = 1;
break;
} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
@@ -292,9 +283,7 @@ static void reparent_to_init(void)
/* Set the exit signal to SIGCHLD so we signal init on exit */
current->exit_signal = SIGCHLD;
- if ((current->policy == SCHED_NORMAL ||
- current->policy == SCHED_BATCH)
- && (task_nice(current) < 0))
+ if (!has_rt_policy(current) && (task_nice(current) < 0))
set_user_nice(current, 0);
/* cpus_allowed? */
/* rt_priority? */
@@ -487,6 +476,18 @@ void fastcall put_files_struct(struct files_struct *files)
EXPORT_SYMBOL(put_files_struct);
+void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
+{
+ struct files_struct *old;
+
+ old = tsk->files;
+ task_lock(tsk);
+ tsk->files = files;
+ task_unlock(tsk);
+ put_files_struct(old);
+}
+EXPORT_SYMBOL(reset_files_struct);
+
static inline void __exit_files(struct task_struct *tsk)
{
struct files_struct * files = tsk->files;
@@ -954,15 +955,15 @@ fastcall NORET_TYPE void do_exit(long code)
if (tsk->splice_pipe)
__free_pipe_info(tsk->splice_pipe);
- /* PF_DEAD causes final put_task_struct after we schedule. */
preempt_disable();
- BUG_ON(tsk->flags & PF_DEAD);
- tsk->flags |= PF_DEAD;
+ /* causes final put_task_struct in finish_task_switch(). */
+ tsk->state = TASK_DEAD;
schedule();
BUG();
/* Avoid "noreturn function does return". */
- for (;;) ;
+ for (;;)
+ cpu_relax(); /* For when BUG is null */
}
EXPORT_SYMBOL_GPL(do_exit);
@@ -971,7 +972,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
{
if (comp)
complete(comp);
-
+
do_exit(code);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index a0dad84567c..89f666491d1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
@@ -183,7 +184,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
atomic_set(&tsk->fs_excl, 0);
+#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
+#endif
tsk->splice_pipe = NULL;
return tsk;
}
@@ -1061,7 +1064,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ p->hardirqs_enabled = 1;
+#else
p->hardirqs_enabled = 0;
+#endif
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
@@ -1144,7 +1151,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */
-
p->parent_exec_id = p->self_exec_id;
/* ok, now we should be set up.. */
@@ -1167,6 +1173,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
+ /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
+ p->ioprio = current->ioprio;
+
/*
* The task hasn't been attached yet, so its cpus_allowed mask will
* not be changed, nor will its assigned CPU.
@@ -1226,11 +1235,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}
}
- /*
- * inherit ioprio
- */
- p->ioprio = current->ioprio;
-
if (likely(p->pid)) {
add_parent(p);
if (unlikely(p->ptrace & PT_PTRACED))
diff --git a/kernel/futex.c b/kernel/futex.c
index 9d260e838cf..4b6770e9806 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -389,7 +389,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
{
struct task_struct *p;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
p = find_task_by_pid(pid);
if (!p)
goto out_unlock;
@@ -403,7 +403,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
}
get_task_struct(p);
out_unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return p;
}
@@ -1624,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
struct task_struct *p;
ret = -ESRCH;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
p = find_task_by_pid(pid);
if (!p)
goto err_unlock;
@@ -1633,7 +1633,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
!capable(CAP_SYS_PTRACE))
goto err_unlock;
head = p->robust_list;
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
}
if (put_user(sizeof(*head), len_ptr))
@@ -1641,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
return put_user(head, head_ptr);
err_unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return ret;
}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 21c38a7e666..d0ba190dfeb 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -693,7 +693,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
return t->task == NULL;
}
-static long __sched nanosleep_restart(struct restart_block *restart)
+long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
struct hrtimer_sleeper t;
struct timespec __user *rmtp;
@@ -702,13 +702,13 @@ static long __sched nanosleep_restart(struct restart_block *restart)
restart->fn = do_no_restart_syscall;
- hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
- t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
+ hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS);
+ t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
if (do_nanosleep(&t, HRTIMER_ABS))
return 0;
- rmtp = (struct timespec __user *) restart->arg2;
+ rmtp = (struct timespec __user *) restart->arg1;
if (rmtp) {
time = ktime_sub(t.timer.expires, t.timer.base->get_time());
if (time.tv64 <= 0)
@@ -718,7 +718,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
return -EFAULT;
}
- restart->fn = nanosleep_restart;
+ restart->fn = hrtimer_nanosleep_restart;
/* The other values in restart are already filled in */
return -ERESTART_RESTARTBLOCK;
@@ -751,11 +751,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
}
restart = &current_thread_info()->restart_block;
- restart->fn = nanosleep_restart;
- restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
- restart->arg1 = t.timer.expires.tv64 >> 32;
- restart->arg2 = (unsigned long) rmtp;
- restart->arg3 = (unsigned long) t.timer.base->index;
+ restart->fn = hrtimer_nanosleep_restart;
+ restart->arg0 = (unsigned long) t.timer.base->index;
+ restart->arg1 = (unsigned long) rmtp;
+ restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF;
+ restart->arg3 = t.timer.expires.tv64 >> 32;
return -ERESTART_RESTARTBLOCK;
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ac1f850d493..736cb0bd498 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -40,10 +40,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
spin_lock_irqsave(&desc->lock, flags);
irq_chip_set_defaults(chip);
desc->chip = chip;
- /*
- * For compatibility only:
- */
- desc->chip = chip;
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
@@ -146,7 +142,7 @@ static void default_disable(unsigned int irq)
struct irq_desc *desc = irq_desc + irq;
if (!(desc->status & IRQ_DELAYED_DISABLE))
- irq_desc[irq].chip->mask(irq);
+ desc->chip->mask(irq);
}
/*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 50087ecf337..fcdd5d2bc3f 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -40,7 +40,7 @@ struct resource crashk_res = {
int kexec_should_crash(struct task_struct *p)
{
- if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+ if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
return 1;
return 0;
}
@@ -995,7 +995,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
image = xchg(dest_image, image);
out:
- xchg(&kexec_lock, 0); /* Release the mutex */
+ locked = xchg(&kexec_lock, 0); /* Release the mutex */
+ BUG_ON(!locked);
kimage_free(image);
return result;
@@ -1061,7 +1062,8 @@ void crash_kexec(struct pt_regs *regs)
machine_crash_shutdown(&fixed_regs);
machine_kexec(kexec_crash_image);
}
- xchg(&kexec_lock, 0);
+ locked = xchg(&kexec_lock, 0);
+ BUG_ON(!locked);
}
}
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 64ab045c3d9..5d1d907378a 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -122,6 +122,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
len = min(len, fifo->size - fifo->in + fifo->out);
+ /*
+ * Ensure that we sample the fifo->out index -before- we
+ * start putting bytes into the kfifo.
+ */
+
+ smp_mb();
+
/* first put the data starting from fifo->in to buffer end */
l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
@@ -129,6 +136,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
/* then put the rest (if any) at the beginning of the buffer */
memcpy(fifo->buffer, buffer + l, len - l);
+ /*
+ * Ensure that we add the bytes to the kfifo -before-
+ * we update the fifo->in index.
+ */
+
+ smp_wmb();
+
fifo->in += len;
return len;
@@ -154,6 +168,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
len = min(len, fifo->in - fifo->out);
+ /*
+ * Ensure that we sample the fifo->in index -before- we
+ * start removing bytes from the kfifo.
+ */
+
+ smp_rmb();
+
/* first get the data from fifo->out until the end of the buffer */
l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
@@ -161,6 +182,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
/* then get the rest (if any) from the beginning of the buffer */
memcpy(buffer + l, fifo->buffer, len - l);
+ /*
+ * Ensure that we remove the bytes from the kfifo -before-
+ * we update the fifo->out index.
+ */
+
+ smp_mb();
+
fifo->out += len;
return len;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5c470c57fb5..f8121b95183 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -35,6 +35,7 @@
#include <linux/mount.h>
#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/resource.h>
#include <asm/uaccess.h>
extern int max_threads;
@@ -122,6 +123,7 @@ struct subprocess_info {
struct key *ring;
int wait;
int retval;
+ struct file *stdin;
};
/*
@@ -145,12 +147,29 @@ static int ____call_usermodehelper(void *data)
key_put(old_session);
+ /* Install input pipe when needed */
+ if (sub_info->stdin) {
+ struct files_struct *f = current->files;
+ struct fdtable *fdt;
+ /* no races because files should be private here */
+ sys_close(0);
+ fd_install(0, sub_info->stdin);
+ spin_lock(&f->file_lock);
+ fdt = files_fdtable(f);
+ FD_SET(0, fdt->open_fds);
+ FD_CLR(0, fdt->close_on_exec);
+ spin_unlock(&f->file_lock);
+
+ /* and disallow core files too */
+ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
+ }
+
/* We can run anywhere, unlike our parent keventd(). */
set_cpus_allowed(current, CPU_MASK_ALL);
retval = -EPERM;
if (current->fs->root)
- retval = execve(sub_info->path, sub_info->argv,sub_info->envp);
+ retval = execve(sub_info->path, sub_info->argv, sub_info->envp);
/* Exec failed? */
sub_info->retval = retval;
@@ -176,6 +195,8 @@ static int wait_for_helper(void *data)
if (pid < 0) {
sub_info->retval = pid;
} else {
+ int ret;
+
/*
* Normally it is bogus to call wait4() from in-kernel because
* wait4() wants to write the exit code to a userspace address.
@@ -185,7 +206,15 @@ static int wait_for_helper(void *data)
*
* Thus the __user pointer cast is valid here.
*/
- sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL);
+ sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+ /*
+ * If ret is 0, either ____call_usermodehelper failed and the
+ * real error code is already in sub_info->retval or
+ * sub_info->retval is 0 anyway, so don't mess with it then.
+ */
+ if (ret)
+ sub_info->retval = ret;
}
complete(sub_info->complete);
@@ -258,6 +287,44 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
}
EXPORT_SYMBOL(call_usermodehelper_keys);
+int call_usermodehelper_pipe(char *path, char **argv, char **envp,
+ struct file **filp)
+{
+ DECLARE_COMPLETION(done);
+ struct subprocess_info sub_info = {
+ .complete = &done,
+ .path = path,
+ .argv = argv,
+ .envp = envp,
+ .retval = 0,
+ };
+ struct file *f;
+ DECLARE_WORK(work, __call_usermodehelper, &sub_info);
+
+ if (!khelper_wq)
+ return -EBUSY;
+
+ if (path[0] == '\0')
+ return 0;
+
+ f = create_write_pipe();
+ if (!f)
+ return -ENOMEM;
+ *filp = f;
+
+ f = create_read_pipe(f);
+ if (!f) {
+ free_write_pipe(*filp);
+ return -ENOMEM;
+ }
+ sub_info.stdin = f;
+
+ queue_work(khelper_wq, &work);
+ wait_for_completion(&done);
+ return sub_info.retval;
+}
+EXPORT_SYMBOL(call_usermodehelper_pipe);
+
void __init usermodehelper_init(void)
{
khelper_wq = create_singlethread_workqueue("khelper");
diff --git a/kernel/latency.c b/kernel/latency.c
new file mode 100644
index 00000000000..258f2555abb
--- /dev/null
+++ b/kernel/latency.c
@@ -0,0 +1,279 @@
+/*
+ * latency.c: Explicit system-wide latency-expectation infrastructure
+ *
+ * The purpose of this infrastructure is to allow device drivers to set
+ * latency constraint they have and to collect and summarize these
+ * expectations globally. The cummulated result can then be used by
+ * power management and similar users to make decisions that have
+ * tradoffs with a latency component.
+ *
+ * An example user of this are the x86 C-states; each higher C state saves
+ * more power, but has a higher exit latency. For the idle loop power
+ * code to make a good decision which C-state to use, information about
+ * acceptable latencies is required.
+ *
+ * An example announcer of latency is an audio driver that knowns it
+ * will get an interrupt when the hardware has 200 usec of samples
+ * left in the DMA buffer; in that case the driver can set a latency
+ * constraint of, say, 150 usec.
+ *
+ * Multiple drivers can each announce their maximum accepted latency,
+ * to keep these appart, a string based identifier is used.
+ *
+ *
+ * (C) Copyright 2006 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/latency.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <asm/atomic.h>
+
+struct latency_info {
+ struct list_head list;
+ int usecs;
+ char *identifier;
+};
+
+/*
+ * locking rule: all modifications to current_max_latency and
+ * latency_list need to be done while holding the latency_lock.
+ * latency_lock needs to be taken _irqsave.
+ */
+static atomic_t current_max_latency;
+static DEFINE_SPINLOCK(latency_lock);
+
+static LIST_HEAD(latency_list);
+static BLOCKING_NOTIFIER_HEAD(latency_notifier);
+
+/*
+ * This function returns the maximum latency allowed, which
+ * happens to be the minimum of all maximum latencies on the
+ * list.
+ */
+static int __find_max_latency(void)
+{
+ int min = INFINITE_LATENCY;
+ struct latency_info *info;
+
+ list_for_each_entry(info, &latency_list, list) {
+ if (info->usecs < min)
+ min = info->usecs;
+ }
+ return min;
+}
+
+/**
+ * set_acceptable_latency - sets the maximum latency acceptable
+ * @identifier: string that identifies this driver
+ * @usecs: maximum acceptable latency for this driver
+ *
+ * This function informs the kernel that this device(driver)
+ * can accept at most usecs latency. This setting is used for
+ * power management and similar tradeoffs.
+ *
+ * This function sleeps and can only be called from process
+ * context.
+ * Calling this function with an existing identifier is valid
+ * and will cause the existing latency setting to be changed.
+ */
+void set_acceptable_latency(char *identifier, int usecs)
+{
+ struct latency_info *info, *iter;
+ unsigned long flags;
+ int found_old = 0;
+
+ info = kzalloc(sizeof(struct latency_info), GFP_KERNEL);
+ if (!info)
+ return;
+ info->usecs = usecs;
+ info->identifier = kstrdup(identifier, GFP_KERNEL);
+ if (!info->identifier)
+ goto free_info;
+
+ spin_lock_irqsave(&latency_lock, flags);
+ list_for_each_entry(iter, &latency_list, list) {
+ if (strcmp(iter->identifier, identifier)==0) {
+ found_old = 1;
+ iter->usecs = usecs;
+ break;
+ }
+ }
+ if (!found_old)
+ list_add(&info->list, &latency_list);
+
+ if (usecs < atomic_read(&current_max_latency))
+ atomic_set(&current_max_latency, usecs);
+
+ spin_unlock_irqrestore(&latency_lock, flags);
+
+ blocking_notifier_call_chain(&latency_notifier,
+ atomic_read(&current_max_latency), NULL);
+
+ /*
+ * if we inserted the new one, we're done; otherwise there was
+ * an existing one so we need to free the redundant data
+ */
+ if (!found_old)
+ return;
+
+ kfree(info->identifier);
+free_info:
+ kfree(info);
+}
+EXPORT_SYMBOL_GPL(set_acceptable_latency);
+
+/**
+ * modify_acceptable_latency - changes the maximum latency acceptable
+ * @identifier: string that identifies this driver
+ * @usecs: maximum acceptable latency for this driver
+ *
+ * This function informs the kernel that this device(driver)
+ * can accept at most usecs latency. This setting is used for
+ * power management and similar tradeoffs.
+ *
+ * This function does not sleep and can be called in any context.
+ * Trying to use a non-existing identifier silently gets ignored.
+ *
+ * Due to the atomic nature of this function, the modified latency
+ * value will only be used for future decisions; past decisions
+ * can still lead to longer latencies in the near future.
+ */
+void modify_acceptable_latency(char *identifier, int usecs)
+{
+ struct latency_info *iter;
+ unsigned long flags;
+
+ spin_lock_irqsave(&latency_lock, flags);
+ list_for_each_entry(iter, &latency_list, list) {
+ if (strcmp(iter->identifier, identifier) == 0) {
+ iter->usecs = usecs;
+ break;
+ }
+ }
+ if (usecs < atomic_read(&current_max_latency))
+ atomic_set(&current_max_latency, usecs);
+ spin_unlock_irqrestore(&latency_lock, flags);
+}
+EXPORT_SYMBOL_GPL(modify_acceptable_latency);
+
+/**
+ * remove_acceptable_latency - removes the maximum latency acceptable
+ * @identifier: string that identifies this driver
+ *
+ * This function removes a previously set maximum latency setting
+ * for the driver and frees up any resources associated with the
+ * bookkeeping needed for this.
+ *
+ * This function does not sleep and can be called in any context.
+ * Trying to use a non-existing identifier silently gets ignored.
+ */
+void remove_acceptable_latency(char *identifier)
+{
+ unsigned long flags;
+ int newmax = 0;
+ struct latency_info *iter, *temp;
+
+ spin_lock_irqsave(&latency_lock, flags);
+
+ list_for_each_entry_safe(iter, temp, &latency_list, list) {
+ if (strcmp(iter->identifier, identifier) == 0) {
+ list_del(&iter->list);
+ newmax = iter->usecs;
+ kfree(iter->identifier);
+ kfree(iter);
+ break;
+ }
+ }
+
+ /* If we just deleted the system wide value, we need to
+ * recalculate with a full search
+ */
+ if (newmax == atomic_read(&current_max_latency)) {
+ newmax = __find_max_latency();
+ atomic_set(&current_max_latency, newmax);
+ }
+ spin_unlock_irqrestore(&latency_lock, flags);
+}
+EXPORT_SYMBOL_GPL(remove_acceptable_latency);
+
+/**
+ * system_latency_constraint - queries the system wide latency maximum
+ *
+ * This function returns the system wide maximum latency in
+ * microseconds.
+ *
+ * This function does not sleep and can be called in any context.
+ */
+int system_latency_constraint(void)
+{
+ return atomic_read(&current_max_latency);
+}
+EXPORT_SYMBOL_GPL(system_latency_constraint);
+
+/**
+ * synchronize_acceptable_latency - recalculates all latency decisions
+ *
+ * This function will cause a callback to various kernel pieces that
+ * will make those pieces rethink their latency decisions. This implies
+ * that if there are overlong latencies in hardware state already, those
+ * latencies get taken right now. When this call completes no overlong
+ * latency decisions should be active anymore.
+ *
+ * Typical usecase of this is after a modify_acceptable_latency() call,
+ * which in itself is non-blocking and non-synchronizing.
+ *
+ * This function blocks and should not be called with locks held.
+ */
+
+void synchronize_acceptable_latency(void)
+{
+ blocking_notifier_call_chain(&latency_notifier,
+ atomic_read(&current_max_latency), NULL);
+}
+EXPORT_SYMBOL_GPL(synchronize_acceptable_latency);
+
+/*
+ * Latency notifier: this notifier gets called when a non-atomic new
+ * latency value gets set. The expectation nof the caller of the
+ * non-atomic set is that when the call returns, future latencies
+ * are within bounds, so the functions on the notifier list are
+ * expected to take the overlong latencies immediately, inside the
+ * callback, and not make a overlong latency decision anymore.
+ *
+ * The callback gets called when the new latency value is made
+ * active so system_latency_constraint() returns the new latency.
+ */
+int register_latency_notifier(struct notifier_block * nb)
+{
+ return blocking_notifier_chain_register(&latency_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(register_latency_notifier);
+
+int unregister_latency_notifier(struct notifier_block * nb)
+{
+ return blocking_notifier_chain_unregister(&latency_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_latency_notifier);
+
+static __init int latency_init(void)
+{
+ atomic_set(&current_max_latency, INFINITE_LATENCY);
+ /*
+ * we don't want by default to have longer latencies than 2 ticks,
+ * since that would cause lost ticks
+ */
+ set_acceptable_latency("kernel", 2*1000000/HZ);
+ return 0;
+}
+
+module_init(latency_init);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c088e5542e8..e596525669e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -36,6 +36,7 @@
#include <linux/stacktrace.h>
#include <linux/debug_locks.h>
#include <linux/irqflags.h>
+#include <linux/utsname.h>
#include <asm/sections.h>
@@ -121,8 +122,8 @@ static struct list_head chainhash_table[CHAINHASH_SIZE];
* unique.
*/
#define iterate_chain_key(key1, key2) \
- (((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \
- ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \
+ (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
+ ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
(key2))
void lockdep_off(void)
@@ -515,6 +516,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
return 0;
}
+static void print_kernel_version(void)
+{
+ printk("%s %.*s\n", system_utsname.release,
+ (int)strcspn(system_utsname.version, " "),
+ system_utsname.version);
+}
+
/*
* When a circular dependency is detected, print the
* header first:
@@ -531,6 +539,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
printk("\n=======================================================\n");
printk( "[ INFO: possible circular locking dependency detected ]\n");
+ print_kernel_version();
printk( "-------------------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, curr->pid);
@@ -712,6 +721,7 @@ print_bad_irq_dependency(struct task_struct *curr,
printk("\n======================================================\n");
printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
irqclass, irqclass);
+ print_kernel_version();
printk( "------------------------------------------------------\n");
printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, curr->pid,
@@ -793,6 +803,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
printk("\n=============================================\n");
printk( "[ INFO: possible recursive locking detected ]\n");
+ print_kernel_version();
printk( "---------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, curr->pid);
@@ -1375,6 +1386,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
printk("\n=========================================================\n");
printk( "[ INFO: possible irq lock inversion dependency detected ]\n");
+ print_kernel_version();
printk( "---------------------------------------------------------\n");
printk("%s/%d just changed the state of lock:\n",
curr->comm, curr->pid);
@@ -1469,6 +1481,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
printk("\n=================================\n");
printk( "[ INFO: inconsistent lock state ]\n");
+ print_kernel_version();
printk( "---------------------------------\n");
printk("inconsistent {%s} -> {%s} usage.\n",
diff --git a/kernel/module.c b/kernel/module.c
index b7fe6e84096..05625d5dc75 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -933,6 +933,15 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
return sprintf(buf, "0x%lx\n", sattr->address);
}
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+ int section;
+
+ for (section = 0; section < sect_attrs->nsections; section++)
+ kfree(sect_attrs->attrs[section].name);
+ kfree(sect_attrs);
+}
+
static void add_sect_attrs(struct module *mod, unsigned int nsect,
char *secstrings, Elf_Shdr *sechdrs)
{
@@ -949,21 +958,26 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
+ nloaded * sizeof(sect_attrs->attrs[0]),
sizeof(sect_attrs->grp.attrs[0]));
size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
- if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
+ sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+ if (sect_attrs == NULL)
return;
/* Setup section attributes. */
sect_attrs->grp.name = "sections";
sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
+ sect_attrs->nsections = 0;
sattr = &sect_attrs->attrs[0];
gattr = &sect_attrs->grp.attrs[0];
for (i = 0; i < nsect; i++) {
if (! (sechdrs[i].sh_flags & SHF_ALLOC))
continue;
sattr->address = sechdrs[i].sh_addr;
- strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
- MODULE_SECT_NAME_LEN);
+ sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
+ GFP_KERNEL);
+ if (sattr->name == NULL)
+ goto out;
+ sect_attrs->nsections++;
sattr->mattr.show = module_sect_show;
sattr->mattr.store = NULL;
sattr->mattr.attr.name = sattr->name;
@@ -979,7 +993,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
mod->sect_attrs = sect_attrs;
return;
out:
- kfree(sect_attrs);
+ free_sect_attrs(sect_attrs);
}
static void remove_sect_attrs(struct module *mod)
@@ -989,13 +1003,13 @@ static void remove_sect_attrs(struct module *mod)
&mod->sect_attrs->grp);
/* We are positive that no one is using any sect attrs
* at this point. Deallocate immediately. */
- kfree(mod->sect_attrs);
+ free_sect_attrs(mod->sect_attrs);
mod->sect_attrs = NULL;
}
}
-
#else
+
static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
char *sectstrings, Elf_Shdr *sechdrs)
{
diff --git a/kernel/panic.c b/kernel/panic.c
index 6ceb664fb52..525e365f723 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,7 +21,6 @@
#include <linux/debug_locks.h>
int panic_on_oops;
-int panic_on_unrecovered_nmi;
int tainted;
static int pause_on_oops;
static int pause_on_oops_flag;
diff --git a/kernel/params.c b/kernel/params.c
index 91aea7aa532..f406655d665 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -547,6 +547,7 @@ static void __init kernel_param_sysfs_setup(const char *name,
unsigned int name_skip)
{
struct module_kobject *mk;
+ int ret;
mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
BUG_ON(!mk);
@@ -554,7 +555,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
mk->mod = THIS_MODULE;
kobj_set_kset_s(mk, module_subsys);
kobject_set_name(&mk->kobj, name);
- kobject_register(&mk->kobj);
+ ret = kobject_register(&mk->kobj);
+ BUG_ON(ret < 0);
/* no need to keep the kobject if no parameter is exported */
if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
@@ -684,13 +686,20 @@ decl_subsys(module, &module_ktype, NULL);
*/
static int __init param_sysfs_init(void)
{
- subsystem_register(&module_subsys);
+ int ret;
+
+ ret = subsystem_register(&module_subsys);
+ if (ret < 0) {
+ printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n",
+ __FILE__, __LINE__, ret);
+ return ret;
+ }
param_sysfs_builtin();
return 0;
}
-__initcall(param_sysfs_init);
+subsys_initcall(param_sysfs_init);
EXPORT_SYMBOL(param_set_byte);
EXPORT_SYMBOL(param_get_byte);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index d38d9ec3276..479b16b44f7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1393,25 +1393,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
}
}
-static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
-
-int posix_cpu_nsleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp, struct timespec __user *rmtp)
+static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct itimerspec *it)
{
- struct restart_block *restart_block =
- &current_thread_info()->restart_block;
struct k_itimer timer;
int error;
/*
- * Diagnose required errors first.
- */
- if (CPUCLOCK_PERTHREAD(which_clock) &&
- (CPUCLOCK_PID(which_clock) == 0 ||
- CPUCLOCK_PID(which_clock) == current->pid))
- return -EINVAL;
-
- /*
* Set up a temporary timer and then wait for it to go off.
*/
memset(&timer, 0, sizeof timer);
@@ -1422,11 +1410,12 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
timer.it_process = current;
if (!error) {
static struct itimerspec zero_it;
- struct itimerspec it = { .it_value = *rqtp,
- .it_interval = {} };
+
+ memset(it, 0, sizeof *it);
+ it->it_value = *rqtp;
spin_lock_irq(&timer.it_lock);
- error = posix_cpu_timer_set(&timer, flags, &it, NULL);
+ error = posix_cpu_timer_set(&timer, flags, it, NULL);
if (error) {
spin_unlock_irq(&timer.it_lock);
return error;
@@ -1454,49 +1443,89 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
* We were interrupted by a signal.
*/
sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
- posix_cpu_timer_set(&timer, 0, &zero_it, &it);
+ posix_cpu_timer_set(&timer, 0, &zero_it, it);
spin_unlock_irq(&timer.it_lock);
- if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
+ if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
/*
* It actually did fire already.
*/
return 0;
}
+ error = -ERESTART_RESTARTBLOCK;
+ }
+
+ return error;
+}
+
+int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct timespec __user *rmtp)
+{
+ struct restart_block *restart_block =
+ &current_thread_info()->restart_block;
+ struct itimerspec it;
+ int error;
+
+ /*
+ * Diagnose required errors first.
+ */
+ if (CPUCLOCK_PERTHREAD(which_clock) &&
+ (CPUCLOCK_PID(which_clock) == 0 ||
+ CPUCLOCK_PID(which_clock) == current->pid))
+ return -EINVAL;
+
+ error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
+
+ if (error == -ERESTART_RESTARTBLOCK) {
+
+ if (flags & TIMER_ABSTIME)
+ return -ERESTARTNOHAND;
/*
- * Report back to the user the time still remaining.
- */
- if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
- copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
return -EFAULT;
- restart_block->fn = posix_cpu_clock_nanosleep_restart;
- /* Caller already set restart_block->arg1 */
+ restart_block->fn = posix_cpu_nsleep_restart;
restart_block->arg0 = which_clock;
restart_block->arg1 = (unsigned long) rmtp;
restart_block->arg2 = rqtp->tv_sec;
restart_block->arg3 = rqtp->tv_nsec;
-
- error = -ERESTART_RESTARTBLOCK;
}
-
return error;
}
-static long
-posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
+long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
clockid_t which_clock = restart_block->arg0;
struct timespec __user *rmtp;
struct timespec t;
+ struct itimerspec it;
+ int error;
rmtp = (struct timespec __user *) restart_block->arg1;
t.tv_sec = restart_block->arg2;
t.tv_nsec = restart_block->arg3;
restart_block->fn = do_no_restart_syscall;
- return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp);
+ error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
+
+ if (error == -ERESTART_RESTARTBLOCK) {
+ /*
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ return -EFAULT;
+
+ restart_block->fn = posix_cpu_nsleep_restart;
+ restart_block->arg0 = which_clock;
+ restart_block->arg1 = (unsigned long) rmtp;
+ restart_block->arg2 = t.tv_sec;
+ restart_block->arg3 = t.tv_nsec;
+ }
+ return error;
+
}
@@ -1524,6 +1553,10 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags,
{
return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
}
+static long process_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+ return -EINVAL;
+}
static int thread_cpu_clock_getres(const clockid_t which_clock,
struct timespec *tp)
{
@@ -1544,6 +1577,10 @@ static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
{
return -EINVAL;
}
+static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+ return -EINVAL;
+}
static __init int init_posix_cpu_timers(void)
{
@@ -1553,6 +1590,7 @@ static __init int init_posix_cpu_timers(void)
.clock_set = do_posix_clock_nosettime,
.timer_create = process_cpu_timer_create,
.nsleep = process_cpu_nsleep,
+ .nsleep_restart = process_cpu_nsleep_restart,
};
struct k_clock thread = {
.clock_getres = thread_cpu_clock_getres,
@@ -1560,6 +1598,7 @@ static __init int init_posix_cpu_timers(void)
.clock_set = do_posix_clock_nosettime,
.timer_create = thread_cpu_timer_create,
.nsleep = thread_cpu_nsleep,
+ .nsleep_restart = thread_cpu_nsleep_restart,
};
register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ac6dc874442..e5ebcc1ec3a 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -973,3 +973,24 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags,
return CLOCK_DISPATCH(which_clock, nsleep,
(which_clock, flags, &t, rmtp));
}
+
+/*
+ * nanosleep_restart for monotonic and realtime clocks
+ */
+static int common_nsleep_restart(struct restart_block *restart_block)
+{
+ return hrtimer_nanosleep_restart(restart_block);
+}
+
+/*
+ * This will restart clock_nanosleep. This is required only by
+ * compat_clock_nanosleep_restart for now.
+ */
+long
+clock_nanosleep_restart(struct restart_block *restart_block)
+{
+ clockid_t which_clock = restart_block->arg0;
+
+ return CLOCK_DISPATCH(which_clock, nsleep_restart,
+ (restart_block));
+}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8aad0331d82..4d50e06fd74 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -440,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
child = find_task_by_pid(pid);
if (child)
get_task_struct(child);
+
read_unlock(&tasklist_lock);
if (!child)
return ERR_PTR(-ESRCH);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 4d1c3d24712..4f2c4272d59 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -192,13 +192,13 @@ static struct rcu_torture_ops *cur_ops = NULL;
* Definitions for rcu torture testing.
*/
-static int rcu_torture_read_lock(void)
+static int rcu_torture_read_lock(void) __acquires(RCU)
{
rcu_read_lock();
return 0;
}
-static void rcu_torture_read_unlock(int idx)
+static void rcu_torture_read_unlock(int idx) __releases(RCU)
{
rcu_read_unlock();
}
@@ -250,13 +250,13 @@ static struct rcu_torture_ops rcu_ops = {
* Definitions for rcu_bh torture testing.
*/
-static int rcu_bh_torture_read_lock(void)
+static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
{
rcu_read_lock_bh();
return 0;
}
-static void rcu_bh_torture_read_unlock(int idx)
+static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
{
rcu_read_unlock_bh();
}
diff --git a/kernel/relay.c b/kernel/relay.c
index 85786ff2a4f..1d63ecddfa7 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -95,7 +95,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
* @buf: the buffer struct
* @size: total size of the buffer
*
- * Returns a pointer to the resulting buffer, NULL if unsuccessful. The
+ * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
* passed in size will get page aligned, if it isn't already.
*/
static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
@@ -132,10 +132,9 @@ depopulate:
/**
* relay_create_buf - allocate and initialize a channel buffer
- * @alloc_size: size of the buffer to allocate
- * @n_subbufs: number of sub-buffers in the channel
+ * @chan: the relay channel
*
- * Returns channel buffer if successful, NULL otherwise
+ * Returns channel buffer if successful, %NULL otherwise.
*/
struct rchan_buf *relay_create_buf(struct rchan *chan)
{
@@ -163,6 +162,7 @@ free_buf:
/**
* relay_destroy_channel - free the channel struct
+ * @kref: target kernel reference that contains the relay channel
*
* Should only be called from kref_put().
*/
@@ -194,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
/**
* relay_remove_buf - remove a channel buffer
+ * @kref: target kernel reference that contains the relay buffer
*
* Removes the file from the fileystem, which also frees the
* rchan_buf_struct and the channel buffer. Should only be called from
@@ -374,7 +375,7 @@ void relay_reset(struct rchan *chan)
}
EXPORT_SYMBOL_GPL(relay_reset);
-/**
+/*
* relay_open_buf - create a new relay channel buffer
*
* Internal - used by relay_open().
@@ -448,12 +449,12 @@ static inline void setup_callbacks(struct rchan *chan,
/**
* relay_open - create a new relay channel
* @base_filename: base name of files to create
- * @parent: dentry of parent directory, NULL for root directory
+ * @parent: dentry of parent directory, %NULL for root directory
* @subbuf_size: size of sub-buffers
* @n_subbufs: number of sub-buffers
* @cb: client callback functions
*
- * Returns channel pointer if successful, NULL otherwise.
+ * Returns channel pointer if successful, %NULL otherwise.
*
* Creates a channel buffer for each cpu using the sizes and
* attributes specified. The created channel buffer files
@@ -585,7 +586,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf);
* subbufs_consumed should be the number of sub-buffers newly consumed,
* not the total consumed.
*
- * NOTE: kernel clients don't need to call this function if the channel
+ * NOTE: Kernel clients don't need to call this function if the channel
* mode is 'overwrite'.
*/
void relay_subbufs_consumed(struct rchan *chan,
@@ -641,7 +642,7 @@ EXPORT_SYMBOL_GPL(relay_close);
* relay_flush - close the channel
* @chan: the channel
*
- * Flushes all channel buffers i.e. forces buffer switch.
+ * Flushes all channel buffers, i.e. forces buffer switch.
*/
void relay_flush(struct rchan *chan)
{
@@ -729,7 +730,7 @@ static int relay_file_release(struct inode *inode, struct file *filp)
return 0;
}
-/**
+/*
* relay_file_read_consume - update the consumed count for the buffer
*/
static void relay_file_read_consume(struct rchan_buf *buf,
@@ -756,7 +757,7 @@ static void relay_file_read_consume(struct rchan_buf *buf,
}
}
-/**
+/*
* relay_file_read_avail - boolean, are there unconsumed bytes available?
*/
static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
@@ -793,6 +794,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
/**
* relay_file_read_subbuf_avail - return bytes available in sub-buffer
+ * @read_pos: file read position
+ * @buf: relay channel buffer
*/
static size_t relay_file_read_subbuf_avail(size_t read_pos,
struct rchan_buf *buf)
@@ -818,6 +821,8 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
/**
* relay_file_read_start_pos - find the first available byte to read
+ * @read_pos: file read position
+ * @buf: relay channel buffer
*
* If the read_pos is in the middle of padding, return the
* position of the first actually available byte, otherwise
@@ -844,6 +849,9 @@ static size_t relay_file_read_start_pos(size_t read_pos,
/**
* relay_file_read_end_pos - return the new read position
+ * @read_pos: file read position
+ * @buf: relay channel buffer
+ * @count: number of bytes to be read
*/
static size_t relay_file_read_end_pos(struct rchan_buf *buf,
size_t read_pos,
@@ -865,7 +873,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
return end_pos;
}
-/**
+/*
* subbuf_read_actor - read up to one subbuf's worth of data
*/
static int subbuf_read_actor(size_t read_start,
@@ -890,7 +898,7 @@ static int subbuf_read_actor(size_t read_start,
return ret;
}
-/**
+/*
* subbuf_send_actor - send up to one subbuf's worth of data
*/
static int subbuf_send_actor(size_t read_start,
@@ -933,7 +941,7 @@ typedef int (*subbuf_actor_t) (size_t read_start,
read_descriptor_t *desc,
read_actor_t actor);
-/**
+/*
* relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
*/
static inline ssize_t relay_file_read_subbufs(struct file *filp,
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 3e13a1e5856..4ab17da46fd 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -251,6 +251,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/* Grab the next task */
task = rt_mutex_owner(lock);
+ get_task_struct(task);
spin_lock_irqsave(&task->pi_lock, flags);
if (waiter == rt_mutex_top_waiter(lock)) {
@@ -269,7 +270,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
__rt_mutex_adjust_prio(task);
}
- get_task_struct(task);
spin_unlock_irqrestore(&task->pi_lock, flags);
top_waiter = rt_mutex_top_waiter(lock);
@@ -409,7 +409,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
unsigned long flags;
- int boost = 0, res;
+ int chain_walk = 0, res;
spin_lock_irqsave(&current->pi_lock, flags);
__rt_mutex_adjust_prio(current);
@@ -433,25 +433,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
__rt_mutex_adjust_prio(owner);
- if (owner->pi_blocked_on) {
- boost = 1;
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(owner);
- }
- spin_unlock_irqrestore(&owner->pi_lock, flags);
- }
- else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
- spin_lock_irqsave(&owner->pi_lock, flags);
- if (owner->pi_blocked_on) {
- boost = 1;
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(owner);
- }
+ if (owner->pi_blocked_on)
+ chain_walk = 1;
spin_unlock_irqrestore(&owner->pi_lock, flags);
}
- if (!boost)
+ else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
+ chain_walk = 1;
+
+ if (!chain_walk)
return 0;
+ /*
+ * The owner can't disappear while holding a lock,
+ * so the owner struct is protected by wait_lock.
+ * Gets dropped in rt_mutex_adjust_prio_chain()!
+ */
+ get_task_struct(owner);
+
spin_unlock(&lock->wait_lock);
res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
@@ -532,7 +530,7 @@ static void remove_waiter(struct rt_mutex *lock,
int first = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
unsigned long flags;
- int boost = 0;
+ int chain_walk = 0;
spin_lock_irqsave(&current->pi_lock, flags);
plist_del(&waiter->list_entry, &lock->wait_list);
@@ -554,19 +552,20 @@ static void remove_waiter(struct rt_mutex *lock,
}
__rt_mutex_adjust_prio(owner);
- if (owner->pi_blocked_on) {
- boost = 1;
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(owner);
- }
+ if (owner->pi_blocked_on)
+ chain_walk = 1;
+
spin_unlock_irqrestore(&owner->pi_lock, flags);
}
WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
- if (!boost)
+ if (!chain_walk)
return;
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
+
spin_unlock(&lock->wait_lock);
rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
@@ -592,10 +591,10 @@ void rt_mutex_adjust_pi(struct task_struct *task)
return;
}
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(task);
spin_unlock_irqrestore(&task->pi_lock, flags);
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 5c848fd4e46..2bbd948f016 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,7 +49,7 @@
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/times.h>
-#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <asm/tlb.h>
@@ -1755,27 +1755,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
__releases(rq->lock)
{
struct mm_struct *mm = rq->prev_mm;
- unsigned long prev_task_flags;
+ long prev_state;
rq->prev_mm = NULL;
/*
* A task struct has one reference for the use as "current".
- * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
- * calls schedule one last time. The schedule call will never return,
- * and the scheduled task must drop that reference.
- * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+ * schedule one last time. The schedule call will never return, and
+ * the scheduled task must drop that reference.
+ * The test for TASK_DEAD must occur while the runqueue locks are
* still held, otherwise prev could be scheduled on another cpu, die
* there before we look at prev->state, and then the reference would
* be dropped twice.
* Manfred Spraul <manfred@colorfullife.com>
*/
- prev_task_flags = prev->flags;
+ prev_state = prev->state;
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
if (mm)
mmdrop(mm);
- if (unlikely(prev_task_flags & PF_DEAD)) {
+ if (unlikely(prev_state == TASK_DEAD)) {
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
@@ -3348,9 +3348,6 @@ need_resched_nonpreemptible:
spin_lock_irq(&rq->lock);
- if (unlikely(prev->flags & PF_DEAD))
- prev->state = EXIT_DEAD;
-
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
switch_count = &prev->nvcsw;
@@ -4080,6 +4077,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
* @p: the task in question.
* @policy: new policy.
* @param: structure containing the new RT priority.
+ *
+ * NOTE: the task may be already dead
*/
int sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param)
@@ -4107,28 +4106,32 @@ recheck:
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
- if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
- != (param->sched_priority == 0))
+ if (is_rt_policy(policy) != (param->sched_priority != 0))
return -EINVAL;
/*
* Allow unprivileged RT tasks to decrease priority:
*/
if (!capable(CAP_SYS_NICE)) {
- /*
- * can't change policy, except between SCHED_NORMAL
- * and SCHED_BATCH:
- */
- if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
- (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
- !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
- return -EPERM;
- /* can't increase priority */
- if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
- param->sched_priority > p->rt_priority &&
- param->sched_priority >
- p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
- return -EPERM;
+ if (is_rt_policy(policy)) {
+ unsigned long rlim_rtprio;
+ unsigned long flags;
+
+ if (!lock_task_sighand(p, &flags))
+ return -ESRCH;
+ rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+ unlock_task_sighand(p, &flags);
+
+ /* can't set/change the rt policy */
+ if (policy != p->policy && !rlim_rtprio)
+ return -EPERM;
+
+ /* can't increase priority */
+ if (param->sched_priority > p->rt_priority &&
+ param->sched_priority > rlim_rtprio)
+ return -EPERM;
+ }
+
/* can't change other user's priorities */
if ((current->euid != p->euid) &&
(current->euid != p->uid))
@@ -4193,14 +4196,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
- read_lock_irq(&tasklist_lock);
+
+ rcu_read_lock();
+ retval = -ESRCH;
p = find_process_by_pid(pid);
- if (!p) {
- read_unlock_irq(&tasklist_lock);
- return -ESRCH;
- }
- retval = sched_setscheduler(p, policy, &lparam);
- read_unlock_irq(&tasklist_lock);
+ if (p != NULL)
+ retval = sched_setscheduler(p, policy, &lparam);
+ rcu_read_unlock();
return retval;
}
@@ -5151,7 +5153,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
/* Cannot have done final schedule yet: would have vanished. */
- BUG_ON(p->flags & PF_DEAD);
+ BUG_ON(p->state == TASK_DEAD);
get_task_struct(p);
@@ -5272,9 +5274,11 @@ static struct notifier_block __cpuinitdata migration_notifier = {
int __init migration_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
+ int err;
/* Start one for the boot CPU: */
- migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+ BUG_ON(err == NOTIFY_BAD);
migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier);
diff --git a/kernel/signal.c b/kernel/signal.c
index 05853a7337e..fb5da6d19f1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -417,9 +417,8 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
siginfo_t *info)
{
- int sig = 0;
+ int sig = next_signal(pending, mask);
- sig = next_signal(pending, mask);
if (sig) {
if (current->notifier) {
if (sigismember(current->notifier_mask, sig)) {
@@ -432,9 +431,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
if (!collect_signal(sig, pending, info))
sig = 0;
-
}
- recalc_sigpending();
return sig;
}
@@ -451,6 +448,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
if (!signr)
signr = __dequeue_signal(&tsk->signal->shared_pending,
mask, info);
+ recalc_sigpending_tsk(tsk);
if (signr && unlikely(sig_kernel_stop(signr))) {
/*
* Set a marker that we have dequeued a stop signal. Our
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3789ca98197..bf25015dce1 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -612,7 +612,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
__init int spawn_ksoftirqd(void)
{
void *cpu = (void *)(long)smp_processor_id();
- cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+
+ BUG_ON(err == NOTIFY_BAD);
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 03e6a2b0b78..50afeb81330 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -149,8 +149,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
__init void spawn_softlockup_task(void)
{
void *cpu = (void *)(long)smp_processor_id();
+ int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
- cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ BUG_ON(err == NOTIFY_BAD);
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 9644a41e0be..476c3741511 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,17 +21,6 @@
#include <linux/debug_locks.h>
#include <linux/module.h>
-/*
- * Generic declaration of the raw read_trylock() function,
- * architectures are supposed to optimize this:
- */
-int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
-{
- __raw_read_lock(lock);
- return 1;
-}
-EXPORT_SYMBOL(generic__raw_read_trylock);
-
int __lockfunc _spin_trylock(spinlock_t *lock)
{
preempt_disable();
@@ -226,7 +215,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock) \
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
while (!op##_can_lock(lock) && (lock)->break_lock) \
- cpu_relax(); \
+ _raw_##op##_relax(&lock->raw_lock); \
} \
(lock)->break_lock = 0; \
} \
@@ -248,7 +237,7 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
while (!op##_can_lock(lock) && (lock)->break_lock) \
- cpu_relax(); \
+ _raw_##op##_relax(&lock->raw_lock); \
} \
(lock)->break_lock = 0; \
return flags; \
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 51cacd111db..12458040e66 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,3 +1,6 @@
+/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
+ * GPL v2 and any later version.
+ */
#include <linux/stop_machine.h>
#include <linux/kthread.h>
#include <linux/sched.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 3f894775488..2460581c928 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -607,12 +607,10 @@ static void kernel_restart_prepare(char *cmd)
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
- if (!cmd) {
+ if (!cmd)
printk(KERN_EMERG "Restarting system.\n");
- } else {
+ else
printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
- }
- printk(".\n");
machine_restart(cmd);
}
EXPORT_SYMBOL_GPL(kernel_restart);
@@ -628,9 +626,8 @@ static void kernel_kexec(void)
#ifdef CONFIG_KEXEC
struct kimage *image;
image = xchg(&kexec_image, NULL);
- if (!image) {
+ if (!image)
return;
- }
kernel_restart_prepare(NULL);
printk(KERN_EMERG "Starting new kernel\n");
machine_shutdown();
@@ -824,12 +821,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
(current->sgid == egid) ||
capable(CAP_SETGID))
new_egid = egid;
- else {
+ else
return -EPERM;
- }
}
- if (new_egid != old_egid)
- {
+ if (new_egid != old_egid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -858,19 +853,14 @@ asmlinkage long sys_setgid(gid_t gid)
if (retval)
return retval;
- if (capable(CAP_SETGID))
- {
- if(old_egid != gid)
- {
+ if (capable(CAP_SETGID)) {
+ if (old_egid != gid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
current->gid = current->egid = current->sgid = current->fsgid = gid;
- }
- else if ((gid == current->gid) || (gid == current->sgid))
- {
- if(old_egid != gid)
- {
+ } else if ((gid == current->gid) || (gid == current->sgid)) {
+ if (old_egid != gid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -901,8 +891,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
switch_uid(new_user);
- if(dumpclear)
- {
+ if (dumpclear) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -958,8 +947,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
return -EAGAIN;
- if (new_euid != old_euid)
- {
+ if (new_euid != old_euid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -1009,8 +997,7 @@ asmlinkage long sys_setuid(uid_t uid)
} else if ((uid != current->uid) && (uid != new_suid))
return -EPERM;
- if (old_euid != uid)
- {
+ if (old_euid != uid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -1055,8 +1042,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
return -EAGAIN;
}
if (euid != (uid_t) -1) {
- if (euid != current->euid)
- {
+ if (euid != current->euid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -1106,8 +1092,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
return -EPERM;
}
if (egid != (gid_t) -1) {
- if (egid != current->egid)
- {
+ if (egid != current->egid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -1152,10 +1137,8 @@ asmlinkage long sys_setfsuid(uid_t uid)
if (uid == current->uid || uid == current->euid ||
uid == current->suid || uid == current->fsuid ||
- capable(CAP_SETUID))
- {
- if (uid != old_fsuid)
- {
+ capable(CAP_SETUID)) {
+ if (uid != old_fsuid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -1183,10 +1166,8 @@ asmlinkage long sys_setfsgid(gid_t gid)
if (gid == current->gid || gid == current->egid ||
gid == current->sgid || gid == current->fsgid ||
- capable(CAP_SETGID))
- {
- if (gid != old_fsgid)
- {
+ capable(CAP_SETGID)) {
+ if (gid != old_fsgid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -1322,9 +1303,9 @@ out:
asmlinkage long sys_getpgid(pid_t pid)
{
- if (!pid) {
+ if (!pid)
return process_group(current);
- } else {
+ else {
int retval;
struct task_struct *p;
@@ -1354,9 +1335,9 @@ asmlinkage long sys_getpgrp(void)
asmlinkage long sys_getsid(pid_t pid)
{
- if (!pid) {
+ if (!pid)
return current->signal->session;
- } else {
+ else {
int retval;
struct task_struct *p;
@@ -1364,7 +1345,7 @@ asmlinkage long sys_getsid(pid_t pid)
p = find_task_by_pid(pid);
retval = -ESRCH;
- if(p) {
+ if (p) {
retval = security_task_getsid(p);
if (!retval)
retval = p->signal->session;
@@ -1432,9 +1413,9 @@ struct group_info *groups_alloc(int gidsetsize)
group_info->nblocks = nblocks;
atomic_set(&group_info->usage, 1);
- if (gidsetsize <= NGROUPS_SMALL) {
+ if (gidsetsize <= NGROUPS_SMALL)
group_info->blocks[0] = group_info->small_block;
- } else {
+ else {
for (i = 0; i < nblocks; i++) {
gid_t *b;
b = (void *)__get_free_page(GFP_USER);
@@ -1490,7 +1471,7 @@ static int groups_to_user(gid_t __user *grouplist,
/* fill a group_info from a user-space array - it must be allocated already */
static int groups_from_user(struct group_info *group_info,
gid_t __user *grouplist)
- {
+{
int i;
int count = group_info->ngroups;
@@ -1648,9 +1629,8 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
int in_group_p(gid_t grp)
{
int retval = 1;
- if (grp != current->fsgid) {
+ if (grp != current->fsgid)
retval = groups_search(current->group_info, grp);
- }
return retval;
}
@@ -1659,9 +1639,8 @@ EXPORT_SYMBOL(in_group_p);
int in_egroup_p(gid_t grp)
{
int retval = 1;
- if (grp != current->egid) {
+ if (grp != current->egid)
retval = groups_search(current->group_info, grp);
- }
return retval;
}
@@ -1776,9 +1755,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
task_lock(current->group_leader);
x = current->signal->rlim[resource];
task_unlock(current->group_leader);
- if(x.rlim_cur > 0x7FFFFFFF)
+ if (x.rlim_cur > 0x7FFFFFFF)
x.rlim_cur = 0x7FFFFFFF;
- if(x.rlim_max > 0x7FFFFFFF)
+ if (x.rlim_max > 0x7FFFFFFF)
x.rlim_max = 0x7FFFFFFF;
return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
}
@@ -2084,12 +2063,12 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
* padding
*/
unsigned long t0, t1;
- get_user(t0, &cache->t0);
- get_user(t1, &cache->t1);
+ get_user(t0, &cache->blob[0]);
+ get_user(t1, &cache->blob[1]);
t0++;
t1++;
- put_user(t0, &cache->t0);
- put_user(t1, &cache->t1);
+ put_user(t0, &cache->blob[0]);
+ put_user(t1, &cache->blob[1]);
}
return err ? -EFAULT : 0;
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bece67e..7a3b2e75f04 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -134,3 +134,8 @@ cond_syscall(sys_madvise);
cond_syscall(sys_mremap);
cond_syscall(sys_remap_file_pages);
cond_syscall(compat_sys_move_pages);
+
+/* block-layer dependent */
+cond_syscall(sys_bdflush);
+cond_syscall(sys_ioprio_set);
+cond_syscall(sys_ioprio_get);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8bfa7d117c5..ba42694f045 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -52,6 +52,10 @@
extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_X86
+#include <asm/nmi.h>
+#endif
+
#if defined(CONFIG_SYSCTL)
/* External variables not in a header file. */
@@ -74,13 +78,6 @@ extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
- void __user *, size_t *, loff_t *);
-#endif
-
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
@@ -297,7 +294,7 @@ static ctl_table kern_table[] = {
.ctl_name = KERN_CORE_PATTERN,
.procname = "core_pattern",
.data = core_pattern,
- .maxlen = 64,
+ .maxlen = 128,
.mode = 0644,
.proc_handler = &proc_dostring,
.strategy = &sysctl_string,
@@ -1915,7 +1912,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
return -EPERM;
}
- op = (current->pid == 1) ? OP_SET : OP_AND;
+ op = is_init(current) ? OP_SET : OP_AND;
return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
do_proc_dointvec_bset_conv,&op);
}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2ed4040d0dc..5d6a8c54ee8 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -18,7 +18,9 @@
#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
+#include <linux/tsacct_kern.h>
#include <linux/delayacct.h>
+#include <linux/tsacct_kern.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <net/genetlink.h>
@@ -75,7 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
/*
* If new attributes are added, please revisit this allocation
*/
- skb = nlmsg_new(size, GFP_KERNEL);
+ skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL);
if (!skb)
return -ENOMEM;
@@ -198,7 +200,13 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
*/
delayacct_add_tsk(stats, tsk);
+
+ /* fill in basic acct fields */
stats->version = TASKSTATS_VERSION;
+ bacct_add_tsk(stats, tsk);
+
+ /* fill in extended acct fields */
+ xacct_add_tsk(stats, tsk);
/* Define err: label here if needed */
put_task_struct(tsk);
diff --git a/kernel/time.c b/kernel/time.c
index 5bd48974764..0e017bff4c1 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -202,179 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
-/* we call this to notify the arch when the clock is being
- * controlled. If no such arch routine, do nothing.
- */
-void __attribute__ ((weak)) notify_arch_cmos_timer(void)
-{
- return;
-}
-
-/* adjtimex mainly allows reading (and writing, if superuser) of
- * kernel time-keeping variables. used by xntpd.
- */
-int do_adjtimex(struct timex *txc)
-{
- long ltemp, mtemp, save_adjust;
- int result;
-
- /* In order to modify anything, you gotta be super-user! */
- if (txc->modes && !capable(CAP_SYS_TIME))
- return -EPERM;
-
- /* Now we validate the data before disabling interrupts */
-
- if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
- /* singleshot must not be used with any other mode bits */
- if (txc->modes != ADJ_OFFSET_SINGLESHOT)
- return -EINVAL;
-
- if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
- /* adjustment Offset limited to +- .512 seconds */
- if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
- return -EINVAL;
-
- /* if the quartz is off by more than 10% something is VERY wrong ! */
- if (txc->modes & ADJ_TICK)
- if (txc->tick < 900000/USER_HZ ||
- txc->tick > 1100000/USER_HZ)
- return -EINVAL;
-
- write_seqlock_irq(&xtime_lock);
- result = time_state; /* mostly `TIME_OK' */
-
- /* Save for later - semantics of adjtime is to return old value */
- save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
-
-#if 0 /* STA_CLOCKERR is never set yet */
- time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
-#endif
- /* If there are input parameters, then process them */
- if (txc->modes)
- {
- if (txc->modes & ADJ_STATUS) /* only set allowed bits */
- time_status = (txc->status & ~STA_RONLY) |
- (time_status & STA_RONLY);
-
- if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */
- if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
- result = -EINVAL;
- goto leave;
- }
- time_freq = txc->freq;
- }
-
- if (txc->modes & ADJ_MAXERROR) {
- if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
- result = -EINVAL;
- goto leave;
- }
- time_maxerror = txc->maxerror;
- }
-
- if (txc->modes & ADJ_ESTERROR) {
- if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
- result = -EINVAL;
- goto leave;
- }
- time_esterror = txc->esterror;
- }
-
- if (txc->modes & ADJ_TIMECONST) { /* p. 24 */
- if (txc->constant < 0) { /* NTP v4 uses values > 6 */
- result = -EINVAL;
- goto leave;
- }
- time_constant = txc->constant;
- }
-
- if (txc->modes & ADJ_OFFSET) { /* values checked earlier */
- if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
- /* adjtime() is independent from ntp_adjtime() */
- if ((time_next_adjust = txc->offset) == 0)
- time_adjust = 0;
- }
- else if (time_status & STA_PLL) {
- ltemp = txc->offset;
-
- /*
- * Scale the phase adjustment and
- * clamp to the operating range.
- */
- if (ltemp > MAXPHASE)
- time_offset = MAXPHASE << SHIFT_UPDATE;
- else if (ltemp < -MAXPHASE)
- time_offset = -(MAXPHASE << SHIFT_UPDATE);
- else
- time_offset = ltemp << SHIFT_UPDATE;
-
- /*
- * Select whether the frequency is to be controlled
- * and in which mode (PLL or FLL). Clamp to the operating
- * range. Ugly multiply/divide should be replaced someday.
- */
-
- if (time_status & STA_FREQHOLD || time_reftime == 0)
- time_reftime = xtime.tv_sec;
- mtemp = xtime.tv_sec - time_reftime;
- time_reftime = xtime.tv_sec;
- if (time_status & STA_FLL) {
- if (mtemp >= MINSEC) {
- ltemp = (time_offset / mtemp) << (SHIFT_USEC -
- SHIFT_UPDATE);
- time_freq += shift_right(ltemp, SHIFT_KH);
- } else /* calibration interval too short (p. 12) */
- result = TIME_ERROR;
- } else { /* PLL mode */
- if (mtemp < MAXSEC) {
- ltemp *= mtemp;
- time_freq += shift_right(ltemp,(time_constant +
- time_constant +
- SHIFT_KF - SHIFT_USEC));
- } else /* calibration interval too long (p. 12) */
- result = TIME_ERROR;
- }
- time_freq = min(time_freq, time_tolerance);
- time_freq = max(time_freq, -time_tolerance);
- } /* STA_PLL */
- } /* txc->modes & ADJ_OFFSET */
- if (txc->modes & ADJ_TICK) {
- tick_usec = txc->tick;
- tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
- }
- } /* txc->modes */
-leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
- result = TIME_ERROR;
-
- if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
- txc->offset = save_adjust;
- else {
- txc->offset = shift_right(time_offset, SHIFT_UPDATE);
- }
- txc->freq = time_freq;
- txc->maxerror = time_maxerror;
- txc->esterror = time_esterror;
- txc->status = time_status;
- txc->constant = time_constant;
- txc->precision = time_precision;
- txc->tolerance = time_tolerance;
- txc->tick = tick_usec;
-
- /* PPS is not implemented, so these are zero */
- txc->ppsfreq = 0;
- txc->jitter = 0;
- txc->shift = 0;
- txc->stabil = 0;
- txc->jitcnt = 0;
- txc->calcnt = 0;
- txc->errcnt = 0;
- txc->stbcnt = 0;
- write_sequnlock_irq(&xtime_lock);
- do_gettimeofday(&txc->time);
- notify_arch_cmos_timer();
- return(result);
-}
-
asmlinkage long sys_adjtimex(struct timex __user *txc_p)
{
struct timex txc; /* Local copy of parameter */
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e1dfd8e86cc..61a3907d16f 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1 +1 @@
-obj-y += clocksource.o jiffies.o
+obj-y += ntp.o clocksource.o jiffies.o
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
new file mode 100644
index 00000000000..47195fa0ec4
--- /dev/null
+++ b/kernel/time/ntp.c
@@ -0,0 +1,350 @@
+/*
+ * linux/kernel/time/ntp.c
+ *
+ * NTP state machine interfaces and logic.
+ *
+ * This code was mainly moved from kernel/timer.c and kernel/time.c
+ * Please see those files for relevant copyright info and historical
+ * changelogs.
+ */
+
+#include <linux/mm.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+
+#include <asm/div64.h>
+#include <asm/timex.h>
+
+/*
+ * Timekeeping variables
+ */
+unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
+unsigned long tick_nsec; /* ACTHZ period (nsec) */
+static u64 tick_length, tick_length_base;
+
+#define MAX_TICKADJ 500 /* microsecs */
+#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
+ TICK_LENGTH_SHIFT) / HZ)
+
+/*
+ * phase-lock loop variables
+ */
+/* TIME_ERROR prevents overwriting the CMOS clock */
+static int time_state = TIME_OK; /* clock synchronization status */
+int time_status = STA_UNSYNC; /* clock status bits */
+static long time_offset; /* time adjustment (ns) */
+static long time_constant = 2; /* pll time constant */
+long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
+long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
+long time_freq; /* frequency offset (scaled ppm)*/
+static long time_reftime; /* time at last adjustment (s) */
+long time_adjust;
+
+#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE)
+#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \
+ (s64)CLOCK_TICK_RATE)
+
+static void ntp_update_frequency(void)
+{
+ tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
+ tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+ tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
+
+ do_div(tick_length_base, HZ);
+
+ tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT;
+}
+
+/**
+ * ntp_clear - Clears the NTP state variables
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+void ntp_clear(void)
+{
+ time_adjust = 0; /* stop active adjtime() */
+ time_status |= STA_UNSYNC;
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_esterror = NTP_PHASE_LIMIT;
+
+ ntp_update_frequency();
+
+ tick_length = tick_length_base;
+ time_offset = 0;
+}
+
+/*
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ */
+void second_overflow(void)
+{
+ long time_adj;
+
+ /* Bump the maxerror field */
+ time_maxerror += MAXFREQ >> SHIFT_USEC;
+ if (time_maxerror > NTP_PHASE_LIMIT) {
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_status |= STA_UNSYNC;
+ }
+
+ /*
+ * Leap second processing. If in leap-insert state at the end of the
+ * day, the system clock is set back one second; if in leap-delete
+ * state, the system clock is set ahead one second. The microtime()
+ * routine or external clock driver will insure that reported time is
+ * always monotonic. The ugly divides should be replaced.
+ */
+ switch (time_state) {
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+ case TIME_INS:
+ if (xtime.tv_sec % 86400 == 0) {
+ xtime.tv_sec--;
+ wall_to_monotonic.tv_sec++;
+ /*
+ * The timer interpolator will make time change
+ * gradually instead of an immediate jump by one second
+ */
+ time_interpolator_update(-NSEC_PER_SEC);
+ time_state = TIME_OOP;
+ clock_was_set();
+ printk(KERN_NOTICE "Clock: inserting leap second "
+ "23:59:60 UTC\n");
+ }
+ break;
+ case TIME_DEL:
+ if ((xtime.tv_sec + 1) % 86400 == 0) {
+ xtime.tv_sec++;
+ wall_to_monotonic.tv_sec--;
+ /*
+ * Use of time interpolator for a gradual change of
+ * time
+ */
+ time_interpolator_update(NSEC_PER_SEC);
+ time_state = TIME_WAIT;
+ clock_was_set();
+ printk(KERN_NOTICE "Clock: deleting leap second "
+ "23:59:59 UTC\n");
+ }
+ break;
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ }
+
+ /*
+ * Compute the phase adjustment for the next second. The offset is
+ * reduced by a fixed factor times the time constant.
+ */
+ tick_length = tick_length_base;
+ time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
+ time_offset -= time_adj;
+ tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE);
+
+ if (unlikely(time_adjust)) {
+ if (time_adjust > MAX_TICKADJ) {
+ time_adjust -= MAX_TICKADJ;
+ tick_length += MAX_TICKADJ_SCALED;
+ } else if (time_adjust < -MAX_TICKADJ) {
+ time_adjust += MAX_TICKADJ;
+ tick_length -= MAX_TICKADJ_SCALED;
+ } else {
+ time_adjust = 0;
+ tick_length += (s64)(time_adjust * NSEC_PER_USEC /
+ HZ) << TICK_LENGTH_SHIFT;
+ }
+ }
+}
+
+/*
+ * Return how long ticks are at the moment, that is, how much time
+ * update_wall_time_one_tick will add to xtime next time we call it
+ * (assuming no calls to do_adjtimex in the meantime).
+ * The return value is in fixed-point nanoseconds shifted by the
+ * specified number of bits to the right of the binary point.
+ * This function has no side-effects.
+ */
+u64 current_tick_length(void)
+{
+ return tick_length;
+}
+
+
+void __attribute__ ((weak)) notify_arch_cmos_timer(void)
+{
+ return;
+}
+
+/* adjtimex mainly allows reading (and writing, if superuser) of
+ * kernel time-keeping variables. used by xntpd.
+ */
+int do_adjtimex(struct timex *txc)
+{
+ long ltemp, mtemp, save_adjust;
+ s64 freq_adj, temp64;
+ int result;
+
+ /* In order to modify anything, you gotta be super-user! */
+ if (txc->modes && !capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ /* Now we validate the data before disabling interrupts */
+
+ if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+ /* singleshot must not be used with any other mode bits */
+ if (txc->modes != ADJ_OFFSET_SINGLESHOT)
+ return -EINVAL;
+
+ if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
+ /* adjustment Offset limited to +- .512 seconds */
+ if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
+ return -EINVAL;
+
+ /* if the quartz is off by more than 10% something is VERY wrong ! */
+ if (txc->modes & ADJ_TICK)
+ if (txc->tick < 900000/USER_HZ ||
+ txc->tick > 1100000/USER_HZ)
+ return -EINVAL;
+
+ write_seqlock_irq(&xtime_lock);
+ result = time_state; /* mostly `TIME_OK' */
+
+ /* Save for later - semantics of adjtime is to return old value */
+ save_adjust = time_adjust;
+
+#if 0 /* STA_CLOCKERR is never set yet */
+ time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
+#endif
+ /* If there are input parameters, then process them */
+ if (txc->modes)
+ {
+ if (txc->modes & ADJ_STATUS) /* only set allowed bits */
+ time_status = (txc->status & ~STA_RONLY) |
+ (time_status & STA_RONLY);
+
+ if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */
+ if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
+ result = -EINVAL;
+ goto leave;
+ }
+ time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC);
+ }
+
+ if (txc->modes & ADJ_MAXERROR) {
+ if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
+ result = -EINVAL;
+ goto leave;
+ }
+ time_maxerror = txc->maxerror;
+ }
+
+ if (txc->modes & ADJ_ESTERROR) {
+ if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
+ result = -EINVAL;
+ goto leave;
+ }
+ time_esterror = txc->esterror;
+ }
+
+ if (txc->modes & ADJ_TIMECONST) { /* p. 24 */
+ if (txc->constant < 0) { /* NTP v4 uses values > 6 */
+ result = -EINVAL;
+ goto leave;
+ }
+ time_constant = min(txc->constant + 4, (long)MAXTC);
+ }
+
+ if (txc->modes & ADJ_OFFSET) { /* values checked earlier */
+ if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
+ /* adjtime() is independent from ntp_adjtime() */
+ time_adjust = txc->offset;
+ }
+ else if (time_status & STA_PLL) {
+ ltemp = txc->offset * NSEC_PER_USEC;
+
+ /*
+ * Scale the phase adjustment and
+ * clamp to the operating range.
+ */
+ time_offset = min(ltemp, MAXPHASE * NSEC_PER_USEC);
+ time_offset = max(time_offset, -MAXPHASE * NSEC_PER_USEC);
+
+ /*
+ * Select whether the frequency is to be controlled
+ * and in which mode (PLL or FLL). Clamp to the operating
+ * range. Ugly multiply/divide should be replaced someday.
+ */
+
+ if (time_status & STA_FREQHOLD || time_reftime == 0)
+ time_reftime = xtime.tv_sec;
+ mtemp = xtime.tv_sec - time_reftime;
+ time_reftime = xtime.tv_sec;
+
+ freq_adj = (s64)time_offset * mtemp;
+ freq_adj = shift_right(freq_adj, time_constant * 2 +
+ (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
+ if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
+ temp64 = (s64)time_offset << (SHIFT_NSEC - SHIFT_FLL);
+ if (time_offset < 0) {
+ temp64 = -temp64;
+ do_div(temp64, mtemp);
+ freq_adj -= temp64;
+ } else {
+ do_div(temp64, mtemp);
+ freq_adj += temp64;
+ }
+ }
+ freq_adj += time_freq;
+ freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
+ time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
+ time_offset = (time_offset / HZ) << SHIFT_UPDATE;
+ } /* STA_PLL */
+ } /* txc->modes & ADJ_OFFSET */
+ if (txc->modes & ADJ_TICK)
+ tick_usec = txc->tick;
+
+ if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
+ ntp_update_frequency();
+ } /* txc->modes */
+leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
+ result = TIME_ERROR;
+
+ if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+ txc->offset = save_adjust;
+ else
+ txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000;
+ txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC);
+ txc->maxerror = time_maxerror;
+ txc->esterror = time_esterror;
+ txc->status = time_status;
+ txc->constant = time_constant;
+ txc->precision = 1;
+ txc->tolerance = MAXFREQ;
+ txc->tick = tick_usec;
+
+ /* PPS is not implemented, so these are zero */
+ txc->ppsfreq = 0;
+ txc->jitter = 0;
+ txc->shift = 0;
+ txc->stabil = 0;
+ txc->jitcnt = 0;
+ txc->calcnt = 0;
+ txc->errcnt = 0;
+ txc->stbcnt = 0;
+ write_sequnlock_irq(&xtime_lock);
+ do_gettimeofday(&txc->time);
+ notify_arch_cmos_timer();
+ return(result);
+}
diff --git a/kernel/timer.c b/kernel/timer.c
index 1d7dd6267c2..c1c7fbcffec 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -41,12 +41,6 @@
#include <asm/timex.h>
#include <asm/io.h>
-#ifdef CONFIG_TIME_INTERPOLATION
-static void time_interpolator_update(long delta_nsec);
-#else
-#define time_interpolator_update(x)
-#endif
-
u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
@@ -136,7 +130,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
list_add_tail(&timer->entry, vec);
}
-/***
+/**
* init_timer - initialize a timer.
* @timer: the timer to be initialized
*
@@ -175,6 +169,7 @@ static inline void detach_timer(struct timer_list *timer,
*/
static tvec_base_t *lock_timer_base(struct timer_list *timer,
unsigned long *flags)
+ __acquires(timer->base->lock)
{
tvec_base_t *base;
@@ -235,7 +230,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
EXPORT_SYMBOL(__mod_timer);
-/***
+/**
* add_timer_on - start a timer on a particular CPU
* @timer: the timer to be added
* @cpu: the CPU to start it on
@@ -255,9 +250,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
}
-/***
+/**
* mod_timer - modify a timer's timeout
* @timer: the timer to be modified
+ * @expires: new timeout in jiffies
*
* mod_timer is a more efficient way to update the expire field of an
* active timer (if the timer is inactive it will be activated)
@@ -291,7 +287,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
EXPORT_SYMBOL(mod_timer);
-/***
+/**
* del_timer - deactive a timer.
* @timer: the timer to be deactivated
*
@@ -323,7 +319,10 @@ int del_timer(struct timer_list *timer)
EXPORT_SYMBOL(del_timer);
#ifdef CONFIG_SMP
-/*
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: timer do del
+ *
* This function tries to deactivate a timer. Upon successful (ret >= 0)
* exit the timer is not queued and the handler is not running on any CPU.
*
@@ -351,7 +350,7 @@ out:
return ret;
}
-/***
+/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
*
@@ -401,15 +400,15 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
return index;
}
-/***
+#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+
+/**
* __run_timers - run all expired timers (if any) on this CPU.
* @base: the timer vector to be processed.
*
* This function cascades all vectors and executes all expired timer
* vectors.
*/
-#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
-
static inline void __run_timers(tvec_base_t *base)
{
struct timer_list *timer;
@@ -563,12 +562,6 @@ found:
/******************************************************************/
-/*
- * Timekeeping variables
- */
-unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
-unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */
-
/*
* The current time
* wall_to_monotonic is what we need to add to xtime (or xtime corrected
@@ -582,209 +575,6 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
EXPORT_SYMBOL(xtime);
-/* Don't completely fail for HZ > 500. */
-int tickadj = 500/HZ ? : 1; /* microsecs */
-
-
-/*
- * phase-lock loop variables
- */
-/* TIME_ERROR prevents overwriting the CMOS clock */
-int time_state = TIME_OK; /* clock synchronization status */
-int time_status = STA_UNSYNC; /* clock status bits */
-long time_offset; /* time adjustment (us) */
-long time_constant = 2; /* pll time constant */
-long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
-long time_precision = 1; /* clock precision (us) */
-long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
-long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
-long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
- /* frequency offset (scaled ppm)*/
-static long time_adj; /* tick adjust (scaled 1 / HZ) */
-long time_reftime; /* time at last adjustment (s) */
-long time_adjust;
-long time_next_adjust;
-
-/*
- * this routine handles the overflow of the microsecond field
- *
- * The tricky bits of code to handle the accurate clock support
- * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
- * They were originally developed for SUN and DEC kernels.
- * All the kudos should go to Dave for this stuff.
- *
- */
-static void second_overflow(void)
-{
- long ltemp;
-
- /* Bump the maxerror field */
- time_maxerror += time_tolerance >> SHIFT_USEC;
- if (time_maxerror > NTP_PHASE_LIMIT) {
- time_maxerror = NTP_PHASE_LIMIT;
- time_status |= STA_UNSYNC;
- }
-
- /*
- * Leap second processing. If in leap-insert state at the end of the
- * day, the system clock is set back one second; if in leap-delete
- * state, the system clock is set ahead one second. The microtime()
- * routine or external clock driver will insure that reported time is
- * always monotonic. The ugly divides should be replaced.
- */
- switch (time_state) {
- case TIME_OK:
- if (time_status & STA_INS)
- time_state = TIME_INS;
- else if (time_status & STA_DEL)
- time_state = TIME_DEL;
- break;
- case TIME_INS:
- if (xtime.tv_sec % 86400 == 0) {
- xtime.tv_sec--;
- wall_to_monotonic.tv_sec++;
- /*
- * The timer interpolator will make time change
- * gradually instead of an immediate jump by one second
- */
- time_interpolator_update(-NSEC_PER_SEC);
- time_state = TIME_OOP;
- clock_was_set();
- printk(KERN_NOTICE "Clock: inserting leap second "
- "23:59:60 UTC\n");
- }
- break;
- case TIME_DEL:
- if ((xtime.tv_sec + 1) % 86400 == 0) {
- xtime.tv_sec++;
- wall_to_monotonic.tv_sec--;
- /*
- * Use of time interpolator for a gradual change of
- * time
- */
- time_interpolator_update(NSEC_PER_SEC);
- time_state = TIME_WAIT;
- clock_was_set();
- printk(KERN_NOTICE "Clock: deleting leap second "
- "23:59:59 UTC\n");
- }
- break;
- case TIME_OOP:
- time_state = TIME_WAIT;
- break;
- case TIME_WAIT:
- if (!(time_status & (STA_INS | STA_DEL)))
- time_state = TIME_OK;
- }
-
- /*
- * Compute the phase adjustment for the next second. In PLL mode, the
- * offset is reduced by a fixed factor times the time constant. In FLL
- * mode the offset is used directly. In either mode, the maximum phase
- * adjustment for each second is clamped so as to spread the adjustment
- * over not more than the number of seconds between updates.
- */
- ltemp = time_offset;
- if (!(time_status & STA_FLL))
- ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
- ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
- ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
- time_offset -= ltemp;
- time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
-
- /*
- * Compute the frequency estimate and additional phase adjustment due
- * to frequency error for the next second.
- */
- ltemp = time_freq;
- time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
-
-#if HZ == 100
- /*
- * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to
- * get 128.125; => only 0.125% error (p. 14)
- */
- time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
-#endif
-#if HZ == 250
- /*
- * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and
- * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
- */
- time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
-#endif
-#if HZ == 1000
- /*
- * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and
- * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
- */
- time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
-#endif
-}
-
-/*
- * Returns how many microseconds we need to add to xtime this tick
- * in doing an adjustment requested with adjtime.
- */
-static long adjtime_adjustment(void)
-{
- long time_adjust_step;
-
- time_adjust_step = time_adjust;
- if (time_adjust_step) {
- /*
- * We are doing an adjtime thing. Prepare time_adjust_step to
- * be within bounds. Note that a positive time_adjust means we
- * want the clock to run faster.
- *
- * Limit the amount of the step to be in the range
- * -tickadj .. +tickadj
- */
- time_adjust_step = min(time_adjust_step, (long)tickadj);
- time_adjust_step = max(time_adjust_step, (long)-tickadj);
- }
- return time_adjust_step;
-}
-
-/* in the NTP reference this is called "hardclock()" */
-static void update_ntp_one_tick(void)
-{
- long time_adjust_step;
-
- time_adjust_step = adjtime_adjustment();
- if (time_adjust_step)
- /* Reduce by this step the amount of time left */
- time_adjust -= time_adjust_step;
-
- /* Changes by adjtime() do not take effect till next tick. */
- if (time_next_adjust != 0) {
- time_adjust = time_next_adjust;
- time_next_adjust = 0;
- }
-}
-
-/*
- * Return how long ticks are at the moment, that is, how much time
- * update_wall_time_one_tick will add to xtime next time we call it
- * (assuming no calls to do_adjtimex in the meantime).
- * The return value is in fixed-point nanoseconds shifted by the
- * specified number of bits to the right of the binary point.
- * This function has no side-effects.
- */
-u64 current_tick_length(void)
-{
- long delta_nsec;
- u64 ret;
-
- /* calculate the finest interval NTP will allow.
- * ie: nanosecond value shifted by (SHIFT_SCALE - 10)
- */
- delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
- ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
- ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
-
- return ret;
-}
/* XXX - all of this timekeeping code should be later moved to time.c */
#include <linux/clocksource.h>
@@ -961,21 +751,24 @@ void __init timekeeping_init(void)
unsigned long flags;
write_seqlock_irqsave(&xtime_lock, flags);
+
+ ntp_clear();
+
clock = clocksource_get_next();
clocksource_calculate_interval(clock, tick_nsec);
clock->cycle_last = clocksource_read(clock);
- ntp_clear();
+
write_sequnlock_irqrestore(&xtime_lock, flags);
}
static int timekeeping_suspended;
-/*
+/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
* @dev: unused
*
* This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
+ * xtime/wall_to_monotonic/jiffies/etc are
* still managed by arch specific suspend/resume code.
*/
static int timekeeping_resume(struct sys_device *dev)
@@ -1106,7 +899,7 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
}
-/*
+/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
* Called from the timer interrupt, must hold a write on xtime_lock.
@@ -1144,8 +937,6 @@ static void update_wall_time(void)
/* interpolator bits */
time_interpolator_update(clock->xtime_interval
>> clock->shift);
- /* increment the NTP state machine */
- update_ntp_one_tick();
/* accumulate error between NTP and clock interval */
clock->error += current_tick_length();
@@ -1217,19 +1008,14 @@ static inline void calc_load(unsigned long ticks)
unsigned long active_tasks; /* fixed-point */
static int count = LOAD_FREQ;
- count -= ticks;
- if (count < 0) {
- count += LOAD_FREQ;
- active_tasks = count_active_tasks();
+ active_tasks = count_active_tasks();
+ for (count -= ticks; count < 0; count += LOAD_FREQ) {
CALC_LOAD(avenrun[0], EXP_1, active_tasks);
CALC_LOAD(avenrun[1], EXP_5, active_tasks);
CALC_LOAD(avenrun[2], EXP_15, active_tasks);
}
}
-/* jiffies at the most recent update of wall time */
-unsigned long wall_jiffies = INITIAL_JIFFIES;
-
/*
* This read-write spinlock protects us from races in SMP while
* playing with xtime and avenrun.
@@ -1265,12 +1051,8 @@ void run_local_timers(void)
* Called by the timer interrupt. xtime_lock must already be taken
* by the timer IRQ!
*/
-static inline void update_times(void)
+static inline void update_times(unsigned long ticks)
{
- unsigned long ticks;
-
- ticks = jiffies - wall_jiffies;
- wall_jiffies += ticks;
update_wall_time();
calc_load(ticks);
}
@@ -1281,12 +1063,10 @@ static inline void update_times(void)
* jiffies is defined in the linker script...
*/
-void do_timer(struct pt_regs *regs)
+void do_timer(unsigned long ticks)
{
- jiffies_64++;
- /* prevent loading jiffies before storing new jiffies_64 value. */
- barrier();
- update_times();
+ jiffies_64 += ticks;
+ update_times(ticks);
}
#ifdef __ARCH_WANT_SYS_ALARM
@@ -1470,8 +1250,9 @@ asmlinkage long sys_gettid(void)
return current->pid;
}
-/*
+/**
* sys_sysinfo - fill in sysinfo struct
+ * @info: pointer to buffer to fill
*/
asmlinkage long sys_sysinfo(struct sysinfo __user *info)
{
@@ -1688,8 +1469,10 @@ static struct notifier_block __cpuinitdata timers_nb = {
void __init init_timers(void)
{
- timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
+ int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
+
+ BUG_ON(err == NOTIFY_BAD);
register_cpu_notifier(&timers_nb);
open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
}
@@ -1774,7 +1557,7 @@ unsigned long time_interpolator_get_offset(void)
#define INTERPOLATOR_ADJUST 65536
#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
-static void time_interpolator_update(long delta_nsec)
+void time_interpolator_update(long delta_nsec)
{
u64 counter;
unsigned long offset;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
new file mode 100644
index 00000000000..db443221ba5
--- /dev/null
+++ b/kernel/tsacct.c
@@ -0,0 +1,124 @@
+/*
+ * tsacct.c - System accounting over taskstats interface
+ *
+ * Copyright (C) Jay Lan, <jlan@sgi.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/tsacct_kern.h>
+#include <linux/acct.h>
+#include <linux/jiffies.h>
+
+
+#define USEC_PER_TICK (USEC_PER_SEC/HZ)
+/*
+ * fill in basic accounting fields
+ */
+void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{
+ struct timespec uptime, ts;
+ s64 ac_etime;
+
+ BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
+
+ /* calculate task elapsed time in timespec */
+ do_posix_clock_monotonic_gettime(&uptime);
+ ts = timespec_sub(uptime, current->group_leader->start_time);
+ /* rebase elapsed time to usec */
+ ac_etime = timespec_to_ns(&ts);
+ do_div(ac_etime, NSEC_PER_USEC);
+ stats->ac_etime = ac_etime;
+ stats->ac_btime = xtime.tv_sec - ts.tv_sec;
+ if (thread_group_leader(tsk)) {
+ stats->ac_exitcode = tsk->exit_code;
+ if (tsk->flags & PF_FORKNOEXEC)
+ stats->ac_flag |= AFORK;
+ }
+ if (tsk->flags & PF_SUPERPRIV)
+ stats->ac_flag |= ASU;
+ if (tsk->flags & PF_DUMPCORE)
+ stats->ac_flag |= ACORE;
+ if (tsk->flags & PF_SIGNALED)
+ stats->ac_flag |= AXSIG;
+ stats->ac_nice = task_nice(tsk);
+ stats->ac_sched = tsk->policy;
+ stats->ac_uid = tsk->uid;
+ stats->ac_gid = tsk->gid;
+ stats->ac_pid = tsk->pid;
+ stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0;
+ stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
+ stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
+ stats->ac_minflt = tsk->min_flt;
+ stats->ac_majflt = tsk->maj_flt;
+
+ strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
+}
+
+
+#ifdef CONFIG_TASK_XACCT
+
+#define KB 1024
+#define MB (1024*KB)
+/*
+ * fill in extended accounting fields
+ */
+void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
+{
+ /* convert pages-jiffies to Mbyte-usec */
+ stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
+ stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
+ if (p->mm) {
+ /* adjust to KB unit */
+ stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB;
+ stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB;
+ }
+ stats->read_char = p->rchar;
+ stats->write_char = p->wchar;
+ stats->read_syscalls = p->syscr;
+ stats->write_syscalls = p->syscw;
+}
+#undef KB
+#undef MB
+
+/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
+ */
+void acct_update_integrals(struct task_struct *tsk)
+{
+ if (likely(tsk->mm)) {
+ long delta = cputime_to_jiffies(
+ cputime_sub(tsk->stime, tsk->acct_stimexpd));
+
+ if (delta == 0)
+ return;
+ tsk->acct_stimexpd = tsk->stime;
+ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
+ tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+ }
+}
+
+/**
+ * acct_clear_integrals - clear the mm integral fields in task_struct
+ * @tsk: task_struct whose accounting fields are cleared
+ */
+void acct_clear_integrals(struct task_struct *tsk)
+{
+ tsk->acct_stimexpd = 0;
+ tsk->acct_rss_mem1 = 0;
+ tsk->acct_vm_mem1 = 0;
+}
+#endif
diff --git a/kernel/unwind.c b/kernel/unwind.c
index 3430475fcd8..2e2368607aa 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -102,7 +102,7 @@ static struct unwind_table {
unsigned long size;
struct unwind_table *link;
const char *name;
-} root_table, *last_table;
+} root_table;
struct unwind_item {
enum item_location {
@@ -174,6 +174,8 @@ void __init unwind_init(void)
#ifdef CONFIG_MODULES
+static struct unwind_table *last_table;
+
/* Must be called with module_mutex held. */
void *unwind_add_table(struct module *module,
const void *table_start,