aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt15
-rw-r--r--kernel/acct.c23
-rw-r--r--kernel/audit.c30
-rw-r--r--kernel/auditsc.c2
-rw-r--r--kernel/cgroup.c4
-rw-r--r--kernel/cpuset.c4
-rw-r--r--kernel/exit.c98
-rw-r--r--kernel/futex.c4
-rw-r--r--kernel/kprobes.c52
-rw-r--r--kernel/marker.c40
-rw-r--r--kernel/module.c24
-rw-r--r--kernel/power/Kconfig2
-rw-r--r--kernel/power/process.c29
-rw-r--r--kernel/power/snapshot.c41
-rw-r--r--kernel/printk.c83
-rw-r--r--kernel/relay.c12
-rw-r--r--kernel/res_counter.c1
-rw-r--r--kernel/sched.c437
-rw-r--r--kernel/sched_debug.c1
-rw-r--r--kernel/sched_fair.c420
-rw-r--r--kernel/sched_rt.c10
-rw-r--r--kernel/signal.c16
-rw-r--r--kernel/sysctl.c18
-rw-r--r--kernel/time/clocksource.c16
-rw-r--r--kernel/time/ntp.c23
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/timekeeping.c10
-rw-r--r--kernel/timer.c10
28 files changed, 710 insertions, 717 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0669b70fa6a..9fdba03dc1f 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,8 +52,23 @@ config PREEMPT
endchoice
+config PREEMPT_RCU
+ bool "Preemptible RCU"
+ depends on PREEMPT
+ default n
+ help
+ This option reduces the latency of the kernel by making certain
+ RCU sections preemptible. Normally RCU code is non-preemptible, if
+ this option is selected then read-only RCU sections become
+ preemptible. This helps latency, but may expose bugs due to
+ now-naive assumptions about each RCU read-side critical section
+ remaining on a given CPU through its execution.
+
+ Say N if you are unsure.
+
config RCU_TRACE
bool "Enable tracing for RCU - currently stats in debugfs"
+ depends on PREEMPT_RCU
select DEBUG_FS
default y
help
diff --git a/kernel/acct.c b/kernel/acct.c
index 521dfa53cb9..91e1cfd734d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -58,6 +58,7 @@
#include <asm/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
+#include <linux/pid_namespace.h>
/*
* These constants control the amount of freespace that suspend and
@@ -74,7 +75,7 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
-static void do_acct_process(struct file *);
+static void do_acct_process(struct pid_namespace *ns, struct file *);
/*
* This structure is used so that all the data protected by lock
@@ -86,6 +87,7 @@ struct acct_glbs {
volatile int active;
volatile int needcheck;
struct file *file;
+ struct pid_namespace *ns;
struct timer_list timer;
};
@@ -175,9 +177,11 @@ out:
static void acct_file_reopen(struct file *file)
{
struct file *old_acct = NULL;
+ struct pid_namespace *old_ns = NULL;
if (acct_globals.file) {
old_acct = acct_globals.file;
+ old_ns = acct_globals.ns;
del_timer(&acct_globals.timer);
acct_globals.active = 0;
acct_globals.needcheck = 0;
@@ -185,6 +189,7 @@ static void acct_file_reopen(struct file *file)
}
if (file) {
acct_globals.file = file;
+ acct_globals.ns = get_pid_ns(task_active_pid_ns(current));
acct_globals.needcheck = 0;
acct_globals.active = 1;
/* It's been deleted if it was used before so this is safe */
@@ -196,8 +201,9 @@ static void acct_file_reopen(struct file *file)
if (old_acct) {
mnt_unpin(old_acct->f_path.mnt);
spin_unlock(&acct_globals.lock);
- do_acct_process(old_acct);
+ do_acct_process(old_ns, old_acct);
filp_close(old_acct, NULL);
+ put_pid_ns(old_ns);
spin_lock(&acct_globals.lock);
}
}
@@ -419,7 +425,7 @@ static u32 encode_float(u64 value)
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
-static void do_acct_process(struct file *file)
+static void do_acct_process(struct pid_namespace *ns, struct file *file)
{
struct pacct_struct *pacct = &current->signal->pacct;
acct_t ac;
@@ -481,8 +487,10 @@ static void do_acct_process(struct file *file)
ac.ac_gid16 = current->gid;
#endif
#if ACCT_VERSION==3
- ac.ac_pid = current->tgid;
- ac.ac_ppid = current->real_parent->tgid;
+ ac.ac_pid = task_tgid_nr_ns(current, ns);
+ rcu_read_lock();
+ ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
+ rcu_read_unlock();
#endif
spin_lock_irq(&current->sighand->siglock);
@@ -578,6 +586,7 @@ void acct_collect(long exitcode, int group_dead)
void acct_process(void)
{
struct file *file = NULL;
+ struct pid_namespace *ns;
/*
* accelerate the common fastpath:
@@ -592,8 +601,10 @@ void acct_process(void)
return;
}
get_file(file);
+ ns = get_pid_ns(acct_globals.ns);
spin_unlock(&acct_globals.lock);
- do_acct_process(file);
+ do_acct_process(ns, file);
fput(file);
+ put_pid_ns(ns);
}
diff --git a/kernel/audit.c b/kernel/audit.c
index 2eeea9a1424..be55cb50363 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -78,9 +78,13 @@ static int audit_default;
/* If auditing cannot proceed, audit_failure selects what happens. */
static int audit_failure = AUDIT_FAIL_PRINTK;
-/* If audit records are to be written to the netlink socket, audit_pid
- * contains the (non-zero) pid. */
+/*
+ * If audit records are to be written to the netlink socket, audit_pid
+ * contains the pid of the auditd process and audit_nlk_pid contains
+ * the pid to use to send netlink messages to that process.
+ */
int audit_pid;
+static int audit_nlk_pid;
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
@@ -170,7 +174,9 @@ void audit_panic(const char *message)
printk(KERN_ERR "audit: %s\n", message);
break;
case AUDIT_FAIL_PANIC:
- panic("audit: %s\n", message);
+ /* test audit_pid since printk is always losey, why bother? */
+ if (audit_pid)
+ panic("audit: %s\n", message);
break;
}
}
@@ -348,10 +354,11 @@ static int kauditd_thread(void *dummy)
wake_up(&audit_backlog_wait);
if (skb) {
if (audit_pid) {
- int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
+ int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
if (err < 0) {
BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+ audit_log_lost("auditd dissapeared\n");
audit_pid = 0;
}
} else {
@@ -623,6 +630,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
sid, 1);
audit_pid = new_pid;
+ audit_nlk_pid = NETLINK_CB(skb).pid;
}
if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
err = audit_set_rate_limit(status_get->rate_limit,
@@ -1350,17 +1358,19 @@ void audit_log_end(struct audit_buffer *ab)
if (!audit_rate_check()) {
audit_log_lost("rate limit exceeded");
} else {
+ struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
if (audit_pid) {
- struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
skb_queue_tail(&audit_skb_queue, ab->skb);
ab->skb = NULL;
wake_up_interruptible(&kauditd_wait);
- } else if (printk_ratelimit()) {
- struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, ab->skb->data + NLMSG_SPACE(0));
- } else {
- audit_log_lost("printk limit exceeded\n");
+ } else if (nlh->nlmsg_type != AUDIT_EOE) {
+ if (printk_ratelimit()) {
+ printk(KERN_NOTICE "type=%d %s\n",
+ nlh->nlmsg_type,
+ ab->skb->data + NLMSG_SPACE(0));
+ } else
+ audit_log_lost("printk limit exceeded\n");
}
}
audit_buffer_free(ab);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2087d6de67e..782262e4107 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1070,7 +1070,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
* so we can be sure nothing was lost.
*/
if ((i == 0) && (too_long))
- audit_log_format(*ab, "a%d_len=%ld ", arg_num,
+ audit_log_format(*ab, "a%d_len=%zu ", arg_num,
has_cntl ? 2*len : len);
/*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d8abe996e00..e9c2fb01e89 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2232,7 +2232,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
mutex_lock(&cgroup_mutex);
- cgrp->flags = 0;
INIT_LIST_HEAD(&cgrp->sibling);
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
@@ -2242,6 +2241,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
cgrp->root = parent->root;
cgrp->top_cgroup = parent->top_cgroup;
+ if (notify_on_release(parent))
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+
for_each_subsys(root, ss) {
struct cgroup_subsys_state *css = ss->create(ss, cgrp);
if (IS_ERR(css)) {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e296ed81d4..a1b61f41422 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -322,8 +322,8 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
* Call without callback_mutex or task_lock() held. May be
* called with or without cgroup_mutex held. Thanks in part to
* 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL. This routine also might acquire callback_mutex and
- * current->mm->mmap_sem during call.
+ * be NULL. This routine also might acquire callback_mutex during
+ * call.
*
* Reading current->cpuset->mems_generation doesn't need task_lock
* to guard the current->cpuset derefence, because it is guarded
diff --git a/kernel/exit.c b/kernel/exit.c
index 506a957b665..53872bf993f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -214,20 +214,19 @@ struct pid *session_of_pgrp(struct pid *pgrp)
static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
{
struct task_struct *p;
- int ret = 1;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
- if (p == ignored_task
- || p->exit_state
- || is_global_init(p->real_parent))
+ if ((p == ignored_task) ||
+ (p->exit_state && thread_group_empty(p)) ||
+ is_global_init(p->real_parent))
continue;
+
if (task_pgrp(p->real_parent) != pgrp &&
- task_session(p->real_parent) == task_session(p)) {
- ret = 0;
- break;
- }
+ task_session(p->real_parent) == task_session(p))
+ return 0;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
- return ret; /* (sighing) "Often!" */
+
+ return 1;
}
int is_current_pgrp_orphaned(void)
@@ -255,6 +254,37 @@ static int has_stopped_jobs(struct pid *pgrp)
return retval;
}
+/*
+ * Check to see if any process groups have become orphaned as
+ * a result of our exiting, and if they have any stopped jobs,
+ * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+static void
+kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
+{
+ struct pid *pgrp = task_pgrp(tsk);
+ struct task_struct *ignored_task = tsk;
+
+ if (!parent)
+ /* exit: our father is in a different pgrp than
+ * we are and we were the only connection outside.
+ */
+ parent = tsk->real_parent;
+ else
+ /* reparent: our child is in a different pgrp than
+ * we are, and it was the only connection outside.
+ */
+ ignored_task = NULL;
+
+ if (task_pgrp(parent) != pgrp &&
+ task_session(parent) == task_session(tsk) &&
+ will_become_orphaned_pgrp(pgrp, ignored_task) &&
+ has_stopped_jobs(pgrp)) {
+ __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
+ __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
+ }
+}
+
/**
* reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
*
@@ -635,22 +665,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
p->exit_signal != -1 && thread_group_empty(p))
do_notify_parent(p, p->exit_signal);
- /*
- * process group orphan check
- * Case ii: Our child is in a different pgrp
- * than we are, and it was the only connection
- * outside, so the child pgrp is now orphaned.
- */
- if ((task_pgrp(p) != task_pgrp(father)) &&
- (task_session(p) == task_session(father))) {
- struct pid *pgrp = task_pgrp(p);
-
- if (will_become_orphaned_pgrp(pgrp, NULL) &&
- has_stopped_jobs(pgrp)) {
- __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
- __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
- }
- }
+ kill_orphaned_pgrp(p, father);
}
/*
@@ -735,11 +750,9 @@ static void forget_original_parent(struct task_struct *father)
* Send signals to all our closest relatives so that they know
* to properly mourn us..
*/
-static void exit_notify(struct task_struct *tsk)
+static void exit_notify(struct task_struct *tsk, int group_dead)
{
int state;
- struct task_struct *t;
- struct pid *pgrp;
/*
* This does two things:
@@ -753,25 +766,8 @@ static void exit_notify(struct task_struct *tsk)
exit_task_namespaces(tsk);
write_lock_irq(&tasklist_lock);
- /*
- * Check to see if any process groups have become orphaned
- * as a result of our exiting, and if they have any stopped
- * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
- *
- * Case i: Our father is in a different pgrp than we are
- * and we were the only connection outside, so our pgrp
- * is about to become orphaned.
- */
- t = tsk->real_parent;
-
- pgrp = task_pgrp(tsk);
- if ((task_pgrp(t) != pgrp) &&
- (task_session(t) == task_session(tsk)) &&
- will_become_orphaned_pgrp(pgrp, tsk) &&
- has_stopped_jobs(pgrp)) {
- __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
- __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
- }
+ if (group_dead)
+ kill_orphaned_pgrp(tsk->group_leader, NULL);
/* Let father know we died
*
@@ -788,8 +784,8 @@ static void exit_notify(struct task_struct *tsk)
* the same after a fork.
*/
if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
- ( tsk->parent_exec_id != t->self_exec_id ||
- tsk->self_exec_id != tsk->parent_exec_id)
+ (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
+ tsk->self_exec_id != tsk->parent_exec_id)
&& !capable(CAP_KILL))
tsk->exit_signal = SIGCHLD;
@@ -986,7 +982,7 @@ NORET_TYPE void do_exit(long code)
module_put(tsk->binfmt->module);
proc_exit_connector(tsk);
- exit_notify(tsk);
+ exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
mpol_free(tsk->mempolicy);
tsk->mempolicy = NULL;
@@ -1382,7 +1378,7 @@ unlock_sig:
if (!retval && infop)
retval = put_user(0, &infop->si_errno);
if (!retval && infop)
- retval = put_user(why, &infop->si_code);
+ retval = put_user((short)why, &infop->si_code);
if (!retval && infop)
retval = put_user(exit_code, &infop->si_status);
if (!retval && infop)
diff --git a/kernel/futex.c b/kernel/futex.c
index 06968cd7920..87a6428cb5b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2158,7 +2158,7 @@ static struct file_system_type futex_fs_type = {
.kill_sb = kill_anon_super,
};
-static int __init init(void)
+static int __init futex_init(void)
{
u32 curval;
int i;
@@ -2194,4 +2194,4 @@ static int __init init(void)
return 0;
}
-__initcall(init);
+__initcall(futex_init);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7a86e643233..fcfb580c3af 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -498,27 +498,36 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
return 0;
}
+/*
+ * If we have a symbol_name argument, look it up and add the offset field
+ * to it. This way, we can specify a relative address to a symbol.
+ */
+static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
+{
+ kprobe_opcode_t *addr = p->addr;
+ if (p->symbol_name) {
+ if (addr)
+ return NULL;
+ kprobe_lookup_name(p->symbol_name, addr);
+ }
+
+ if (!addr)
+ return NULL;
+ return (kprobe_opcode_t *)(((char *)addr) + p->offset);
+}
+
static int __kprobes __register_kprobe(struct kprobe *p,
unsigned long called_from)
{
int ret = 0;
struct kprobe *old_p;
struct module *probed_mod;
+ kprobe_opcode_t *addr;
- /*
- * If we have a symbol_name argument look it up,
- * and add it to the address. That way the addr
- * field can either be global or relative to a symbol.
- */
- if (p->symbol_name) {
- if (p->addr)
- return -EINVAL;
- kprobe_lookup_name(p->symbol_name, p->addr);
- }
-
- if (!p->addr)
+ addr = kprobe_addr(p);
+ if (!addr)
return -EINVAL;
- p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset);
+ p->addr = addr;
if (!kernel_text_address((unsigned long) p->addr) ||
in_kprobes_functions((unsigned long) p->addr))
@@ -678,8 +687,7 @@ void __kprobes unregister_jprobe(struct jprobe *jp)
unregister_kprobe(&jp->kp);
}
-#ifdef ARCH_SUPPORTS_KRETPROBES
-
+#ifdef CONFIG_KRETPROBES
/*
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
@@ -722,12 +730,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
int ret = 0;
struct kretprobe_instance *inst;
int i;
- void *addr = rp->kp.addr;
+ void *addr;
if (kretprobe_blacklist_size) {
- if (addr == NULL)
- kprobe_lookup_name(rp->kp.symbol_name, addr);
- addr += rp->kp.offset;
+ addr = kprobe_addr(&rp->kp);
+ if (!addr)
+ return -EINVAL;
for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
if (kretprobe_blacklist[i].addr == addr)
@@ -769,8 +777,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
return ret;
}
-#else /* ARCH_SUPPORTS_KRETPROBES */
-
+#else /* CONFIG_KRETPROBES */
int __kprobes register_kretprobe(struct kretprobe *rp)
{
return -ENOSYS;
@@ -781,8 +788,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
{
return 0;
}
-
-#endif /* ARCH_SUPPORTS_KRETPROBES */
+#endif /* CONFIG_KRETPROBES */
void __kprobes unregister_kretprobe(struct kretprobe *rp)
{
diff --git a/kernel/marker.c b/kernel/marker.c
index 50effc01d9a..041c33e3e95 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -104,18 +104,18 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
char ptype;
/*
- * disabling preemption to make sure the teardown of the callbacks can
- * be done correctly when they are in modules and they insure RCU read
- * coherency.
+ * preempt_disable does two things : disabling preemption to make sure
+ * the teardown of the callbacks can be done correctly when they are in
+ * modules and they insure RCU read coherency.
*/
preempt_disable();
- ptype = ACCESS_ONCE(mdata->ptype);
+ ptype = mdata->ptype;
if (likely(!ptype)) {
marker_probe_func *func;
/* Must read the ptype before ptr. They are not data dependant,
* so we put an explicit smp_rmb() here. */
smp_rmb();
- func = ACCESS_ONCE(mdata->single.func);
+ func = mdata->single.func;
/* Must read the ptr before private data. They are not data
* dependant, so we put an explicit smp_rmb() here. */
smp_rmb();
@@ -133,7 +133,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
* in the fast path, so put the explicit barrier here.
*/
smp_read_barrier_depends();
- multi = ACCESS_ONCE(mdata->multi);
+ multi = mdata->multi;
for (i = 0; multi[i].func; i++) {
va_start(args, fmt);
multi[i].func(multi[i].probe_private, call_private, fmt,
@@ -161,13 +161,13 @@ void marker_probe_cb_noarg(const struct marker *mdata,
char ptype;
preempt_disable();
- ptype = ACCESS_ONCE(mdata->ptype);
+ ptype = mdata->ptype;
if (likely(!ptype)) {
marker_probe_func *func;
/* Must read the ptype before ptr. They are not data dependant,
* so we put an explicit smp_rmb() here. */
smp_rmb();
- func = ACCESS_ONCE(mdata->single.func);
+ func = mdata->single.func;
/* Must read the ptr before private data. They are not data
* dependant, so we put an explicit smp_rmb() here. */
smp_rmb();
@@ -183,7 +183,7 @@ void marker_probe_cb_noarg(const struct marker *mdata,
* in the fast path, so put the explicit barrier here.
*/
smp_read_barrier_depends();
- multi = ACCESS_ONCE(mdata->multi);
+ multi = mdata->multi;
for (i = 0; multi[i].func; i++)
multi[i].func(multi[i].probe_private, call_private, fmt,
&args);
@@ -551,9 +551,9 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
/*
* Disable a marker and its probe callback.
- * Note: only after a synchronize_sched() issued after setting elem->call to the
- * empty function insures that the original callback is not used anymore. This
- * insured by preemption disabling around the call site.
+ * Note: only waiting an RCU period after setting elem->call to the empty
+ * function insures that the original callback is not used anymore. This insured
+ * by preempt_disable around the call site.
*/
static void disable_marker(struct marker *elem)
{
@@ -565,8 +565,8 @@ static void disable_marker(struct marker *elem)
elem->ptype = 0; /* single probe */
/*
* Leave the private data and id there, because removal is racy and
- * should be done only after a synchronize_sched(). These are never used
- * until the next initialization anyway.
+ * should be done only after an RCU period. These are never used until
+ * the next initialization anyway.
*/
}
@@ -601,9 +601,6 @@ void marker_update_probe_range(struct marker *begin,
/*
* Update probes, removing the faulty probes.
- * Issues a synchronize_sched() when no reference to the module passed
- * as parameter is found in the probes so the probe module can be
- * safely unloaded from now on.
*
* Internal callback only changed before the first probe is connected to it.
* Single probe private data can only be changed on 0 -> 1 and 2 -> 1
@@ -698,14 +695,12 @@ int marker_probe_unregister(const char *name,
{
struct marker_entry *entry;
struct marker_probe_closure *old;
- int ret = 0;
+ int ret = -ENOENT;
mutex_lock(&markers_mutex);
entry = get_marker(name);
- if (!entry) {
- ret = -ENOENT;
+ if (!entry)
goto end;
- }
if (entry->rcu_pending)
rcu_barrier();
old = marker_entry_remove_probe(entry, probe, probe_private);
@@ -713,12 +708,15 @@ int marker_probe_unregister(const char *name,
marker_update_probes(); /* may update entry */
mutex_lock(&markers_mutex);
entry = get_marker(name);
+ if (!entry)
+ goto end;
entry->oldptr = old;
entry->rcu_pending = 1;
/* write rcu_pending before calling the RCU callback */
smp_wmb();
call_rcu(&entry->rcu, free_old_closure);
remove_marker(name); /* Ignore busy error message */
+ ret = 0;
end:
mutex_unlock(&markers_mutex);
return ret;
diff --git a/kernel/module.c b/kernel/module.c
index 901cd6ac2f1..5d437bffd8d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1933,8 +1933,15 @@ static struct module *load_module(void __user *umod,
/* Set up license info based on the info section */
set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
+ /*
+ * ndiswrapper is under GPL by itself, but loads proprietary modules.
+ * Don't use add_taint_module(), as it would prevent ndiswrapper from
+ * using GPL-only symbols it needs.
+ */
if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint(TAINT_PROPRIETARY_MODULE);
+
+ /* driverloader was caught wrongly pretending to be under GPL */
if (strcmp(mod->name, "driverloader") == 0)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
@@ -2171,10 +2178,20 @@ sys_init_module(void __user *umod,
wake_up(&module_wq);
return ret;
}
+ if (ret > 0) {
+ printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
+ "it should follow 0/-E convention\n"
+ KERN_WARNING "%s: loading module anyway...\n",
+ __func__, mod->name, ret,
+ __func__);
+ dump_stack();
+ }
- /* Now it's a first class citizen! */
- mutex_lock(&module_mutex);
+ /* Now it's a first class citizen! Wake up anyone waiting for it. */
mod->state = MODULE_STATE_LIVE;
+ wake_up(&module_wq);
+
+ mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
unwind_remove_table(mod->unwind_info, 1);
@@ -2183,7 +2200,6 @@ sys_init_module(void __user *umod,
mod->init_size = 0;
mod->init_text_size = 0;
mutex_unlock(&module_mutex);
- wake_up(&module_wq);
return 0;
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 79833170bb9..6233f3b4ae6 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -190,7 +190,7 @@ config APM_EMULATION
notification of APM "events" (e.g. battery status change).
In order to use APM, you will need supporting software. For location
- and more information, read <file:Documentation/pm.txt> and the
+ and more information, read <file:Documentation/power/pm.txt> and the
Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7c2118f9597..f1d0b345c9b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -75,22 +75,15 @@ void refrigerator(void)
__set_current_state(save);
}
-static void fake_signal_wake_up(struct task_struct *p, int resume)
+static void fake_signal_wake_up(struct task_struct *p)
{
unsigned long flags;
spin_lock_irqsave(&p->sighand->siglock, flags);
- signal_wake_up(p, resume);
+ signal_wake_up(p, 0);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
-static void send_fake_signal(struct task_struct *p)
-{
- if (task_is_stopped(p))
- force_sig_specific(SIGSTOP, p);
- fake_signal_wake_up(p, task_is_stopped(p));
-}
-
static int has_mm(struct task_struct *p)
{
return (p->mm && !(p->flags & PF_BORROWED_MM));
@@ -121,7 +114,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only)
if (freezing(p)) {
if (has_mm(p)) {
if (!signal_pending(p))
- fake_signal_wake_up(p, 0);
+ fake_signal_wake_up(p);
} else {
if (with_mm_only)
ret = 0;
@@ -135,7 +128,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only)
} else {
if (has_mm(p)) {
set_freeze_flag(p);
- send_fake_signal(p);
+ fake_signal_wake_up(p);
} else {
if (with_mm_only) {
ret = 0;
@@ -182,15 +175,17 @@ static int try_to_freeze_tasks(int freeze_user_space)
if (frozen(p) || !freezeable(p))
continue;
- if (task_is_traced(p) && frozen(p->parent)) {
- cancel_freezing(p);
- continue;
- }
-
if (!freeze_task(p, freeze_user_space))
continue;
- if (!freezer_should_skip(p))
+ /*
+ * Now that we've done set_freeze_flag, don't
+ * perturb a task in TASK_STOPPED or TASK_TRACED.
+ * It is "frozen enough". If the task does wake
+ * up, it will immediately call try_to_freeze.
+ */
+ if (!task_is_stopped_or_traced(p) &&
+ !freezer_should_skip(p))
todo++;
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 72a020cabb4..5f91a07c4ea 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
* of @bm->cur_zone_bm are updated.
*/
-static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
void **addr, unsigned int *bit_nr)
{
struct zone_bitmap *zone_bm;
@@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
zone_bm = zone_bm->next;
- BUG_ON(!zone_bm);
+ if (!zone_bm)
+ return -EFAULT;
}
bm->cur.zone_bm = zone_bm;
}
@@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
pfn -= bb->start_pfn;
*bit_nr = pfn % BM_BITS_PER_CHUNK;
*addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+ return 0;
}
static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
+ int error;
- memory_bm_find_bit(bm, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ BUG_ON(error);
set_bit(bit, addr);
}
+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+ int error;
+
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ if (!error)
+ set_bit(bit, addr);
+ return error;
+}
+
static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
+ int error;
- memory_bm_find_bit(bm, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ BUG_ON(error);
clear_bit(bit, addr);
}
@@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
+ int error;
- memory_bm_find_bit(bm, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ BUG_ON(error);
return test_bit(bit, addr);
}
@@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
region->end_pfn << PAGE_SHIFT);
for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
- if (pfn_valid(pfn))
- memory_bm_set_bit(bm, pfn);
+ if (pfn_valid(pfn)) {
+ /*
+ * It is safe to ignore the result of
+ * mem_bm_set_bit_check() here, since we won't
+ * touch the PFNs for which the error is
+ * returned anyway.
+ */
+ mem_bm_set_bit_check(bm, pfn);
+ }
}
}
diff --git a/kernel/printk.c b/kernel/printk.c
index 9adc2a473e6..c46a20a19a1 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -616,6 +616,40 @@ asmlinkage int printk(const char *fmt, ...)
/* cpu currently holding logbuf_lock */
static volatile unsigned int printk_cpu = UINT_MAX;
+/*
+ * Can we actually use the console at this time on this cpu?
+ *
+ * Console drivers may assume that per-cpu resources have
+ * been allocated. So unless they're explicitly marked as
+ * being able to cope (CON_ANYTIME) don't call them until
+ * this CPU is officially up.
+ */
+static inline int can_use_console(unsigned int cpu)
+{
+ return cpu_online(cpu) || have_callable_console();
+}
+
+/*
+ * Try to get console ownership to actually show the kernel
+ * messages from a 'printk'. Return true (and with the
+ * console_semaphore held, and 'console_locked' set) if it
+ * is successful, false otherwise.
+ *
+ * This gets called with the 'logbuf_lock' spinlock held and
+ * interrupts disabled. It should return with 'lockbuf_lock'
+ * released but interrupts still disabled.
+ */
+static int acquire_console_semaphore_for_printk(unsigned int cpu)
+{
+ int retval = 0;
+
+ if (can_use_console(cpu))
+ retval = !try_acquire_console_sem();
+ printk_cpu = UINT_MAX;
+ spin_unlock(&logbuf_lock);
+ return retval;
+}
+
const char printk_recursion_bug_msg [] =
KERN_CRIT "BUG: recent printk recursion!\n";
static int printk_recursion_bug;
@@ -725,43 +759,22 @@ asmlinkage int vprintk(const char *fmt, va_list args)
log_level_unknown = 1;
}
- if (!down_trylock(&console_sem)) {
- /*
- * We own the drivers. We can drop the spinlock and
- * let release_console_sem() print the text, maybe ...
- */
- console_locked = 1;
- printk_cpu = UINT_MAX;
- spin_unlock(&logbuf_lock);
+ /*
+ * Try to acquire and then immediately release the
+ * console semaphore. The release will do all the
+ * actual magic (print out buffers, wake up klogd,
+ * etc).
+ *
+ * The acquire_console_semaphore_for_printk() function
+ * will release 'logbuf_lock' regardless of whether it
+ * actually gets the semaphore or not.
+ */
+ if (acquire_console_semaphore_for_printk(this_cpu))
+ release_console_sem();
- /*
- * Console drivers may assume that per-cpu resources have
- * been allocated. So unless they're explicitly marked as
- * being able to cope (CON_ANYTIME) don't call them until
- * this CPU is officially up.
- */
- if (cpu_online(smp_processor_id()) || have_callable_console()) {
- console_may_schedule = 0;
- release_console_sem();
- } else {
- /* Release by hand to avoid flushing the buffer. */
- console_locked = 0;
- up(&console_sem);
- }
- lockdep_on();
- raw_local_irq_restore(flags);
- } else {
- /*
- * Someone else owns the drivers. We drop the spinlock, which
- * allows the semaphore holder to proceed and to call the
- * console drivers with the output which we just produced.
- */
- printk_cpu = UINT_MAX;
- spin_unlock(&logbuf_lock);
- lockdep_on();
+ lockdep_on();
out_restore_irqs:
- raw_local_irq_restore(flags);
- }
+ raw_local_irq_restore(flags);
preempt_enable();
return printed_len;
diff --git a/kernel/relay.c b/kernel/relay.c
index d080b9d161a..d6204a48581 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -736,7 +736,7 @@ static int relay_file_open(struct inode *inode, struct file *filp)
kref_get(&buf->kref);
filp->private_data = buf;
- return 0;
+ return nonseekable_open(inode, filp);
}
/**
@@ -1056,6 +1056,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = {
.get = generic_pipe_buf_get,
};
+static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+}
+
/*
* subbuf_splice_actor - splice up to one subbuf's worth of data
*/
@@ -1066,7 +1070,7 @@ static int subbuf_splice_actor(struct file *in,
unsigned int flags,
int *nonpad_ret)
{
- unsigned int pidx, poff, total_len, subbuf_pages, ret;
+ unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret;
struct rchan_buf *rbuf = in->private_data;
unsigned int subbuf_size = rbuf->chan->subbuf_size;
uint64_t pos = (uint64_t) *ppos;
@@ -1083,6 +1087,7 @@ static int subbuf_splice_actor(struct file *in,
.partial = partial,
.flags = flags,
.ops = &relay_pipe_buf_ops,
+ .spd_release = relay_page_release,
};
if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
@@ -1097,8 +1102,9 @@ static int subbuf_splice_actor(struct file *in,
subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
pidx = (read_start / PAGE_SIZE) % subbuf_pages;
poff = read_start & ~PAGE_MASK;
+ nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);
- for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) {
+ for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
unsigned int this_len, this_end, private;
unsigned int cur_pos = read_start + total_len;
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 16cbec2d5d6..efbfc0fc232 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -113,6 +113,7 @@ ssize_t res_counter_write(struct res_counter *counter, int member,
ret = -EINVAL;
+ strstrip(buf);
if (write_strategy) {
if (write_strategy(buf, &tmp)) {
goto out_free;
diff --git a/kernel/sched.c b/kernel/sched.c
index f06950c8a6c..8dcdec6fe0f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -174,41 +174,6 @@ struct task_group {
struct sched_entity **se;
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
-
- /*
- * shares assigned to a task group governs how much of cpu bandwidth
- * is allocated to the group. The more shares a group has, the more is
- * the cpu bandwidth allocated to it.
- *
- * For ex, lets say that there are three task groups, A, B and C which
- * have been assigned shares 1000, 2000 and 3000 respectively. Then,
- * cpu bandwidth allocated by the scheduler to task groups A, B and C
- * should be:
- *
- * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
- * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
- * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
- *
- * The weight assigned to a task group's schedulable entities on every
- * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
- * group's shares. For ex: lets say that task group A has been
- * assigned shares of 1000 and there are two CPUs in a system. Then,
- *
- * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
- *
- * Note: It's not necessary that each of a task's group schedulable
- * entity have the same weight on all CPUs. If the group
- * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
- * better distribution of weight could be:
- *
- * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
- * tg_A->se[1]->load.weight = 1/2 * 2000 = 667
- *
- * rebalance_shares() is responsible for distributing the shares of a
- * task groups like this among the group's schedulable entities across
- * cpus.
- *
- */
unsigned long shares;
#endif
@@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock);
static DEFINE_MUTEX(doms_cur_mutex);
#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-/* kernel thread that runs rebalance_shares() periodically */
-static struct task_struct *lb_monitor_task;
-static int load_balance_monitor(void *unused);
-#endif
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares);
-
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
#else
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif
-#define MIN_GROUP_SHARES 2
-
static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif
@@ -346,7 +301,7 @@ struct cfs_rq {
/* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr;
+ struct sched_entity *curr, *next;
unsigned long nr_spread_over;
@@ -639,18 +594,14 @@ enum {
SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
SCHED_FEAT_WAKEUP_PREEMPT = 2,
SCHED_FEAT_START_DEBIT = 4,
- SCHED_FEAT_TREE_AVG = 8,
- SCHED_FEAT_APPROX_AVG = 16,
- SCHED_FEAT_HRTICK = 32,
- SCHED_FEAT_DOUBLE_TICK = 64,
+ SCHED_FEAT_HRTICK = 8,
+ SCHED_FEAT_DOUBLE_TICK = 16,
};
const_debug unsigned int sysctl_sched_features =
SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
SCHED_FEAT_WAKEUP_PREEMPT * 1 |
SCHED_FEAT_START_DEBIT * 1 |
- SCHED_FEAT_TREE_AVG * 0 |
- SCHED_FEAT_APPROX_AVG * 0 |
SCHED_FEAT_HRTICK * 1 |
SCHED_FEAT_DOUBLE_TICK * 0;
@@ -1101,6 +1052,49 @@ static void resched_cpu(int cpu)
resched_task(cpu_curr(cpu));
spin_unlock_irqrestore(&rq->lock, flags);
}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (cpu == smp_processor_id())
+ return;
+
+ /*
+ * This is safe, as this function is called with the timer
+ * wheel base lock of (cpu) held. When the CPU is on the way
+ * to idle and has not yet set rq->curr to idle then it will
+ * be serialized on the timer wheel base lock and take the new
+ * timer into account automatically.
+ */
+ if (rq->curr != rq->idle)
+ return;
+
+ /*
+ * We can set TIF_RESCHED on the idle task of the other CPU
+ * lockless. The worst case is that the other CPU runs the
+ * idle task through an additional NOOP schedule()
+ */
+ set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+
+ /* NEED_RESCHED must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(rq->idle))
+ smp_send_reschedule(cpu);
+}
+#endif
+
#else
static void __resched_task(struct task_struct *p, int tif_bit)
{
@@ -1129,7 +1123,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
u64 tmp;
if (unlikely(!lw->inv_weight))
- lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
+ lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
tmp = (u64)delta_exec * weight;
/*
@@ -1153,11 +1147,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
+ lw->inv_weight = 0;
}
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
lw->weight -= dec;
+ lw->inv_weight = 0;
}
/*
@@ -1245,16 +1241,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
#endif
-static inline void inc_cpu_load(struct rq *rq, unsigned long load)
-{
- update_load_add(&rq->load, load);
-}
-
-static inline void dec_cpu_load(struct rq *rq, unsigned long load)
-{
- update_load_sub(&rq->load, load);
-}
-
#ifdef CONFIG_SMP
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
@@ -1272,14 +1258,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
#define sched_class_highest (&rt_sched_class)
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+ update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+ update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running++;
+ inc_load(rq, p);
}
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running--;
+ dec_load(rq, p);
}
static void set_load_weight(struct task_struct *p)
@@ -1371,7 +1369,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
rq->nr_uninterruptible--;
enqueue_task(rq, p, wakeup);
- inc_nr_running(rq);
+ inc_nr_running(p, rq);
}
/*
@@ -1383,7 +1381,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
rq->nr_uninterruptible++;
dequeue_task(rq, p, sleep);
- dec_nr_running(rq);
+ dec_nr_running(p, rq);
}
/**
@@ -1437,6 +1435,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
s64 delta;
+ /*
+ * Buddy candidates are cache hot:
+ */
+ if (&p->se == cfs_rq_of(&p->se)->next)
+ return 1;
+
if (p->sched_class != &fair_sched_class)
return 0;
@@ -1896,10 +1900,11 @@ out_activate:
schedstat_inc(p, se.nr_wakeups_remote);
update_rq_clock(rq);
activate_task(rq, p, 1);
- check_preempt_curr(rq, p);
success = 1;
out_running:
+ check_preempt_curr(rq, p);
+
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
@@ -1933,6 +1938,8 @@ static void __sched_fork(struct task_struct *p)
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
+ p->se.last_wakeup = 0;
+ p->se.avg_overlap = 0;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
@@ -2023,7 +2030,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
* management (if any):
*/
p->sched_class->task_new(rq, p);
- inc_nr_running(rq);
+ inc_nr_running(p, rq);
}
check_preempt_curr(rq, p);
#ifdef CONFIG_SMP
@@ -3918,7 +3925,7 @@ need_resched_nonpreemptible:
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
- unlikely(signal_pending(prev)))) {
+ signal_pending(prev))) {
prev->state = TASK_RUNNING;
} else {
deactivate_task(rq, prev, 1);
@@ -4311,11 +4318,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
oldprio = p->prio;
on_rq = p->se.on_rq;
running = task_current(rq, p);
- if (on_rq) {
+ if (on_rq)
dequeue_task(rq, p, 0);
- if (running)
- p->sched_class->put_prev_task(rq, p);
- }
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
if (rt_prio(prio))
p->sched_class = &rt_sched_class;
@@ -4324,10 +4330,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
p->prio = prio;
+ if (running)
+ p->sched_class->set_curr_task(rq);
if (on_rq) {
- if (running)
- p->sched_class->set_curr_task(rq);
-
enqueue_task(rq, p, 0);
check_class_changed(rq, p, prev_class, oldprio, running);
@@ -4362,8 +4367,10 @@ void set_user_nice(struct task_struct *p, long nice)
goto out_unlock;
}
on_rq = p->se.on_rq;
- if (on_rq)
+ if (on_rq) {
dequeue_task(rq, p, 0);
+ dec_load(rq, p);
+ }
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -4373,6 +4380,7 @@ void set_user_nice(struct task_struct *p, long nice)
if (on_rq) {
enqueue_task(rq, p, 0);
+ inc_load(rq, p);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -4462,7 +4470,7 @@ int task_nice(const struct task_struct *p)
{
return TASK_NICE(p);
}
-EXPORT_SYMBOL_GPL(task_nice);
+EXPORT_SYMBOL(task_nice);
/**
* idle_cpu - is a given cpu idle currently?
@@ -4621,19 +4629,17 @@ recheck:
update_rq_clock(rq);
on_rq = p->se.on_rq;
running = task_current(rq, p);
- if (on_rq) {
+ if (on_rq)
deactivate_task(rq, p, 0);
- if (running)
- p->sched_class->put_prev_task(rq, p);
- }
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
oldprio = p->prio;
__setscheduler(rq, p, policy, param->sched_priority);
+ if (running)
+ p->sched_class->set_curr_task(rq);
if (on_rq) {
- if (running)
- p->sched_class->set_curr_task(rq);
-
activate_task(rq, p, 0);
check_class_changed(rq, p, prev_class, oldprio, running);
@@ -5140,7 +5146,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
time_slice = 0;
if (p->policy == SCHED_RR) {
time_slice = DEF_TIMESLICE;
- } else {
+ } else if (p->policy != SCHED_FIFO) {
struct sched_entity *se = &p->se;
unsigned long flags;
struct rq *rq;
@@ -5921,7 +5927,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
spin_unlock_irq(&rq->lock);
break;
- case CPU_DOWN_PREPARE:
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
/* Update our root-domain */
rq = cpu_rq(cpu);
spin_lock_irqsave(&rq->lock, flags);
@@ -6843,6 +6850,10 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */
*/
static cpumask_t fallback_doms;
+void __attribute__((weak)) arch_update_cpu_topology(void)
+{
+}
+
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
* For now this just excludes isolated cpus, but could be used to
@@ -6852,6 +6863,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
{
int err;
+ arch_update_cpu_topology();
ndoms_cur = 1;
doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
if (!doms_cur)
@@ -6956,7 +6968,7 @@ match2:
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static int arch_reinit_sched_domains(void)
+int arch_reinit_sched_domains(void)
{
int err;
@@ -7087,21 +7099,6 @@ void __init sched_init_smp(void)
if (set_cpus_allowed(current, non_isolated_cpus) < 0)
BUG();
sched_init_granularity();
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (nr_cpu_ids == 1)
- return;
-
- lb_monitor_task = kthread_create(load_balance_monitor, NULL,
- "group_balance");
- if (!IS_ERR(lb_monitor_task)) {
- lb_monitor_task->flags |= PF_NOFREEZE;
- wake_up_process(lb_monitor_task);
- } else {
- printk(KERN_ERR "Could not create load balance monitor thread"
- "(error = %ld) \n", PTR_ERR(lb_monitor_task));
- }
-#endif
}
#else
void __init sched_init_smp(void)
@@ -7424,157 +7421,6 @@ void set_curr_task(int cpu, struct task_struct *p)
#ifdef CONFIG_GROUP_SCHED
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-/*
- * distribute shares of all task groups among their schedulable entities,
- * to reflect load distribution across cpus.
- */
-static int rebalance_shares(struct sched_domain *sd, int this_cpu)
-{
- struct cfs_rq *cfs_rq;
- struct rq *rq = cpu_rq(this_cpu);
- cpumask_t sdspan = sd->span;
- int balanced = 1;
-
- /* Walk thr' all the task groups that we have */
- for_each_leaf_cfs_rq(rq, cfs_rq) {
- int i;
- unsigned long total_load = 0, total_shares;
- struct task_group *tg = cfs_rq->tg;
-
- /* Gather total task load of this group across cpus */
- for_each_cpu_mask(i, sdspan)
- total_load += tg->cfs_rq[i]->load.weight;
-
- /* Nothing to do if this group has no load */
- if (!total_load)
- continue;
-
- /*
- * tg->shares represents the number of cpu shares the task group
- * is eligible to hold on a single cpu. On N cpus, it is
- * eligible to hold (N * tg->shares) number of cpu shares.
- */
- total_shares = tg->shares * cpus_weight(sdspan);
-
- /*
- * redistribute total_shares across cpus as per the task load
- * distribution.
- */
- for_each_cpu_mask(i, sdspan) {
- unsigned long local_load, local_shares;
-
- local_load = tg->cfs_rq[i]->load.weight;
- local_shares = (local_load * total_shares) / total_load;
- if (!local_shares)
- local_shares = MIN_GROUP_SHARES;
- if (local_shares == tg->se[i]->load.weight)
- continue;
-
- spin_lock_irq(&cpu_rq(i)->lock);
- set_se_shares(tg->se[i], local_shares);
- spin_unlock_irq(&cpu_rq(i)->lock);
- balanced = 0;
- }
- }
-
- return balanced;
-}
-
-/*
- * How frequently should we rebalance_shares() across cpus?
- *
- * The more frequently we rebalance shares, the more accurate is the fairness
- * of cpu bandwidth distribution between task groups. However higher frequency
- * also implies increased scheduling overhead.
- *
- * sysctl_sched_min_bal_int_shares represents the minimum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * sysctl_sched_max_bal_int_shares represents the maximum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * These settings allows for the appropriate trade-off between accuracy of
- * fairness and the associated overhead.
- *
- */
-
-/* default: 8ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
-
-/* default: 128ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
-
-/* kernel thread that runs rebalance_shares() periodically */
-static int load_balance_monitor(void *unused)
-{
- unsigned int timeout = sysctl_sched_min_bal_int_shares;
- struct sched_param schedparm;
- int ret;
-
- /*
- * We don't want this thread's execution to be limited by the shares
- * assigned to default group (init_task_group). Hence make it run
- * as a SCHED_RR RT task at the lowest priority.
- */
- schedparm.sched_priority = 1;
- ret = sched_setscheduler(current, SCHED_RR, &schedparm);
- if (ret)
- printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
- " monitor thread (error = %d) \n", ret);
-
- while (!kthread_should_stop()) {
- int i, cpu, balanced = 1;
-
- /* Prevent cpus going down or coming up */
- get_online_cpus();
- /* lockout changes to doms_cur[] array */
- lock_doms_cur();
- /*
- * Enter a rcu read-side critical section to safely walk rq->sd
- * chain on various cpus and to walk task group list
- * (rq->leaf_cfs_rq_list) in rebalance_shares().
- */
- rcu_read_lock();
-
- for (i = 0; i < ndoms_cur; i++) {
- cpumask_t cpumap = doms_cur[i];
- struct sched_domain *sd = NULL, *sd_prev = NULL;
-
- cpu = first_cpu(cpumap);
-
- /* Find the highest domain at which to balance shares */
- for_each_domain(cpu, sd) {
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
- sd_prev = sd;
- }
-
- sd = sd_prev;
- /* sd == NULL? No load balance reqd in this domain */
- if (!sd)
- continue;
-
- balanced &= rebalance_shares(sd, cpu);
- }
-
- rcu_read_unlock();
-
- unlock_doms_cur();
- put_online_cpus();
-
- if (!balanced)
- timeout = sysctl_sched_min_bal_int_shares;
- else if (timeout < sysctl_sched_max_bal_int_shares)
- timeout *= 2;
-
- msleep_interruptible(timeout);
- }
-
- return 0;
-}
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_FAIR_GROUP_SCHED
static void free_fair_sched_group(struct task_group *tg)
{
@@ -7823,47 +7669,46 @@ void sched_move_task(struct task_struct *tsk)
running = task_current(rq, tsk);
on_rq = tsk->se.on_rq;
- if (on_rq) {
+ if (on_rq)
dequeue_task(rq, tsk, 0);
- if (unlikely(running))
- tsk->sched_class->put_prev_task(rq, tsk);
- }
+ if (unlikely(running))
+ tsk->sched_class->put_prev_task(rq, tsk);
set_task_rq(tsk, task_cpu(tsk));
- if (on_rq) {
- if (unlikely(running))
- tsk->sched_class->set_curr_task(rq);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (tsk->sched_class->moved_group)
+ tsk->sched_class->moved_group(tsk);
+#endif
+
+ if (unlikely(running))
+ tsk->sched_class->set_curr_task(rq);
+ if (on_rq)
enqueue_task(rq, tsk, 0);
- }
task_rq_unlock(rq, &flags);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-/* rq->lock to be locked by caller */
static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
struct rq *rq = cfs_rq->rq;
int on_rq;
- if (!shares)
- shares = MIN_GROUP_SHARES;
+ spin_lock_irq(&rq->lock);
on_rq = se->on_rq;
- if (on_rq) {
+ if (on_rq)
dequeue_entity(cfs_rq, se, 0);
- dec_cpu_load(rq, se->load.weight);
- }
se->load.weight = shares;
se->load.inv_weight = div64_64((1ULL<<32), shares);
- if (on_rq) {
+ if (on_rq)
enqueue_entity(cfs_rq, se, 0);
- inc_cpu_load(rq, se->load.weight);
- }
+
+ spin_unlock_irq(&rq->lock);
}
static DEFINE_MUTEX(shares_mutex);
@@ -7873,18 +7718,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
int i;
unsigned long flags;
+ /*
+ * A weight of 0 or 1 can cause arithmetics problems.
+ * (The default weight is 1024 - so there's no practical
+ * limitation from this.)
+ */
+ if (shares < 2)
+ shares = 2;
+
mutex_lock(&shares_mutex);
if (tg->shares == shares)
goto done;
- if (shares < MIN_GROUP_SHARES)
- shares = MIN_GROUP_SHARES;
-
- /*
- * Prevent any load balance activity (rebalance_shares,
- * load_balance_fair) from referring to this group first,
- * by taking it off the rq->leaf_cfs_rq_list on each cpu.
- */
spin_lock_irqsave(&task_group_lock, flags);
for_each_possible_cpu(i)
unregister_fair_sched_group(tg, i);
@@ -7898,11 +7743,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* w/o tripping rebalance_share or load_balance_fair.
*/
tg->shares = shares;
- for_each_possible_cpu(i) {
- spin_lock_irq(&cpu_rq(i)->lock);
+ for_each_possible_cpu(i)
set_se_shares(tg->se[i], shares);
- spin_unlock_irq(&cpu_rq(i)->lock);
- }
/*
* Enable load balance activity on this group, by inserting it back on
@@ -7934,9 +7776,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
if (runtime == RUNTIME_INF)
return 1ULL << 16;
- runtime *= (1ULL << 16);
- div64_64(runtime, period);
- return runtime;
+ return div64_64(runtime << 16, period);
}
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
@@ -7960,25 +7800,40 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
return total + to_ratio(period, runtime) < global_ratio;
}
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+ struct task_struct *g, *p;
+ do_each_thread(g, p) {
+ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+ return 1;
+ } while_each_thread(g, p);
+ return 0;
+}
+
int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
u64 rt_runtime, rt_period;
int err = 0;
- rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
+ rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
if (rt_runtime_us == -1)
- rt_runtime = rt_period;
+ rt_runtime = RUNTIME_INF;
mutex_lock(&rt_constraints_mutex);
+ read_lock(&tasklist_lock);
+ if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) {
+ err = -EBUSY;
+ goto unlock;
+ }
if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
err = -EINVAL;
goto unlock;
}
- if (rt_runtime_us == -1)
- rt_runtime = RUNTIME_INF;
tg->rt_runtime = rt_runtime;
unlock:
+ read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return err;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4b5e24cf2f4..ef358ba0768 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.exec_start);
PN(se.vruntime);
PN(se.sum_exec_runtime);
+ PN(se.avg_overlap);
nr_switches = p->nvcsw + p->nivcsw;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c8e6492c592..86a93376282 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
/*
* SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -175,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
- if (leftmost)
+ if (leftmost) {
cfs_rq->rb_leftmost = &se->run_node;
+ /*
+ * maintain cfs_rq->min_vruntime to be a monotonic increasing
+ * value tracking the leftmost vruntime in the tree.
+ */
+ cfs_rq->min_vruntime =
+ max_vruntime(cfs_rq->min_vruntime, se->vruntime);
+ }
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -184,8 +191,24 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->rb_leftmost == &se->run_node)
- cfs_rq->rb_leftmost = rb_next(&se->run_node);
+ if (cfs_rq->rb_leftmost == &se->run_node) {
+ struct rb_node *next_node;
+ struct sched_entity *next;
+
+ next_node = rb_next(&se->run_node);
+ cfs_rq->rb_leftmost = next_node;
+
+ if (next_node) {
+ next = rb_entry(next_node,
+ struct sched_entity, run_node);
+ cfs_rq->min_vruntime =
+ max_vruntime(cfs_rq->min_vruntime,
+ next->vruntime);
+ }
+ }
+
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
@@ -260,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running)
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 slice = __sched_period(cfs_rq->nr_running);
-
- slice *= se->load.weight;
- do_div(slice, cfs_rq->load.weight);
-
- return slice;
+ return calc_delta_mine(__sched_period(cfs_rq->nr_running),
+ se->load.weight, &cfs_rq->load);
}
/*
@@ -283,11 +302,6 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
return vslice;
}
-static u64 sched_vslice(struct cfs_rq *cfs_rq)
-{
- return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
-}
-
static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return __sched_vslice(cfs_rq->load.weight + se->load.weight,
@@ -303,7 +317,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
unsigned long delta_exec)
{
unsigned long delta_exec_weighted;
- u64 vruntime;
schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
@@ -315,19 +328,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
&curr->load);
}
curr->vruntime += delta_exec_weighted;
-
- /*
- * maintain cfs_rq->min_vruntime to be a monotonic increasing
- * value tracking the leftmost vruntime in the tree.
- */
- if (first_fair(cfs_rq)) {
- vruntime = min_vruntime(curr->vruntime,
- __pick_next_entity(cfs_rq)->vruntime);
- } else
- vruntime = curr->vruntime;
-
- cfs_rq->min_vruntime =
- max_vruntime(cfs_rq->min_vruntime, vruntime);
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -493,16 +493,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime;
- vruntime = cfs_rq->min_vruntime;
-
- if (sched_feat(TREE_AVG)) {
- struct sched_entity *last = __pick_last_entity(cfs_rq);
- if (last) {
- vruntime += last->vruntime;
- vruntime >>= 1;
- }
- } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
- vruntime += sched_vslice(cfs_rq)/2;
+ if (first_fair(cfs_rq)) {
+ vruntime = min_vruntime(cfs_rq->min_vruntime,
+ __pick_next_entity(cfs_rq)->vruntime);
+ } else
+ vruntime = cfs_rq->min_vruntime;
/*
* The 'current' period is already promised to the current tasks,
@@ -515,8 +510,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
if (!initial) {
/* sleeps upto a single latency don't count. */
- if (sched_feat(NEW_FAIR_SLEEPERS))
- vruntime -= sysctl_sched_latency;
+ if (sched_feat(NEW_FAIR_SLEEPERS)) {
+ vruntime -= calc_delta_fair(sysctl_sched_latency,
+ &cfs_rq->load);
+ }
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
@@ -545,6 +542,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
account_entity_enqueue(cfs_rq, se);
}
+static void update_avg(u64 *avg, u64 sample)
+{
+ s64 diff = sample - *avg;
+ *avg += diff >> 3;
+}
+
+static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (!se->last_wakeup)
+ return;
+
+ update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
+ se->last_wakeup = 0;
+}
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
@@ -555,6 +567,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
update_stats_dequeue(cfs_rq, se);
if (sleep) {
+ update_avg_stats(cfs_rq, se);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
@@ -616,12 +629,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
+static struct sched_entity *
+pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 diff, gran;
+
+ if (!cfs_rq->next)
+ return se;
+
+ diff = cfs_rq->next->vruntime - se->vruntime;
+ if (diff < 0)
+ return se;
+
+ gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
+ if (diff > gran)
+ return se;
+
+ return cfs_rq->next;
+}
+
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = NULL;
if (first_fair(cfs_rq)) {
se = __pick_next_entity(cfs_rq);
+ se = pick_next(cfs_rq, se);
set_next_entity(cfs_rq, se);
}
@@ -727,8 +760,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}
-#define GROUP_IMBALANCE_PCT 20
-
#else /* CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \
@@ -819,26 +850,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se,
- *topse = NULL; /* Highest schedulable entity */
- int incload = 1;
+ struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
- topse = se;
- if (se->on_rq) {
- incload = 0;
+ if (se->on_rq)
break;
- }
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, wakeup);
wakeup = 1;
}
- /* Increment cpu load if we just enqueued the first task of a group on
- * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
- * at the highest grouping level.
- */
- if (incload)
- inc_cpu_load(rq, topse->load.weight);
hrtick_start_fair(rq, rq->curr);
}
@@ -851,28 +871,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se,
- *topse = NULL; /* Highest schedulable entity */
- int decload = 1;
+ struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
- topse = se;
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, sleep);
/* Don't dequeue parent if it has other entities besides us */
- if (cfs_rq->load.weight) {
- if (parent_entity(se))
- decload = 0;
+ if (cfs_rq->load.weight)
break;
- }
sleep = 1;
}
- /* Decrement cpu load if we just dequeued the last task of a group on
- * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
- * at the highest grouping level.
- */
- if (decload)
- dec_cpu_load(rq, topse->load.weight);
hrtick_start_fair(rq, rq->curr);
}
@@ -974,96 +982,121 @@ static inline int wake_idle(int cpu, struct task_struct *p)
#endif
#ifdef CONFIG_SMP
-static int select_task_rq_fair(struct task_struct *p, int sync)
+
+static const struct sched_class fair_sched_class;
+
+static int
+wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+ struct task_struct *p, int prev_cpu, int this_cpu, int sync,
+ int idx, unsigned long load, unsigned long this_load,
+ unsigned int imbalance)
{
- int cpu, this_cpu;
- struct rq *rq;
- struct sched_domain *sd, *this_sd = NULL;
- int new_cpu;
+ struct task_struct *curr = this_rq->curr;
+ unsigned long tl = this_load;
+ unsigned long tl_per_task;
+
+ if (!(this_sd->flags & SD_WAKE_AFFINE))
+ return 0;
+
+ /*
+ * If the currently running task will sleep within
+ * a reasonable amount of time then attract this newly
+ * woken task:
+ */
+ if (sync && curr->sched_class == &fair_sched_class) {
+ if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+ p->se.avg_overlap < sysctl_sched_migration_cost)
+ return 1;
+ }
+
+ schedstat_inc(p, se.nr_wakeups_affine_attempts);
+ tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
+ if (sync)
+ tl -= current->se.load.weight;
+
+ if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
+ 100*(tl + p->se.load.weight) <= imbalance*load) {
+ /*
+ * This domain has SD_WAKE_AFFINE and
+ * p is cache cold in this domain, and
+ * there is no bad imbalance.
+ */
+ schedstat_inc(this_sd, ttwu_move_affine);
+ schedstat_inc(p, se.nr_wakeups_affine);
- cpu = task_cpu(p);
- rq = task_rq(p);
- this_cpu = smp_processor_id();
- new_cpu = cpu;
+ return 1;
+ }
+ return 0;
+}
- if (cpu == this_cpu)
- goto out_set_cpu;
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+ struct sched_domain *sd, *this_sd = NULL;
+ int prev_cpu, this_cpu, new_cpu;
+ unsigned long load, this_load;
+ struct rq *rq, *this_rq;
+ unsigned int imbalance;
+ int idx;
+
+ prev_cpu = task_cpu(p);
+ rq = task_rq(p);
+ this_cpu = smp_processor_id();
+ this_rq = cpu_rq(this_cpu);
+ new_cpu = prev_cpu;
+ /*
+ * 'this_sd' is the first domain that both
+ * this_cpu and prev_cpu are present in:
+ */
for_each_domain(this_cpu, sd) {
- if (cpu_isset(cpu, sd->span)) {
+ if (cpu_isset(prev_cpu, sd->span)) {
this_sd = sd;
break;
}
}
if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
- goto out_set_cpu;
+ goto out;
/*
* Check for affine wakeup and passive balancing possibilities.
*/
- if (this_sd) {
- int idx = this_sd->wake_idx;
- unsigned int imbalance;
- unsigned long load, this_load;
-
- imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
- load = source_load(cpu, idx);
- this_load = target_load(this_cpu, idx);
-
- new_cpu = this_cpu; /* Wake to this CPU if we can */
-
- if (this_sd->flags & SD_WAKE_AFFINE) {
- unsigned long tl = this_load;
- unsigned long tl_per_task;
-
- /*
- * Attract cache-cold tasks on sync wakeups:
- */
- if (sync && !task_hot(p, rq->clock, this_sd))
- goto out_set_cpu;
-
- schedstat_inc(p, se.nr_wakeups_affine_attempts);
- tl_per_task = cpu_avg_load_per_task(this_cpu);
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync)
- tl -= current->se.load.weight;
-
- if ((tl <= load &&
- tl + target_load(cpu, idx) <= tl_per_task) ||
- 100*(tl + p->se.load.weight) <= imbalance*load) {
- /*
- * This domain has SD_WAKE_AFFINE and
- * p is cache cold in this domain, and
- * there is no bad imbalance.
- */
- schedstat_inc(this_sd, ttwu_move_affine);
- schedstat_inc(p, se.nr_wakeups_affine);
- goto out_set_cpu;
- }
- }
+ if (!this_sd)
+ goto out;
- /*
- * Start passive balancing when half the imbalance_pct
- * limit is reached.
- */
- if (this_sd->flags & SD_WAKE_BALANCE) {
- if (imbalance*this_load <= 100*load) {
- schedstat_inc(this_sd, ttwu_move_balance);
- schedstat_inc(p, se.nr_wakeups_passive);
- goto out_set_cpu;
- }
+ idx = this_sd->wake_idx;
+
+ imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+ load = source_load(prev_cpu, idx);
+ this_load = target_load(this_cpu, idx);
+
+ if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+ load, this_load, imbalance))
+ return this_cpu;
+
+ if (prev_cpu == this_cpu)
+ goto out;
+
+ /*
+ * Start passive balancing when half the imbalance_pct
+ * limit is reached.
+ */
+ if (this_sd->flags & SD_WAKE_BALANCE) {
+ if (imbalance*this_load <= 100*load) {
+ schedstat_inc(this_sd, ttwu_move_balance);
+ schedstat_inc(p, se.nr_wakeups_passive);
+ return this_cpu;
}
}
- new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
+out:
return wake_idle(new_cpu, p);
}
#endif /* CONFIG_SMP */
@@ -1085,6 +1118,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
resched_task(curr);
return;
}
+
+ se->last_wakeup = se->sum_exec_runtime;
+ if (unlikely(se == pse))
+ return;
+
+ cfs_rq_of(pse)->next = pse;
+
/*
* Batch tasks do not preempt (their preemption is driven by
* the tick):
@@ -1186,6 +1226,25 @@ static struct task_struct *load_balance_next_fair(void *arg)
return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr;
+ struct task_struct *p;
+
+ if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+ return MAX_PRIO;
+
+ curr = cfs_rq->curr;
+ if (!curr)
+ curr = __pick_next_entity(cfs_rq);
+
+ p = task_of(curr);
+
+ return p->prio;
+}
+#endif
+
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
@@ -1195,45 +1254,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move;
struct rq_iterator cfs_rq_iterator;
- unsigned long load_moved;
cfs_rq_iterator.start = load_balance_start_fair;
cfs_rq_iterator.next = load_balance_next_fair;
for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED
- struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
- unsigned long maxload, task_load, group_weight;
- unsigned long thisload, per_task_load;
- struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
+ struct cfs_rq *this_cfs_rq;
+ long imbalance;
+ unsigned long maxload;
- task_load = busy_cfs_rq->load.weight;
- group_weight = se->load.weight;
+ this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
- /*
- * 'group_weight' is contributed by tasks of total weight
- * 'task_load'. To move 'rem_load_move' worth of weight only,
- * we need to move a maximum task load of:
- *
- * maxload = (remload / group_weight) * task_load;
- */
- maxload = (rem_load_move * task_load) / group_weight;
-
- if (!maxload || !task_load)
+ imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+ /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+ if (imbalance <= 0)
continue;
- per_task_load = task_load / busy_cfs_rq->nr_running;
- /*
- * balance_tasks will try to forcibly move atleast one task if
- * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
- * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
- */
- if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
- continue;
+ /* Don't pull more than imbalance/2 */
+ imbalance /= 2;
+ maxload = min(rem_load_move, imbalance);
- /* Disable priority-based load balance */
- *this_best_prio = 0;
- thisload = this_cfs_rq->load.weight;
+ *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
#else
# define maxload rem_load_move
#endif
@@ -1242,33 +1284,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
* load_balance_[start|next]_fair iterators
*/
cfs_rq_iterator.arg = busy_cfs_rq;
- load_moved = balance_tasks(this_rq, this_cpu, busiest,
+ rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
maxload, sd, idle, all_pinned,
this_best_prio,
&cfs_rq_iterator);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * load_moved holds the task load that was moved. The
- * effective (group) weight moved would be:
- * load_moved_eff = load_moved/task_load * group_weight;
- */
- load_moved = (group_weight * load_moved) / task_load;
-
- /* Adjust shares on both cpus to reflect load_moved */
- group_weight -= load_moved;
- set_se_shares(se, group_weight);
-
- se = busy_cfs_rq->tg->se[this_cpu];
- if (!thisload)
- group_weight = load_moved;
- else
- group_weight = se->load.weight + load_moved;
- set_se_shares(se, group_weight);
-#endif
-
- rem_load_move -= load_moved;
-
if (rem_load_move <= 0)
break;
}
@@ -1398,6 +1418,16 @@ static void set_curr_task_fair(struct rq *rq)
set_next_entity(cfs_rq_of(se), se);
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void moved_group_fair(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+ update_curr(cfs_rq);
+ place_entity(cfs_rq, &p->se, 1);
+}
+#endif
+
/*
* All the scheduling class methods:
*/
@@ -1426,6 +1456,10 @@ static const struct sched_class fair_sched_class = {
.prio_changed = prio_changed_fair,
.switched_to = switched_to_fair,
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ .moved_group = moved_group_fair,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f54792b175b..0a6d2e51642 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -393,8 +393,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
*/
for_each_sched_rt_entity(rt_se)
enqueue_rt_entity(rt_se);
-
- inc_cpu_load(rq, p->se.load.weight);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -414,8 +412,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
if (rt_rq && rt_rq->rt_nr_running)
enqueue_rt_entity(rt_se);
}
-
- dec_cpu_load(rq, p->se.load.weight);
}
/*
@@ -1111,9 +1107,11 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
pull_rt_task(rq);
/*
* If there's a higher priority task waiting to run
- * then reschedule.
+ * then reschedule. Note, the above pull_rt_task
+ * can release the rq lock and p could migrate.
+ * Only reschedule if p is still on the same runqueue.
*/
- if (p->prio > rq->rt.highest_prio)
+ if (p->prio > rq->rt.highest_prio && rq->curr == p)
resched_task(p);
#else
/* For UP simply resched on drop of prio */
diff --git a/kernel/signal.c b/kernel/signal.c
index 84917fe507f..6af1210092c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1623,7 +1623,6 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
/* Let the debugger run. */
__set_current_state(TASK_TRACED);
spin_unlock_irq(&current->sighand->siglock);
- try_to_freeze();
read_lock(&tasklist_lock);
if (!unlikely(killed) && may_ptrace_stop()) {
do_notify_parent_cldstop(current, CLD_TRAPPED);
@@ -1641,6 +1640,13 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
}
/*
+ * While in TASK_TRACED, we were considered "frozen enough".
+ * Now that we woke up, it's crucial if we're supposed to be
+ * frozen that we freeze now before running anything substantial.
+ */
+ try_to_freeze();
+
+ /*
* We are back. Now reacquire the siglock before touching
* last_siginfo, so that we are sure to have synchronized with
* any signal-sending on another CPU that wants to examine it.
@@ -1757,9 +1763,15 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
sigset_t *mask = &current->blocked;
int signr = 0;
+relock:
+ /*
+ * We'll jump back here after any time we were stopped in TASK_STOPPED.
+ * While in TASK_STOPPED, we were considered "frozen enough".
+ * Now that we woke up, it's crucial if we're supposed to be
+ * frozen that we freeze now before running anything substantial.
+ */
try_to_freeze();
-relock:
spin_lock_irq(&current->sighand->siglock);
for (;;) {
struct k_sigaction *ka;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8b7e9541179..b2a2d6889ba 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -311,24 +311,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
-#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
- {
- .ctl_name = CTL_UNNUMBERED,
- .procname = "sched_min_bal_int_shares",
- .data = &sysctl_sched_min_bal_int_shares,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = CTL_UNNUMBERED,
- .procname = "sched_max_bal_int_shares",
- .data = &sysctl_sched_max_bal_int_shares,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
-#endif
#endif
{
.ctl_name = CTL_UNNUMBERED,
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 548c436a776..7f60097d443 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -141,13 +141,8 @@ static void clocksource_watchdog(unsigned long data)
}
if (!list_empty(&watchdog_list)) {
- /* Cycle through CPUs to check if the CPUs stay synchronized to
- * each other. */
- int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
- if (next_cpu >= NR_CPUS)
- next_cpu = first_cpu(cpu_online_map);
- watchdog_timer.expires += WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, next_cpu);
+ __mod_timer(&watchdog_timer,
+ watchdog_timer.expires + WATCHDOG_INTERVAL);
}
spin_unlock(&watchdog_lock);
}
@@ -169,7 +164,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
if (!started && watchdog) {
watchdog_last = watchdog->read();
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, first_cpu(cpu_online_map));
+ add_timer(&watchdog_timer);
}
} else {
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -179,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
if (watchdog)
del_timer(&watchdog_timer);
watchdog = cs;
- init_timer_deferrable(&watchdog_timer);
+ init_timer(&watchdog_timer);
watchdog_timer.function = clocksource_watchdog;
/* Reset watchdog cycles */
@@ -190,8 +185,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
watchdog_last = watchdog->read();
watchdog_timer.expires =
jiffies + WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer,
- first_cpu(cpu_online_map));
+ add_timer(&watchdog_timer);
}
}
}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c88b5910e7a..5fd9b946977 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -42,12 +42,13 @@ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
long time_freq; /* frequency offset (scaled ppm)*/
static long time_reftime; /* time at last adjustment (s) */
long time_adjust;
+static long ntp_tick_adj;
static void ntp_update_frequency(void)
{
u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
<< TICK_LENGTH_SHIFT;
- second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+ second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT;
second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
tick_length_base = second_length;
@@ -342,14 +343,16 @@ int do_adjtimex(struct timex *txc)
freq_adj = shift_right(freq_adj, time_constant * 2 +
(SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
+ u64 utemp64;
temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
if (time_offset < 0) {
- temp64 = -temp64;
- do_div(temp64, mtemp);
- freq_adj -= temp64;
+ utemp64 = -temp64;
+ do_div(utemp64, mtemp);
+ freq_adj -= utemp64;
} else {
- do_div(temp64, mtemp);
- freq_adj += temp64;
+ utemp64 = temp64;
+ do_div(utemp64, mtemp);
+ freq_adj += utemp64;
}
}
freq_adj += time_freq;
@@ -400,3 +403,11 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
notify_cmos_timer();
return(result);
}
+
+static int __init ntp_tick_adj_setup(char *str)
+{
+ ntp_tick_adj = simple_strtol(str, NULL, 0);
+ return 1;
+}
+
+__setup("ntp_tick_adj=", ntp_tick_adj_setup);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2968298f8f3..686da821d37 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -640,7 +640,7 @@ void tick_cancel_sched_timer(int cpu)
if (ts->sched_timer.base)
hrtimer_cancel(&ts->sched_timer);
- ts->tick_stopped = 0;
+
ts->nohz_mode = NOHZ_MODE_INACTIVE;
}
#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1af9fb050fe..a3fa587c350 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -187,13 +187,16 @@ static void change_clocksource(void)
clock->error = 0;
clock->xtime_nsec = 0;
- clocksource_calculate_interval(clock,
- (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
tick_clock_notify();
+ /*
+ * We're holding xtime lock and waking up klogd would deadlock
+ * us on enqueue. So no printing!
printk(KERN_INFO "Time: %s clocksource has been installed.\n",
clock->name);
+ */
}
#else
static inline void change_clocksource(void) { }
@@ -245,8 +248,7 @@ void __init timekeeping_init(void)
ntp_clear();
clock = clocksource_get_next();
- clocksource_calculate_interval(clock,
- (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
clock->cycle_last = clocksource_read(clock);
xtime.tv_sec = sec;
diff --git a/kernel/timer.c b/kernel/timer.c
index 99b00a25f88..b024106daa7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -451,10 +451,18 @@ void add_timer_on(struct timer_list *timer, int cpu)
spin_lock_irqsave(&base->lock, flags);
timer_set_base(timer, base);
internal_add_timer(base, timer);
+ /*
+ * Check whether the other CPU is idle and needs to be
+ * triggered to reevaluate the timer wheel when nohz is
+ * active. We are protected against the other CPU fiddling
+ * with the timer by holding the timer base lock. This also
+ * makes sure that a CPU on the way to idle can not evaluate
+ * the timer wheel.
+ */
+ wake_up_idle_cpu(cpu);
spin_unlock_irqrestore(&base->lock, flags);
}
-
/**
* mod_timer - modify a timer's timeout
* @timer: the timer to be modified